struct cgroup {
	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * count users of this cgroup. >0 means busy, but doesn't
	 * necessarily indicate the number of tasks in the cgroup
	 */
	atomic_t count;

	/*
	 * We link our 'sibling' struct into our parent's 'children'.
	 * Our children link their 'sibling' into our 'children'.
	 */
	struct list_head sibling;	/* my parent's children */
	struct list_head children;	/* my children */

	struct cgroup *parent;		/* my parent */
	struct dentry *dentry;		/* cgroup fs entry */

	/* Private pointers for each registered subsystem */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

	struct cgroupfs_root *root;
	struct cgroup *top_cgroup;

	/*
	 * List of cg_cgroup_links pointing at css_sets with
	 * tasks in this cgroup. Protected by css_set_lock
	 */
	struct list_head css_sets;

	/*
	 * Linked list running through all cgroups that can
	 * potentially be reaped by the release agent. Protected by
	 * release_list_lock
	 */
	struct list_head release_list;
};
cgroupfs_root -- Hierarchy
/*
 * A cgroupfs_root represents the root of a cgroup hierarchy,
 * and may be associated with a superblock to form an active
 * hierarchy
 */
struct cgroupfs_root {
	struct super_block *sb;

	/*
	 * The bitmask of subsystems intended to be attached to this
	 * hierarchy
	 */
	unsigned long subsys_bits;

	/* The bitmask of subsystems currently attached to this hierarchy */
	unsigned long actual_subsys_bits;

	/* A list running through the attached subsystems */
	struct list_head subsys_list;

	/* The root cgroup for this hierarchy */
	struct cgroup top_cgroup;

	/* Tracks how many cgroups are currently defined in hierarchy. */
	int number_of_cgroups;

	/* A list running through the mounted hierarchies */
	struct list_head root_list;

	/* Hierarchy-specific flags */
	unsigned long flags;

	/*
	 * The path to use for release notifications. No locking
	 * between setting and use - so if userspace updates this
	 * while child cgroups exist, you could miss a
	 * notification. We ensure that it's always a valid
	 * NUL-terminated string
	 */
	char release_agent_path[PATH_MAX];
};
/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup, and all tasks are part of that cgroup.
 */
static struct cgroupfs_root rootnode;	/* the dummy root, used to bind every subsystem */

/* The list of hierarchy roots */
static LIST_HEAD(roots);	/* every mount creates a cgroupfs hierarchy and links it
				 * here; the initial dummy top sits on this list too */
static int root_count;

/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
It is worth noting that during early initialization the kernel creates a virtual root (the dummy top), to which all of the kernel's resource controllers (subsystems) are attached.
In fact, this initial dummy top can itself be regarded as a hierarchy; the only difference is that it has every subsystem attached to it.
Only two subsystems are shown in the figure.
When a user mounts a new hierarchy, they specify the subsystems it should have. Hierarchies created later sit at the same level as the initial dummy top; all of them are linked onto a global list (roots).
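To make the mount step concrete, here is a minimal userspace sketch (not kernel code); the mount point /dev/cgroup and the chosen subsystems are just assumptions for illustration:

/* Minimal userspace sketch: mount a new cgroup hierarchy with only the
 * cpu and cpuset subsystems attached. Equivalent to:
 *     mount -t cgroup -o cpu,cpuset cgroup /dev/cgroup
 * The mount point /dev/cgroup is hypothetical. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("cgroup", "/dev/cgroup", "cgroup", 0, "cpu,cpuset") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}

The option string is what cgroup_get_sb() (shown later) hands to parse_cgroupfs_options() to compute subsys_bits for the new cgroupfs_root.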
I tend to think of this structure as a concrete piece of "resource": the subsystem only represents the resource controller itself, while how much of the resource a cgroup actually holds is decided by this structure. The suffix "state" hints at this too: it describes the state of a resource.
Take the cpu subsystem as an example: processes in the same hierarchy may still belong to different task groups, which is exactly why this structure exists.
/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
	/*
	 * The cgroup that this subsystem is attached to. Useful
	 * for subsystems that want to know about the cgroup
	 * hierarchy structure
	 */
	struct cgroup *cgroup;

	/*
	 * State maintained by the cgroup system to allow
	 * subsystems to be "busy". Should be accessed via css_get()
	 * and css_put()
	 */
	atomic_t refcnt;

	unsigned long flags;
};
struct css_set {
	/* Reference count */
	struct kref ref;

	/*
	 * List running through all cgroup groups. Protected by
	 * css_set_lock
	 */
	struct list_head list;

	/*
	 * List running through all tasks using this cgroup
	 * group. Protected by css_set_lock
	 */
	struct list_head tasks;

	/*
	 * List of cg_cgroup_link objects on link chains from
	 * cgroups referenced from this css_set. Protected by
	 * css_set_lock
	 */
	struct list_head cg_links;

	/*
	 * Set of subsystem states, one for each subsystem. This array
	 * is immutable after creation apart from the init_css_set
	 * during subsystem registration (at boot time).
	 */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};

/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
	/*
	 * List running through cg_cgroup_links associated with a
	 * cgroup, anchored on cgroup->css_sets
	 */
	struct list_head cgrp_link_list;

	/*
	 * List running through cg_cgroup_links pointing at a
	 * single css_set object, anchored on css_set->cg_links
	 */
	struct list_head cg_link_list;

	struct css_set *cg;
};
Each task in the system has a reference-counted pointer to a css_set.
A css_set contains a set of reference-counted pointers to cgroup_subsys_state objects, one for each cgroup subsystem registered in the system. There is no direct link from a task to the cgroup of which it's a member in each hierarchy, but this can be determined by following pointers through the cgroup_subsys_state objects. This is because accessing the subsystem state is something that's expected to happen frequently and in performance-critical code, whereas operations that require a task's actual cgroup assignments (in particular, moving between cgroups) are less common. A linked list runs through the cg_list field of each task_struct using the css_set, anchored at css_set->tasks.
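To make the pointer chase concrete: given a task and a subsystem id, the subsystem state and the cgroup are reached through the task's css_set. The kernel ships inline helpers of this shape in include/linux/cgroup.h; the sketch below is simplified (RCU annotations omitted):

/* Simplified sketch of the lookup chain from a task to its per-subsystem
 * state and to its cgroup in that subsystem's hierarchy. */
static inline struct cgroup_subsys_state *task_subsys_state(
	struct task_struct *task, int subsys_id)
{
	/* task->cgroups is the task's css_set */
	return task->cgroups->subsys[subsys_id];
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
					 int subsys_id)
{
	/* the cgroup this task belongs to, in the hierarchy that
	 * subsys_id is attached to */
	return task_subsys_state(task, subsys_id)->cgroup;
}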
For example, in the scenario at the beginning of this article a process is controlled by two resources, cpuset and cpu, which restrict both the CPU cores it may run on and its share of CPU time. Processes b and c therefore share one css_set. If another process joins the same group (i.e. is controlled by exactly the same resources), all that is needed is to bump the css_set's reference count; the structure was designed precisely to save memory in this common case.
There may also be a process that is only restricted by cpuset but not by the other group's cpu limits, so it cannot share the same set. css_set exists to cover all of these combinations.
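In other words, attaching a task whose resource combination already exists is just a reference-count bump, and the last user frees the set. A rough sketch of this reuse, built on the kref embedded in css_set (the kernel's get_css_set()/put_css_set() helpers in kernel/cgroup.c do essentially this; the release callback name below is illustrative):

/* Illustrative sketch of css_set sharing via its kref. */
static void free_css_set(struct kref *k)
{
	struct css_set *cg = container_of(k, struct css_set, ref);
	/* unlink cg from the global list and its cg_cgroup_links, then free */
	kfree(cg);
}

static inline void get_css_set(struct css_set *cg)
{
	kref_get(&cg->ref);			/* one more task shares this set */
}

static inline void put_css_set(struct css_set *cg)
{
	kref_put(&cg->ref, free_css_set);	/* last user frees it */
}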
The kernel also has a default css_set. Its purpose is to link the css objects of the dummy top and all the processes that still live there, and to anchor every other css_set (not shown in the figure). All remaining processes may well still sit on this initial css_set.
/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

/* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set. Nests outside task->alloc_lock
 * due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/* A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects. This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire
 * cgroup set for a task.
 */
/**
 * cgroup_init_early - initialize cgroups at system boot, and
 * initialize any subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	int i;

	kref_init(&init_css_set.ref);
	kref_get(&init_css_set.ref);
	INIT_LIST_HEAD(&init_css_set.list);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	css_set_count = 1;

	init_cgroup_root(&rootnode);
	list_add(&rootnode.root_list, &roots);
	root_count = 1;

	/******************************/
	init_task.cgroups = &init_css_set;
	/******************************/

	init_css_set_link.cg = &init_css_set;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		...
		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}

static void cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;
	struct list_head *l;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	ss->root = &rootnode;
	css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);

	/* Update all cgroup groups to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence all cgroup
	 * groups are in the subsystem's top cgroup. */
	write_lock(&css_set_lock);
	l = &init_css_set.list;
	do {
		struct css_set *cg = list_entry(l, struct css_set, list);
		cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
		l = l->next;
	} while (l != &init_css_set.list);
	write_unlock(&css_set_lock);

	....

	need_forkexit_callback |= ss->fork || ss->exit;
	ss->active = 1;
}
static int cgroup_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct cgroup_sb_opts opts;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *root;
	struct list_head tmp_cg_links, *l;
	INIT_LIST_HEAD(&tmp_cg_links);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret) {
		if (opts.release_agent)
			kfree(opts.release_agent);
		return ret;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return -ENOMEM;

	init_cgroup_root(root);
	root->subsys_bits = opts.subsys_bits;
	root->flags = opts.flags;
	if (opts.release_agent) {
		strcpy(root->release_agent_path, opts.release_agent);
		kfree(opts.release_agent);
	}

	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);

	if (sb->s_fs_info != root) {
		/* Reusing an existing superblock */
		BUG_ON(sb->s_root == NULL);
		kfree(root);
		root = NULL;
	} else {
		/* New superblock */
		struct cgroup *cgrp = &root->top_cgroup;
		struct inode *inode;

		ret = cgroup_get_rootdir(sb);
		inode = sb->s_root->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);

		ret = rebind_subsystems(root, root->subsys_bits);

		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = &root->top_cgroup;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		l = &init_css_set.list;
		do {
			struct css_set *cg;
			struct cg_cgroup_link *link;

			cg = list_entry(l, struct css_set, list);
			BUG_ON(list_empty(&tmp_cg_links));
			link = list_entry(tmp_cg_links.next,
					  struct cg_cgroup_link,
					  cgrp_link_list);
			list_del(&link->cgrp_link_list);
			link->cg = cg;
			list_add(&link->cgrp_link_list,
				 &root->top_cgroup.css_sets);
			list_add(&link->cg_link_list, &cg->cg_links);
			l = l->next;
		} while (l != &init_css_set.list);
		write_unlock(&css_set_lock);

		free_cg_links(&tmp_cg_links);

		cgroup_populate_dir(cgrp);
		mutex_unlock(&inode->i_mutex);
		mutex_unlock(&cgroup_mutex);
	}

	return simple_set_mnt(mnt, sb);
}
static struct cgroup_subsys_state *cpu_cgroup_create(
	struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct task_group *tg;

	if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
		init_task_group.css.cgroup = cgrp;
		return &init_task_group.css;
	}

	/* we support only 1-level deep hierarchical scheduler atm */
	if (cgrp->parent->parent)
		return ERR_PTR(-EINVAL);

	tg = sched_create_group();
	if (IS_ERR(tg))
		return ERR_PTR(-ENOMEM);

	/* Bind the cgroup to task_group object we just created */
	tg->css.cgroup = cgrp;

	return &tg->css;
}
Now let's look at the task_group structure. The sched_create_group() call above does little more than allocate one of these.
/* task group related information */
struct task_group {
#ifdef CONFIG_FAIR_CGROUP_SCHED
	struct cgroup_subsys_state css;
#endif
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
	/* spinlock to serialize modification to shares */
	spinlock_t lock;
	struct rcu_head rcu;
};
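sched_create_group() essentially allocates this structure plus one sched_entity and one cfs_rq per possible CPU. The sketch below is simplified and not the literal kernel function: error handling and the final registration of the group runqueues with the per-CPU runqueues are omitted.

/* Simplified sketch of what sched_create_group() (kernel/sched.c) sets up. */
struct task_group *sched_create_group(void)
{
	struct task_group *tg;
	int i;

	tg = kzalloc(sizeof(*tg), GFP_KERNEL);

	/* one scheduling entity and one group runqueue per CPU */
	tg->se = kzalloc(sizeof(struct sched_entity *) * NR_CPUS, GFP_KERNEL);
	tg->cfs_rq = kzalloc(sizeof(struct cfs_rq *) * NR_CPUS, GFP_KERNEL);

	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);

		tg->cfs_rq[i] = kzalloc(sizeof(struct cfs_rq), GFP_KERNEL);
		tg->se[i] = kzalloc(sizeof(struct sched_entity), GFP_KERNEL);

		/* the group's own runqueue on this CPU ... */
		init_cfs_rq(tg->cfs_rq[i], rq);
		tg->cfs_rq[i]->tg = tg;

		/* ... and the entity that represents the whole group
		 * in this CPU's top-level cfs_rq */
		tg->se[i]->cfs_rq = &rq->cfs;
		tg->se[i]->my_q = tg->cfs_rq[i];
		tg->se[i]->parent = NULL;
	}

	tg->shares = NICE_0_LOAD;	/* default weight */
	spin_lock_init(&tg->lock);

	return tg;
}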
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		wakeup = 1;
	}
}
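The interesting part is for_each_sched_entity(): with group scheduling enabled, each entity has a parent entity representing its task_group on that CPU, so enqueueing a task also enqueues every not-yet-queued ancestor group entity. The macro in kernel/sched_fair.c is essentially:

#ifdef CONFIG_FAIR_GROUP_SCHED
/* walk up the entity hierarchy: task -> its group's entity -> ... */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
#else
/* no group scheduling: the loop body runs exactly once, for the task */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
#endif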
So what does the example from the beginning of the article look like? Here is another figure. First, all of the processes may only run on one core, which means they are all governed by one cpuset resource state (css); we assume the cpuset subsystem is bound to its own hierarchy. Meanwhile, b and c are confined to one group, so they are also subject to the cpu controller. Note that process a is not controlled by cpu, which means it still holds the original cpu css, the one created for the dummy top.
struct cgroup {
	...
	/* Private pointers for each registered subsystem */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};

struct css_set {
	...
	/*
	 * Set of subsystem states, one for each subsystem. This array
	 * is immutable after creation apart from the init_css_set
	 * during subsystem registration (at boot time).
	 */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};
static struct cftype cpu_files[] = {
	{
		.name = "shares",
		.read_uint = cpu_shares_read_uint,
		.write_uint = cpu_shares_write_uint,
	},
};

static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
				 u64 shareval)
{
	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
}

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;

	/*
	 * A weight of 0 or 1 can cause arithmetics problems.
	 * (The default weight is 1024 - so there's no practical
	 *  limitation from this.)
	 */
	if (shares < 2)
		shares = 2;

	spin_lock(&tg->lock);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i)
		set_se_shares(tg->se[i], shares);

done:
	spin_unlock(&tg->lock);
	return 0;
}
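From userspace this is just a file write. A minimal sketch, assuming the cpu subsystem is mounted at /dev/cgroup and a child cgroup named grp_bc already exists (both paths are hypothetical, used only for illustration):

/* Minimal userspace sketch: assign 512 shares to a cgroup. The write
 * lands in cpu_shares_write_uint() -> sched_group_set_shares() above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/dev/cgroup/grp_bc/cpu.shares", "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "512\n");
	fclose(f);
	return 0;
}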
This is the power of the filesystem interface. It is worth mentioning that when ss->attach was called earlier, the task_group's entity was only enqueued on the runqueue of the CPU the task is currently on, not on every CPU's runqueue. So how do the other queues get filled in?
The answer is that processes migrate between CPUs.
/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0, on_rq;

	if (unlikely(cpu_is_offline(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	double_rq_lock(rq_src, rq_dest);
	/* Already moved. */
	if (task_cpu(p) != src_cpu)
		goto out;
	/* Affinity changed (again). */
	if (!cpu_isset(dest_cpu, p->cpus_allowed))
		goto out;

	on_rq = p->se.on_rq;
	if (on_rq)
		deactivate_task(rq_src, p, 0);

	set_task_cpu(p, dest_cpu);
	if (on_rq) {
		activate_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p);
	}
	ret = 1;
out:
	double_rq_unlock(rq_src, rq_dest);
	return ret;
}

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_cfs_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfuly executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
{
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
}
The comments spell it out: tasks move across groups or CPUs. The call chain in between can get a bit involved, but in essence it boils down to these few functions.
Closing
This turned out to be a very long note, because the implementation really is somewhat involved. The cpuset controller has not been covered here, but its essence is not hard to guess: it controls how processes migrate between CPUs. Add a few checks along the migration path to decide whether the target CPU is permitted for the task, and you essentially have it.
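For reference, a rough sketch of that idea, modeled on cpuset's attach callback (the real cpuset_attach() in kernel/cpuset.c does more validation and bookkeeping than shown here):

/* Illustrative sketch: when a task enters a cpuset cgroup, its allowed-CPU
 * mask is narrowed to the cpuset's mask; set_cpus_allowed() migrates it off
 * any CPU that is no longer permitted, and __migrate_task() above refuses
 * any destination outside p->cpus_allowed. */
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			  struct cgroup *oldcont, struct task_struct *tsk)
{
	cpumask_t cpus;

	cpus = cgroup_cs(cont)->cpus_allowed;	/* the cpuset's CPU mask */
	set_cpus_allowed(tsk, cpus);		/* may trigger a migration */
}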