/*
 * One node of a cgroup hierarchy. It is linked into a tree through
 * parent/sibling/children, holds one state pointer per subsystem, and
 * anchors the lists used to reach the css_sets that have tasks in it.
 */
struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */
/* count users of this cgroup. >0 means busy, but doesn't
* necessarily indicate the number of tasks in the
* cgroup */
atomic_t count;
/*
* We link our 'sibling' struct into our parent's 'children'.
* Our children link their 'sibling' into our 'children'.
*/
struct list_head sibling; /* my parent's children */
struct list_head children; /* my children */
struct cgroup *parent; /* my parent */
struct dentry *dentry; /* cgroup fs entry */
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
/* NOTE(review): presumably the hierarchy this cgroup belongs to - confirm */
struct cgroupfs_root *root;
/* NOTE(review): presumably the top cgroup of that hierarchy - confirm */
struct cgroup *top_cgroup;
/*
* List of cg_cgroup_links pointing at css_sets with
* tasks in this cgroup. Protected by css_set_lock
*/
struct list_head css_sets;
/*
* Linked list running through all cgroups that can
* potentially be reaped by the release agent. Protected by
* release_list_lock
*/
struct list_head release_list;
};
cgroupfs_root -- Hierarchy
/*
* A cgroupfs_root represents the root of a cgroup hierarchy,
* and may be associated with a superblock to form an active
* hierarchy. The root cgroup is embedded in this structure
* (top_cgroup) rather than allocated separately.
*/
struct cgroupfs_root {
/* Superblock associated with this hierarchy, if any */
struct super_block *sb;
/*
* The bitmask of subsystems intended to be attached to this
* hierarchy
*/
unsigned long subsys_bits;
/* The bitmask of subsystems currently attached to this hierarchy */
unsigned long actual_subsys_bits;
/* A list running through the attached subsystems */
struct list_head subsys_list;
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
/* Tracks how many cgroups are currently defined in hierarchy.*/
int number_of_cgroups;
/* A list running through the mounted hierarchies */
struct list_head root_list;
/* Hierarchy-specific flags */
unsigned long flags;
/* The path to use for release notifications. No locking
* between setting and use - so if userspace updates this
* while child cgroups exist, you could miss a
* notification. We ensure that it's always a valid
* NUL-terminated string */
char release_agent_path[PATH_MAX];
};
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
* single cgroup, and all tasks are part of that cgroup.
*
* That single cgroup is the embedded rootnode.top_cgroup.
*/
static struct cgroupfs_root rootnode;
rootnode 是虚拟根节点(dummy hierarchy),用来绑定所有尚未挂接到其他 Hierarchy 的资源管理器(subsystem)。
/* The list of hierarchy roots */
每次 mount 一个新的 Hierarchy 都会创建一个 cgroupfs_root,并挂到这个 roots 链表上;最开始的 dummy top 所在的 rootnode 也在这个链表上。
/*
 * Global list of hierarchy roots, linked through cgroupfs_root.root_list.
 * rootnode is added in cgroup_init_early(); each newly mounted hierarchy
 * is added in cgroup_get_sb().
 */
static LIST_HEAD(roots);
/* Number of roots on the list above: set to 1 at early init, incremented per new hierarchy */
static int root_count;
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
值得一提的是,最开始初始化的时候,创建了一个虚拟的 root( dummy top ),用来挂载内核全部的资源管理器。
注意 dummytop 没有连接 subsystem
其实,最开始的创建的一个 dummy top 就可以认为是一个 Hierarchy,只是它连接了全部的资源管理器。
这里只画出了两个 subsystems
当用户挂载一个新的 Hierarchy的时候,会指定它需要的 资源管理器(subsystem)。后来新建的 Hierarchy 和 最开始的 dummy top 是平级的,挂载在一个全局的链表上。
我习惯把 cgroup_subsys_state(css)理解为一个具体的“资源”:subsystem 只是强调了资源管理器的存在,而到底拥有多少资源,是这个结构说了算;通过后缀 “state” 也能略知一二,它描述的是资源的状态。
以 cpu 这个 subsys 来说,就是同一个 Hierarchy 中的进程可能也不在一个 进程组,所以有了这个结构的存在。
/*
 * Per-subsystem/per-cgroup state maintained by the system.
 *
 * Subsystems can embed this structure in their own per-group object;
 * struct task_group does so and cpu_cgroup_create() returns the address
 * of the embedded member.
 */
struct cgroup_subsys_state {
/* The cgroup that this subsystem is attached to. Useful
* for subsystems that want to know about the cgroup
* hierarchy structure */
struct cgroup *cgroup;
/* State maintained by the cgroup system to allow
* subsystems to be "busy". Should be accessed via css_get()
* and css_put() */
atomic_t refcnt;
/* NOTE(review): meaning of the flag bits is not visible here - confirm */
unsigned long flags;
};
/*
 * A css_set holds one cgroup_subsys_state pointer per subsystem and is
 * shared, by reference count, between all tasks that use the same
 * combination of subsystem states.
 */
struct css_set {
/* Reference count */
struct kref ref;
/*
* List running through all cgroup groups. Protected by
* css_set_lock
*/
struct list_head list;
/*
* List running through all tasks using this cgroup
* group. Protected by css_set_lock
*/
struct list_head tasks;
/*
* List of cg_cgroup_link objects on link chains from
* cgroups referenced from this css_set. Protected by
* css_set_lock
*/
struct list_head cg_links;
/*
* Set of subsystem states, one for each subsystem. This array
* is immutable after creation apart from the init_css_set
* during subsystem registration (at boot time).
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};
/*
 * Link structure for associating css_set objects with cgroups.
 *
 * Each link sits on two lists at once: one anchored on the cgroup and
 * one anchored on the css_set it refers to.
 */
struct cg_cgroup_link {
/*
* List running through cg_cgroup_links associated with a
* cgroup, anchored on cgroup->css_sets
*/
struct list_head cgrp_link_list;
/*
* List running through cg_cgroup_links pointing at a
* single css_set object, anchored on css_set->cg_links
*/
struct list_head cg_link_list;
/* The css_set this link points at */
struct css_set *cg;
};
Each task in the system has a reference-counted pointer to a css_set.
A css_set contains a set of reference-counted pointers to cgroup_subsys_state objects, one for each cgroup subsystem registered in the system. There is no direct link from a task to the cgroup of which it's a member in each hierarchy, but this can be determined by following pointers through the cgroup_subsys_state objects. This is because accessing the subsystem state is something that's expected to happen frequently and in performance-critical code, whereas operations that require a task's actual cgroup assignments (in particular, moving between cgroups) are less common. A linked list runs through the cg_list field of each task_struct using the css_set, anchored at css_set->tasks.
比如,在我们最开始的例子里,进程受到 cpuset 和 cpu 两个“资源”的控制,分别限制了它能使用的 CPU 核以及 CPU 的资源占用。那么可以说 b、c 两个进程都在同一个 css_set 里;如果还有其它进程也跟它们在一个组内(受同样的“资源”控制),那么只需要增加这个 css_set 的引用计数就好了,它是为了节约资源才这样设计的。
可能还有一个进程,只受到 cpuset 的限制,但是它并不受到 另外一个组的 cpu 资源的限制,所以它们不在一个 set,为了应对各种复杂的情况,所以出现了这个 css_set
内核也有默认的css_set,目的是为了连接 dummy top 上的 css 还有上面所有的进程,以及其余所有的 css_set,这里没有画出来,剩余的全部进程可能都在最开始的 css_set 上。
/* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
static struct css_set init_css_set;
/*
 * Statically allocated link that cgroup_init_early() uses to connect
 * init_css_set with the dummy hierarchy's top cgroup.
 */
static struct cg_cgroup_link init_css_set_link;
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
/*
 * Set to 1 in cgroup_init_early(); cgroup_get_sb() uses it as the number
 * of link structures to preallocate (one per css_set).
 */
static int css_set_count;
/* A css_set is a structure holding pointers to a set of
* cgroup_subsys_state objects. This saves space in the task struct
* object and speeds up fork()/exit(), since a single inc/dec and a
* list_add()/del() can bump the reference count on the entire
* cgroup set for a task.
*/
/*
 * Link structure for associating css_set objects with cgroups.
 *
 * Each link sits on two lists at once: one anchored on the cgroup and
 * one anchored on the css_set it refers to.
 */
struct cg_cgroup_link {
/*
* List running through cg_cgroup_links associated with a
* cgroup, anchored on cgroup->css_sets
*/
struct list_head cgrp_link_list;
/*
* List running through cg_cgroup_links pointing at a
* single css_set object, anchored on css_set->cg_links
*/
struct list_head cg_link_list;
/* The css_set this link points at */
struct css_set *cg;
};
/*
 * A css_set holds one cgroup_subsys_state pointer per subsystem and is
 * shared, by reference count, between all tasks that use the same
 * combination of subsystem states.
 */
struct css_set {
/* Reference count */
struct kref ref;
/*
* List running through all cgroup groups. Protected by
* css_set_lock
*/
struct list_head list;
/*
* List running through all tasks using this cgroup
* group. Protected by css_set_lock
*/
struct list_head tasks;
/*
* List of cg_cgroup_link objects on link chains from
* cgroups referenced from this css_set. Protected by
* css_set_lock
*/
struct list_head cg_links;
/*
* Set of subsystem states, one for each subsystem. This array
* is immutable after creation apart from the init_css_set
* during subsystem registration (at boot time).
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};
/**
* cgroup_init_early - initialize cgroups at system boot, and
* initialize any subsystems that request early init.
*
* Sets up the default css_set and the dummy hierarchy (rootnode), points
* init_task at the default css_set, links the two together, and then
* calls cgroup_init_subsys() for every subsystem whose early_init field
* is set. The code shown here always returns 0.
*/
int __init cgroup_init_early(void)
{
int i;
/*
 * init_css_set gets an initial reference plus one extra kref_get().
 * NOTE(review): presumably the extra reference keeps the default set
 * from ever being released - confirm.
 */
kref_init(&init_css_set.ref);
kref_get(&init_css_set.ref);
INIT_LIST_HEAD(&init_css_set.list);
INIT_LIST_HEAD(&init_css_set.cg_links);
INIT_LIST_HEAD(&init_css_set.tasks);
css_set_count = 1;
/* The dummy hierarchy becomes the first entry on the global roots list */
init_cgroup_root(&rootnode);
list_add(&rootnode.root_list, &roots);
root_count = 1;
/******************************/
/* The init task starts out using the default css_set */
init_task.cgroups = &init_css_set;
/******************************/
/*
 * Connect init_css_set and the dummy top cgroup through the static
 * link: one end goes on the cgroup's css_sets list, the other on the
 * css_set's cg_links list.
 */
init_css_set_link.cg = &init_css_set;
list_add(&init_css_set_link.cgrp_link_list,
&rootnode.top_cgroup.css_sets);
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
...
/* Only subsystems that asked for early init are set up here */
if (ss->early_init)
cgroup_init_subsys(ss);
}
return 0;
}
/*
 * Register one subsystem against the dummy hierarchy: create its state
 * for the dummy top cgroup, store a pointer to that state in every
 * css_set on the global list, and mark the subsystem active.
 *
 * A failure of ss->create() is fatal (BUG_ON).
 */
static void cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
struct list_head *l;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
/* Create the top cgroup state for this subsystem */
ss->root = &rootnode;
css = ss->create(ss, dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
/* NOTE(review): presumably this is what fills dummytop->subsys[] read below - confirm */
init_cgroup_css(css, ss, dummytop);
/* Update all cgroup groups to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence all cgroup
* groups are in the subsystem's top cgroup. */
write_lock(&css_set_lock);
/*
 * The walk starts at init_css_set.list itself, so init_css_set is
 * visited as an entry as well as serving as the list anchor.
 */
l = &init_css_set.list;
do {
struct css_set *cg =
list_entry(l, struct css_set, list);
cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
l = l->next;
} while (l != &init_css_set.list);
write_unlock(&css_set_lock);
....
/* Remember whether any subsystem supplies a fork or exit callback */
need_forkexit_callback |= ss->fork || ss->exit;
ss->active = 1;
}
/*
 * cgroup_get_sb - mount entry point for the cgroup filesystem.
 * @fs_type:         filesystem type being mounted
 * @flags:           mount flags (not used by the code below)
 * @unused_dev_name: ignored
 * @data:            mount options, parsed by parse_cgroupfs_options()
 * @mnt:             vfsmount to attach the superblock to
 *
 * Builds a candidate cgroupfs_root from the mount options and asks
 * sget() for a matching superblock. If an existing superblock is
 * returned the candidate root is freed and the existing hierarchy is
 * reused. Otherwise a new hierarchy is set up: the root directory is
 * created, the requested subsystems are bound, the root is added to the
 * global roots list, and the new top cgroup is linked to every existing
 * css_set.
 *
 * Returns the result of simple_set_mnt() on success or a negative errno.
 * On every failure path the option string, the candidate root, the
 * preallocated links and the half-built superblock are released.
 */
static int cgroup_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct cgroup_sb_opts opts;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *root;
	struct list_head tmp_cg_links, *l;

	INIT_LIST_HEAD(&tmp_cg_links);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret) {
		if (opts.release_agent)
			kfree(opts.release_agent);
		return ret;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		/* The option string is ours to free; don't leak it on OOM */
		if (opts.release_agent)
			kfree(opts.release_agent);
		return -ENOMEM;
	}

	init_cgroup_root(root);
	root->subsys_bits = opts.subsys_bits;
	root->flags = opts.flags;
	if (opts.release_agent) {
		/*
		 * NOTE(review): relies on parse_cgroupfs_options() bounding
		 * the string to fit release_agent_path - confirm.
		 */
		strcpy(root->release_agent_path, opts.release_agent);
		kfree(opts.release_agent);
	}

	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
	if (IS_ERR(sb)) {
		/* No superblock took ownership of root */
		kfree(root);
		return PTR_ERR(sb);
	}

	if (sb->s_fs_info != root) {
		/* Reusing an existing superblock */
		BUG_ON(sb->s_root == NULL);
		kfree(root);
		root = NULL;
	} else {
		/* New superblock */
		struct cgroup *cgrp = &root->top_cgroup;
		struct inode *inode;

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
		if (ret) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto drop_new_super;
		}

		ret = rebind_subsystems(root, root->subsys_bits);
		if (ret) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto drop_new_super;
		}

		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = &root->top_cgroup;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		/* Start at init_css_set itself so it is linked as well */
		l = &init_css_set.list;
		do {
			struct css_set *cg;
			struct cg_cgroup_link *link;

			cg = list_entry(l, struct css_set, list);
			BUG_ON(list_empty(&tmp_cg_links));
			/* Take one preallocated link off the temporary list */
			link = list_entry(tmp_cg_links.next,
					  struct cg_cgroup_link,
					  cgrp_link_list);
			list_del(&link->cgrp_link_list);
			link->cg = cg;
			list_add(&link->cgrp_link_list,
				 &root->top_cgroup.css_sets);
			list_add(&link->cg_link_list, &cg->cg_links);
			l = l->next;
		} while (l != &init_css_set.list);
		write_unlock(&css_set_lock);

		/* Release any links that turned out not to be needed */
		free_cg_links(&tmp_cg_links);

		cgroup_populate_dir(cgrp);
		mutex_unlock(&inode->i_mutex);
		mutex_unlock(&cgroup_mutex);
	}

	return simple_set_mnt(mnt, sb);

drop_new_super:
	/*
	 * sget() returned the new superblock with s_umount held for write;
	 * drop it and the active reference.
	 * NOTE(review): root is expected to be released by the filesystem's
	 * kill_sb callback during deactivate_super() - confirm.
	 */
	up_write(&sb->s_umount);
	deactivate_super(sb);
	free_cg_links(&tmp_cg_links);
	return ret;
}
static struct cgroup_subsys_state *
cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct task_group *tg;
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
/* we support only 1-level deep hierarchical scheduler atm */
if (cgrp->parent->parent)
return ERR_PTR(-EINVAL);
tg = sched_create_group();
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
/* Bind the cgroup to task_group object we just created */
tg->css.cgroup = cgrp;
return &tg->css;
}
现在再来看看 task_group 这个结构。上面调用的函数(sched_create_group)无非是分配了一个 task_group。
/*
 * task group related information
 *
 * One task_group backs each cpu-subsystem cgroup: cpu_cgroup_create()
 * hands out the address of the embedded css member. The se and cfs_rq
 * arrays are indexed by cpu number (see set_task_cfs_rq()).
 */
struct task_group {
#ifdef CONFIG_FAIR_CGROUP_SCHED
/* cgroup subsystem state embedded in the group */
struct cgroup_subsys_state css;
#endif
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
/* group weight; sched_group_set_shares() never stores a value below 2 */
unsigned long shares;
/* spinlock to serialize modification to shares */
spinlock_t lock;
/* NOTE(review): presumably used to defer freeing of the group via RCU - confirm */
struct rcu_head rcu;
};
/*
 * enqueue_task_fair() runs before nr_running is increased. It updates
 * the fair scheduling stats and inserts the task into the rbtree by
 * enqueueing each scheduling entity visited by for_each_sched_entity(),
 * starting from the task's own entity, until it meets one that is
 * already queued.
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct sched_entity *entity = &p->se;

	for_each_sched_entity(entity) {
		/* An entity that is already queued ends the walk */
		if (entity->on_rq)
			break;

		enqueue_entity(cfs_rq_of(entity), entity, wakeup);

		/* Every entity after the first is enqueued with wakeup set */
		wakeup = 1;
	}
}
那么,文章开头的例子,是怎么样的呢?这里也给出一张图,首先,全部进程都只能在一个核上运行,这意味着它们都受到一个 cpuset 的资源控制( css ),我们假设 cpuset 绑定在了一个 Hierarchy,同时,b,c被限制在一个组内,所以它们受 cpu 资源控制,注意进程 a 不受 cpu 控制,这意味着它还是拥有最初的 也就是 dummy top 创建的 css(cpu)的资源。
/* Abridged view of struct cgroup: only the per-subsystem state array is shown */
struct cgroup {
...
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};
/* Abridged view of struct css_set: only the per-subsystem state array is shown */
struct css_set {
...
/*
* Set of subsystem states, one for each subsystem. This array
* is immutable after creation apart from the init_css_set
* during subsystem registration (at boot time).
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};
/*
 * Control files provided by the cpu subsystem: a single "shares" entry
 * whose read and write handlers are cpu_shares_read_uint() and
 * cpu_shares_write_uint().
 */
static struct cftype cpu_files[] = {
{
.name = "shares",
.read_uint = cpu_shares_read_uint,
.write_uint = cpu_shares_write_uint,
},
};
/*
 * Write handler for the "shares" control file: look up the task_group
 * behind @cgrp and apply @shareval to it. Returns whatever
 * sched_group_set_shares() returns.
 */
static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
				 u64 shareval)
{
	struct task_group *group = cgroup_tg(cgrp);

	return sched_group_set_shares(group, shareval);
}
/*
 * Set the weight ("shares") of task group @tg.
 *
 * Values below 2 are raised to 2, because a weight of 0 or 1 can cause
 * arithmetic problems (the default weight is 1024, so this is no
 * practical limitation). If the value actually changes, it is pushed to
 * the group's scheduling entity on every possible cpu. All of this runs
 * under tg->lock. Always returns 0.
 */
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int cpu;

	if (shares < 2)
		shares = 2;

	spin_lock(&tg->lock);
	if (tg->shares != shares) {
		tg->shares = shares;
		for_each_possible_cpu(cpu)
			set_se_shares(tg->se[cpu], shares);
	}
	spin_unlock(&tg->lock);

	return 0;
}
这便是文件系统的威力。值得一提的是,刚才我们调用 ss->attach 的时候,其实只把 task_group 加入了当前 CPU 核的队列,而不是每个 CPU 核的队列,那么其他核的队列又是如何添加的呢?
答案是:进程会在 CPU 之间迁移。
/*
 * Move a task (not the current one) from src_cpu to dest_cpu. This is
 * done either because the task can no longer run on src_cpu
 * (set_cpus_allowed() away from it, or the CPU is going down), or
 * because the task is being rebalanced on exec (sched_exec).
 *
 * This races with normal scheduler movements, which is fine as long as
 * the task is no longer on this CPU.
 *
 * Returns non-zero if the task was successfully migrated, 0 otherwise.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *src_rq, *dest_rq;
	int migrated = 0;
	int was_queued;

	if (unlikely(cpu_is_offline(dest_cpu)))
		return migrated;

	src_rq = cpu_rq(src_cpu);
	dest_rq = cpu_rq(dest_cpu);

	double_rq_lock(src_rq, dest_rq);

	/*
	 * Give up if the task has already moved elsewhere, or if its
	 * affinity changed (again) and no longer allows dest_cpu.
	 */
	if (task_cpu(p) != src_cpu || !cpu_isset(dest_cpu, p->cpus_allowed))
		goto unlock;

	/* Dequeue on the source, retarget, then requeue on the destination */
	was_queued = p->se.on_rq;
	if (was_queued)
		deactivate_task(src_rq, p, 0);

	set_task_cpu(p, dest_cpu);

	if (was_queued) {
		activate_task(dest_rq, p, 0);
		check_preempt_curr(dest_rq, p);
	}
	migrated = 1;

unlock:
	double_rq_unlock(src_rq, dest_rq);
	return migrated;
}
/*
 * Point task p at cpu: first switch its cfs_rq and parent entity via
 * set_task_cfs_rq(), then (SMP builds only) store the new cpu number in
 * the task's thread_info after a write barrier.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_cfs_rq(p, cpu);
#ifdef CONFIG_SMP
/*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
* successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment.
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
#endif
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
{
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
}
注释其实写的很明白,就是在 groups 或者 CPUs 之间迁移,中间调用可能有些复杂,但是本质就是这几个函数。
末
非常长的一篇笔记,因为实现起来确实有些复杂,其实还有 cpuset 这个资源管理器没有提到,但是本质不难猜测,就是控制进程的在 CPU 之间的迁移,中间加一些判断,就可以知道是否有目标CPU的资源,就实现了。