Subject: sched: fair: cfs_root_rq From: Peter Zijlstra Now that we have a single tree, there is no need to carry this in the hierarchy. Strip the tree info from cfs_rq and put it in cfs_root_rq. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 45 +++++++++----- kernel/sched_debug.c | 69 +++++++++++++--------- kernel/sched_fair.c | 154 +++++++++++++++++++++++++++------------------------ 3 files changed, 152 insertions(+), 116 deletions(-) Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -373,18 +373,26 @@ static inline struct task_group *task_gr #endif /* CONFIG_GROUP_SCHED */ -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running; - - u64 exec_clock; +struct cfs_root_rq { u64 min_vruntime; u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; + struct sched_entity *next; + +#ifdef CONFIG_SCHEDSTATS + unsigned long nr_spread_over; + u64 exec_clock; +#endif +}; + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running; + struct list_head tasks; struct list_head *balance_iterator; @@ -392,9 +400,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next; - - unsigned long nr_spread_over; + struct sched_entity *curr; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -526,6 +532,7 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; + struct cfs_root_rq cfs_root; struct cfs_rq cfs; struct rt_rq rt; @@ -1793,7 +1800,8 @@ task_hot(struct task_struct *p, u64 now, /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == task_rq(p)->cfs.next)) + if (sched_feat(CACHE_HOT_BUDDY) && + (&p->se == task_rq(p)->cfs_root.next)) return 1; if (p->sched_class != &fair_sched_class) @@ -1814,8 +1822,8 @@ void set_task_cpu(struct task_struct *p, { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); + struct cfs_root_rq *old_cfs_r_rq = &old_rq->cfs_root, + *new_cfs_r_rq = &new_rq->cfs_root; u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; @@ -1833,8 +1841,8 @@ void set_task_cpu(struct task_struct *p, schedstat_inc(p, se.nr_forced2_migrations); } #endif - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; + p->se.vruntime -= old_cfs_r_rq->min_vruntime - + new_cfs_r_rq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -7824,14 +7832,18 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } +static void init_cfs_root_rq(struct cfs_root_rq *cfs_r_rq) +{ + cfs_r_rq->tasks_timeline = RB_ROOT; + cfs_r_rq->min_vruntime = (u64)(-(1LL << 20)); +} + static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { - cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) @@ -8011,6 +8023,7 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + init_cfs_root_rq(&rq->cfs_root); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED Index: linux-2.6/kernel/sched_debug.c =================================================================== --- linux-2.6.orig/kernel/sched_debug.c +++ linux-2.6/kernel/sched_debug.c @@ -111,57 +111,47 @@ static void print_rq(struct seq_file *m, read_unlock_irqrestore(&tasklist_lock, flags); } -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +void print_cfs_root(struct seq_file *m, int cpu) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, spread, rq0_min_vruntime, spread0; struct rq *rq = &per_cpu(runqueues, cpu); + struct cfs_root_rq *cfs_r_rq = &rq->cfs_root; struct sched_entity *last; unsigned long flags; -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128] = ""; - struct cgroup *cgroup = NULL; - struct task_group *tg = cfs_rq->tg; - - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); - - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif - - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); + SEQ_printf(m, "\ncfs_root_rq\n"); spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; - last = __pick_last_entity(cfs_rq); + if (cfs_r_rq->rb_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_r_rq))->vruntime; + last = __pick_last_entity(cfs_r_rq); if (last) max_vruntime = last->vruntime; - min_vruntime = rq->cfs.min_vruntime; - rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + min_vruntime = cfs_r_rq->min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs_root.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); + #ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_r_rq->exec_clock)); + SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + cfs_r_rq->nr_spread_over); + #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); P(yld_exp_empty); @@ -180,8 +170,29 @@ void print_cfs_rq(struct seq_file *m, in #undef P #endif - SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", - cfs_rq->nr_spread_over); +} + +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ +#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#else + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = cfs_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#endif + + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); @@ -260,6 +271,8 @@ static void print_cpu(struct seq_file *m #undef P #undef PN + print_cfs_root(m, cpu); + print_cfs_stats(m, cpu); print_rt_stats(m, cpu); Index: linux-2.6/kernel/sched_fair.c =================================================================== --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -216,15 +216,14 @@ static inline u64 min_vruntime(u64 min_v return min_vruntime; } -static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline +s64 entity_key(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) { - return se->vruntime - cfs_rq->min_vruntime; + return se->vruntime - cfs_r_rq->min_vruntime; } -/* - * Enqueue an entity into the rb-tree: - */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void +__enqueue_timeline(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) { struct rb_node **link; struct rb_node *parent = NULL; @@ -232,16 +231,8 @@ static void __enqueue_entity(struct cfs_ s64 key; int leftmost = 1; - if (!entity_is_task(se)) - return; - - if (se == cfs_rq->curr) - return; - - cfs_rq = &rq_of(cfs_rq)->cfs; - - link = &cfs_rq->tasks_timeline.rb_node; - key = entity_key(cfs_rq, se); + link = &cfs_r_rq->tasks_timeline.rb_node; + key = entity_key(cfs_r_rq, se); /* * Find the right place in the rbtree: */ @@ -252,7 +243,7 @@ static void __enqueue_entity(struct cfs_ * We dont care about collisions. Nodes with * the same key stay together. */ - if (key < entity_key(cfs_rq, entry)) { + if (key < entity_key(cfs_r_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; @@ -265,56 +256,83 @@ static void __enqueue_entity(struct cfs_ * used): */ if (leftmost) { - cfs_rq->rb_leftmost = &se->run_node; + cfs_r_rq->rb_leftmost = &se->run_node; /* * maintain cfs_rq->min_vruntime to be a monotonic increasing * value tracking the leftmost vruntime in the tree. */ - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, se->vruntime); + cfs_r_rq->min_vruntime = + max_vruntime(cfs_r_rq->min_vruntime, se->vruntime); } rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color(&se->run_node, &cfs_r_rq->tasks_timeline); } -static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void +__dequeue_timeline(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) { + if (cfs_r_rq->rb_leftmost == &se->run_node) { struct rb_node *next_node; struct sched_entity *next; next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; + cfs_r_rq->rb_leftmost = next_node; if (next_node) { next = rb_entry(next_node, struct sched_entity, run_node); - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, + cfs_r_rq->min_vruntime = + max_vruntime(cfs_r_rq->min_vruntime, next->vruntime); } } - if (cfs_rq->next == se) - cfs_rq->next = NULL; + if (cfs_r_rq->next == se) + cfs_r_rq->next = NULL; - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase(&se->run_node, &cfs_r_rq->tasks_timeline); } -static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return cfs_rq->rb_leftmost; + if (!entity_is_task(se)) + return; + + if (se == cfs_rq->curr) + return; + + __enqueue_timeline(&rq_of(cfs_rq)->cfs_root, se); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); + if (!entity_is_task(se)) + return; + + if (se == cfs_rq->curr) + return; + + __dequeue_timeline(&rq_of(cfs_rq)->cfs_root, se); } -static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +static inline struct rb_node *first_fair(struct cfs_root_rq *cfs_r_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + return cfs_r_rq->rb_leftmost; +} + +static struct sched_entity *__pick_next_entity(struct cfs_root_rq *cfs_r_rq) +{ + return rb_entry(first_fair(cfs_r_rq), struct sched_entity, run_node); +} + +static inline +struct sched_entity *__pick_last_entity(struct cfs_root_rq *cfs_r_rq) +{ + struct rb_node *last = rb_last(&cfs_r_rq->tasks_timeline); if (!last) return NULL; @@ -484,14 +502,11 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta_exec_weighted; - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - curr->vruntime += delta_exec_weighted; + schedstat_add(&rq_of(cfs_rq)->cfs_root, exec_clock, delta_exec); + curr->vruntime += calc_delta_fair(delta_exec, curr); } static void update_curr(struct cfs_rq *cfs_rq) @@ -518,11 +533,8 @@ static void update_curr(struct cfs_rq *c __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - cpuacct_charge(curtask, delta_exec); - } + if (entity_is_task(curr)) + cpuacct_charge(task_of(curr), delta_exec); } static inline void @@ -670,26 +682,28 @@ static void enqueue_sleeper(struct cfs_r static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; + struct cfs_root_rq *cfs_r_rq = &rq_of(cfs_rq)->cfs_root; + s64 d = se->vruntime - cfs_r_rq->min_vruntime; if (d < 0) d = -d; if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); + schedstat_inc(cfs_r_rq, nr_spread_over); #endif } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { + struct cfs_root_rq *cfs_r_rq = &rq_of(cfs_rq)->cfs_root; u64 vruntime; - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(cfs_rq->min_vruntime, - __pick_next_entity(cfs_rq)->vruntime); + if (first_fair(cfs_r_rq)) { + vruntime = min_vruntime(cfs_r_rq->min_vruntime, + __pick_next_entity(cfs_r_rq)->vruntime); } else - vruntime = cfs_rq->min_vruntime; + vruntime = cfs_r_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, @@ -975,7 +989,7 @@ static void yield_task_fair(struct rq *r /* * Find the rightmost entry in the rbtree: */ - rightmost = __pick_last_entity(&rq->cfs); + rightmost = __pick_last_entity(&rq->cfs_root); /* * Already in the rightmost position? */ @@ -1339,7 +1353,7 @@ static void check_preempt_wakeup(struct if (unlikely(se == pse)) return; - rq->cfs.next = pse; + rq->cfs_root.next = pse; /* * Batch tasks do not preempt (their preemption is driven by @@ -1356,33 +1370,32 @@ static void check_preempt_wakeup(struct } static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq) +pick_next_entity(struct cfs_root_rq *cfs_r_rq) { - struct rq *rq = rq_of(cfs_rq); - u64 pair_slice = rq->clock - cfs_rq->pair_start; + struct rq *rq = container_of(cfs_r_rq, struct rq, cfs_root); + u64 pair_slice = rq->clock - cfs_r_rq->pair_start; + struct sched_entity *next = cfs_r_rq->next; - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { - cfs_rq->pair_start = rq->clock; - return __pick_next_entity(cfs_rq); + if (!next || pair_slice > sched_slice(cfs_rq_of(next), next)) { + cfs_r_rq->pair_start = rq->clock; + return __pick_next_entity(cfs_r_rq); } - return cfs_rq->next; + return next; } static struct task_struct *pick_next_task_fair(struct rq *rq) { struct task_struct *p; - struct cfs_rq *cfs_rq = &rq->cfs; + struct cfs_root_rq *cfs_r_rq = &rq->cfs_root; struct sched_entity *se, *next; - if (!first_fair(cfs_rq)) + if (!first_fair(cfs_r_rq)) return NULL; - next = se = pick_next_entity(cfs_rq); - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se); - } + next = se = pick_next_entity(cfs_r_rq); + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); p = task_of(next); hrtick_start_fair(rq, p); @@ -1396,12 +1409,9 @@ static struct task_struct *pick_next_tas static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; - struct cfs_rq *cfs_rq; - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se); - } + for_each_sched_entity(se) + put_prev_entity(cfs_rq_of(se), se); } #ifdef CONFIG_SMP