Subject: sched: fair-group: single RQ approach From: Peter Zijlstra The current hierarchical RQ group scheduler suffers from a number of problems: - yield - wakeup preemption - sleeper fairness All these problems are due to the isolation caused by the multiple RQ design. They are caused by the fact that vruntime becomes a local property. Solve this by returning to a single RQ model. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 kernel/sched_fair.c | 120 +++++++++++++++++++++------------------------------- 2 files changed, 51 insertions(+), 71 deletions(-) Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -1793,7 +1793,7 @@ task_hot(struct task_struct *p, u64 now, /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == task_rq(p)->cfs.next)) return 1; if (p->sched_class != &fair_sched_class) Index: linux-2.6/kernel/sched_fair.c =================================================================== --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -226,12 +226,22 @@ static inline s64 entity_key(struct cfs_ */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = entity_key(cfs_rq, se); + s64 key; int leftmost = 1; + if (!entity_is_task(se)) + return; + + if (se == cfs_rq->curr) + return; + + cfs_rq = &rq_of(cfs_rq)->cfs; + + link = &cfs_rq->tasks_timeline.rb_node; + key = entity_key(cfs_rq, se); /* * Find the right place in the rbtree: */ @@ -493,6 +503,11 @@ static void update_curr(struct cfs_rq *c if (unlikely(!curr)) return; + if (!entity_is_task(curr)) + return; + + cfs_rq = &rq_of(cfs_rq)->cfs; + /* * Get the amount of time the current task was running * since the last time we changed load (this cannot @@ -722,8 +737,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); + __enqueue_entity(cfs_rq, se); } static void @@ -748,8 +762,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st #endif } - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); + __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); } @@ -778,6 +791,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s * runqueue. */ update_stats_wait_end(cfs_rq, se); + if (WARN_ON_ONCE(cfs_rq->curr)) + cfs_rq->curr = NULL; __dequeue_entity(cfs_rq, se); } @@ -797,33 +812,6 @@ set_next_entity(struct cfs_rq *cfs_rq, s se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static struct sched_entity * -pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rq *rq = rq_of(cfs_rq); - u64 pair_slice = rq->clock - cfs_rq->pair_start; - - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { - cfs_rq->pair_start = rq->clock; - return se; - } - - return cfs_rq->next; -} - -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = NULL; - - if (first_fair(cfs_rq)) { - se = __pick_next_entity(cfs_rq); - se = pick_next(cfs_rq, se); - set_next_entity(cfs_rq, se); - } - - return se; -} - static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -834,12 +822,12 @@ static void put_prev_entity(struct cfs_r update_curr(cfs_rq); check_spread(cfs_rq, prev); + cfs_rq->curr = NULL; if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); } - cfs_rq->curr = NULL; } static void @@ -850,6 +838,9 @@ entity_tick(struct cfs_rq *cfs_rq, struc */ update_curr(cfs_rq); + if (!entity_is_task(curr)) + return; + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -867,7 +858,8 @@ entity_tick(struct cfs_rq *cfs_rq, struc return; #endif - if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) + if (rq_of(cfs_rq)->load.weight != curr->load.weight || + !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -968,7 +960,7 @@ static void yield_task_fair(struct rq *r /* * Are we the only task in the tree? */ - if (unlikely(cfs_rq->nr_running == 1)) + if (unlikely(rq->load.weight == curr->se.load.weight)) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { @@ -983,7 +975,7 @@ static void yield_task_fair(struct rq *r /* * Find the rightmost entry in the rbtree: */ - rightmost = __pick_last_entity(cfs_rq); + rightmost = __pick_last_entity(&rq->cfs); /* * Already in the rightmost position? */ @@ -1336,7 +1328,6 @@ static void check_preempt_wakeup(struct struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1348,7 +1339,7 @@ static void check_preempt_wakeup(struct if (unlikely(se == pse)) return; - cfs_rq_of(pse)->next = pse; + rq->cfs.next = pse; /* * Batch tasks do not preempt (their preemption is driven by @@ -1360,51 +1351,40 @@ static void check_preempt_wakeup(struct if (!sched_feat(WAKEUP_PREEMPT)) return; - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); +} - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + u64 pair_slice = rq->clock - cfs_rq->pair_start; - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); + if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + cfs_rq->pair_start = rq->clock; + return __pick_next_entity(cfs_rq); } - if (wakeup_preempt_entity(se, pse) == 1) - resched_task(curr); + return cfs_rq->next; } static struct task_struct *pick_next_task_fair(struct rq *rq) { struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; + struct sched_entity *se, *next; - if (unlikely(!cfs_rq->nr_running)) + if (!first_fair(cfs_rq)) return NULL; - do { - se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); + next = se = pick_next_entity(cfs_rq); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + set_next_entity(cfs_rq, se); + } - p = task_of(se); + p = task_of(next); hrtick_start_fair(rq, p); return p;