Subject: sched: fair: avg_vruntime From: Peter Zijlstra In order to implement a deadline scheduler we need to be able to test eligibility. This requires knowing the current virtual time. We use a property of fair schedulers to determine this in a numerically stable way, namely the sum of all lags is 0. Therefore the average of all virtual times is the position of lag=0. We can't just take the average of vruntime - as it will use the full range of its u64 and will wrap around. Instead we'll use the average of (vruntime - min_vruntime) \Sum_{i}^{n} 1/n (v_{i} - v) = 1/n ((\Sum_{i}^{n} v_{i}) - vn) By factoring out the 1/n (never storing that) we avoid rounding, which would bring an accumulating error. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++ kernel/sched_debug.c | 5 ++ kernel/sched_fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 86 insertions(+), 13 deletions(-) Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -382,6 +382,11 @@ struct cfs_root_rq { struct sched_entity *next; +#ifdef CONFIG_FAIR_GROUP_SCHED + s64 avg_vruntime; +#endif + long nr_queued; + #ifdef CONFIG_SCHEDSTATS unsigned long nr_spread_over; u64 exec_clock; Index: linux-2.6/kernel/sched_debug.c =================================================================== --- linux-2.6.orig/kernel/sched_debug.c +++ linux-2.6/kernel/sched_debug.c @@ -146,6 +146,11 @@ void print_cfs_root(struct seq_file *m, SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", + SPLIT_NS(avg_vruntime(cfs_r_rq))); +#endif + #ifdef CONFIG_SCHEDSTATS SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_r_rq->exec_clock)); Index: linux-2.6/kernel/sched_fair.c =================================================================== --- 
linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -222,6 +222,76 @@ s64 entity_key(struct cfs_root_rq *cfs_r return se->vruntime - cfs_r_rq->min_vruntime; } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void +avg_vruntime_add(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + s64 key = entity_key(cfs_r_rq, se); + cfs_r_rq->avg_vruntime += key; /* running sum of the (vruntime - min_vruntime) keys of queued entities */ + cfs_r_rq->nr_queued++; +} + +static void +avg_vruntime_sub(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + s64 key = entity_key(cfs_r_rq, se); + cfs_r_rq->avg_vruntime -= key; /* take this entity's key back out of the sum */ + cfs_r_rq->nr_queued--; +} + +static inline +void avg_vruntime_update(struct cfs_root_rq *cfs_r_rq, s64 delta) +{ + cfs_r_rq->avg_vruntime -= cfs_r_rq->nr_queued * delta; /* caller advances min_vruntime by delta, so each of the nr_queued keys shrinks by delta */ +} + +static u64 avg_vruntime(struct cfs_root_rq *cfs_r_rq) +{ + s64 avg = cfs_r_rq->avg_vruntime; + + if (cfs_r_rq->nr_queued) /* skip the division when nothing is queued */ + avg = div_s64(avg, cfs_r_rq->nr_queued); /* the 1/n is applied only on read; the stored sum is never rounded */ + + return cfs_r_rq->min_vruntime + avg; /* relative average turned back into an absolute vruntime */ +} + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +static inline +void avg_vruntime_add(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + cfs_r_rq->nr_queued++; /* no key sum is kept in this configuration; only the count is maintained */ +} + +static inline +void avg_vruntime_sub(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + cfs_r_rq->nr_queued--; +} + +static inline +void avg_vruntime_update(struct cfs_root_rq *cfs_r_rq, s64 delta) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. 
+ */ +static void +update_min_vruntime(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + /* + * open coded max_vruntime() to allow updating avg_vruntime + */ + s64 delta = (s64)(se->vruntime - cfs_r_rq->min_vruntime); + if (delta > 0) { /* min_vruntime only ever moves forward */ + avg_vruntime_update(cfs_r_rq, delta); /* rebase the key sum before min_vruntime itself changes */ + cfs_r_rq->min_vruntime = se->vruntime; + } +} + static void __enqueue_timeline(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) { @@ -257,12 +327,7 @@ __enqueue_timeline(struct cfs_root_rq *c */ if (leftmost) { cfs_r_rq->rb_leftmost = &se->run_node; - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. - */ - cfs_r_rq->min_vruntime = - max_vruntime(cfs_r_rq->min_vruntime, se->vruntime); + update_min_vruntime(cfs_r_rq, se); } rb_link_node(&se->run_node, parent, link); @@ -274,17 +339,13 @@ __dequeue_timeline(struct cfs_root_rq *c { if (cfs_r_rq->rb_leftmost == &se->run_node) { struct rb_node *next_node; - struct sched_entity *next; next_node = rb_next(&se->run_node); cfs_r_rq->rb_leftmost = next_node; if (next_node) { - next = rb_entry(next_node, - struct sched_entity, run_node); - cfs_r_rq->min_vruntime = - max_vruntime(cfs_r_rq->min_vruntime, - next->vruntime); + update_min_vruntime(cfs_r_rq, rb_entry(next_node, + struct sched_entity, run_node)); } } @@ -305,6 +366,7 @@ static void __enqueue_entity(struct cfs_ if (se == cfs_rq->curr) return; + avg_vruntime_add(&rq_of(cfs_rq)->cfs_root, se); /* before __enqueue_timeline(): the key is taken against the not yet updated min_vruntime */ __enqueue_timeline(&rq_of(cfs_rq)->cfs_root, se); } @@ -317,6 +379,7 @@ static void __dequeue_entity(struct cfs_ return; __dequeue_timeline(&rq_of(cfs_rq)->cfs_root, se); + avg_vruntime_sub(&rq_of(cfs_rq)->cfs_root, se); /* after __dequeue_timeline(): the key is taken against the possibly advanced min_vruntime */ } static inline struct rb_node *first_fair(struct cfs_root_rq *cfs_r_rq) @@ -974,7 +1037,7 @@ static void yield_task_fair(struct rq *r /* * Are we the only task in the tree? 
*/ - if (unlikely(rq->load.weight == curr->se.load.weight)) + if (unlikely(!rq->cfs_root.nr_queued)) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {