Subject: sched: single-rq: EEDF Inspired by the BFQ people's WF2Q implementation, an EEVDF variant. Single run-queue group scheduling has two issues: - weight - latency isolation Weight is tackled by calculating it on the fly in a hierarchical fashion, see calc_delta_fair() and calc_delta_weight() - those walk up the hierarchy and perform a weight calculation for each layer. Latency isolation means that even if another group runs 1000 tasks your group with only 5 tasks will not have immense latencies. Ensuring this is the hardest part. The following will try to quickly describe how this code tries to solve that issue. ** DEADLINES ** So each group will have its own latency goal; with this we can associate a deadline for each task. On enqueue we calculate its group's latency goal and calculate a task's deadline. ** LAG ** lag(x) := PFS(x) - CFS(x) [1] Where PFS stands for Perfect Fair Scheduler, the ideal scheduler that has no problems with infinitely small time shares and no scheduling overhead and CFS is our scheduler. Due to quantization CFS will either be ahead of or behind PFS for any given task. ** ELIGIBILITY ** A task 'x' is eligible when lag(x) > 0; that is, the task has received less than its fair share of CPU time. Testing eligibility uses the property that the average lag over all tasks is 0 for a fair scheduler. Therefore: lag(x) = avg(vruntime) - x.vruntime [2] avg(vruntime) = (\Sum_{i} i.vruntime) / nr_running lag(x) > 0 -> [3] avg(vruntime) - x.vruntime > 0 -> avg(vruntime) > x.vruntime -> (\Sum_{i} i.vruntime) / nr_running > x.vruntime -> \Sum_{i} i.vruntime > x.vruntime * nr_running [4] ** EARLIEST ELIGIBLE DEADLINE FIRST ** So now we need a policy that takes two notions into account: fairness and latency. One such policy is: EEDF. EEDF will pick that task which has the earliest deadline among those tasks that are eligible. 
By only taking from the eligible tasks it stays fair, by preferring that task which has the earliest deadline it strives to meet the latency goal. In order to find this task EEDF augments the normal vruntime tree. For each node 'n' it keeps: n.min_deadline = min(n.deadline, n.left.min_deadline, n.right.min_deadline) [5] By using this 'inverse heap' properly it is possible to find the entry with min_deadline for each sub-tree in O(log n). Signed-off-by: Peter Zijlstra CC: Fabio Checconi --- include/linux/sched.h | 6 kernel/sched_debug.c | 25 ++- kernel/sched_fair.c | 378 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_features.h | 4 4 files changed, 379 insertions(+), 34 deletions(-) Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -956,8 +956,12 @@ struct load_weight { struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; - struct list_head group_node; +#ifdef CONFIG_FAIR_GROUP_SCHED + u64 deadline; + u64 min_deadline; +#endif unsigned int on_rq; + struct list_head group_node; u64 exec_start; u64 sum_exec_runtime; Index: linux-2.6/kernel/sched_debug.c =================================================================== --- linux-2.6.orig/kernel/sched_debug.c +++ linux-2.6/kernel/sched_debug.c @@ -61,9 +61,18 @@ print_task(struct seq_file *m, struct rq else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld%c %9Ld.%06ld%c %9Ld %5d ", p->comm, p->pid, SPLIT_NS(p->se.vruntime), +#ifdef CONFIG_FAIR_GROUP_SCHED + entity_eligible(&rq->cfs_root, &p->se) ? 'e' : ' ', + SPLIT_NS(p->se.deadline), + entity_expired(&rq->cfs_root, &p->se) ? 
'x' : ' ', +#else + ' ', + SPLIT_NS(0ULL), + ' ', +#endif (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS @@ -94,10 +103,12 @@ static void print_rq(struct seq_file *m, SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" - "------------------------------------------------------" - "----------------------------------------------------\n"); + " task PID timeline-key deadline-key " + " switches prio exec-runtime sum-exec" + " sum-sleep\n" + "----------------------------------------------------------" + "--------------------------------------------------" + "-----------------\n"); read_lock_irqsave(&tasklist_lock, flags); @@ -124,8 +135,8 @@ void print_cfs_root(struct seq_file *m, spin_lock_irqsave(&rq->lock, flags); if (cfs_r_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_r_rq))->vruntime; - last = __pick_last_entity(cfs_r_rq); + MIN_vruntime = (__pick_next_timeline(cfs_r_rq))->vruntime; + last = __pick_last_timeline(cfs_r_rq); if (last) max_vruntime = last->vruntime; min_vruntime = cfs_r_rq->min_vruntime; Index: linux-2.6/kernel/sched_fair.c =================================================================== --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -222,13 +222,118 @@ s64 entity_key(struct cfs_root_rq *cfs_r return se->vruntime - cfs_r_rq->min_vruntime; } +static inline struct rb_node *first_fair(struct cfs_root_rq *cfs_r_rq) +{ + return cfs_r_rq->rb_leftmost; +} + +static struct sched_entity *__pick_next_timeline(struct cfs_root_rq *cfs_r_rq) +{ + return rb_entry(first_fair(cfs_r_rq), + struct sched_entity, run_node); +} + +static inline +struct sched_entity *__pick_last_timeline(struct cfs_root_rq *cfs_r_rq) +{ + struct rb_node *last = rb_last(&cfs_r_rq->tasks_timeline); + + if (!last) + return NULL; + + return rb_entry(last, struct sched_entity, run_node); +} + #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Single run-queue group scheduling has 
two issues: + * - weight + * - latency isolation + * + * Weight is tackled by calculating it on the fly in a hierarchical fashion, + * see calc_delta_fair() and calc_delta_weight() - those walk up the hierarchy + * and perform a weight calculation for each layer. + * + * Latency isolation means that even if another group runs 1000 tasks your group + * with only 5 tasks will not have immense latencies. Ensuring this is the + * hardest part. The following will try to quickly describe how this code tries + * to solve that issue. + * + * ** DEADLINES ** + * + * So each group will have its own latency goal; with this we can associate a + * deadline for each task. On enqueue we calculate its group's latency goal + * and calculate a task's deadline. + * + * ** LAG ** + * + * lag(x) := PFS(x) - CFS(x) [1] + * + * Where PFS stands for Perfect Fair Scheduler, the ideal scheduler that has no + * problems with infinitely small time shares and no scheduling overhead and + * CFS is our scheduler. Due to quantization CFS will either be ahead of or + * behind PFS for any given task. + * + * ** ELIGIBILITY ** + * + * A task 'x' is eligible when lag(x) > 0; that is, the task has received + * less than its fair share of CPU time. + * + * Testing eligibility uses the property that the average lag over all tasks + * is 0 for a fair scheduler. Therefore: + * + * lag(x) = avg(vruntime) - x.vruntime [2] + * + * avg(vruntime) = (\Sum_{i} i.vruntime) / nr_running + * + * lag(x) > 0 -> [3] + * avg(vruntime) - x.vruntime > 0 -> + * avg(vruntime) > x.vruntime -> + * (\Sum_{i} i.vruntime) / nr_running > x.vruntime -> + * \Sum_{i} i.vruntime > x.vruntime * nr_running [4] + * + * ** EARLIEST ELIGIBLE DEADLINE FIRST ** + * + * So now we need a policy that takes two notions into account: fairness + * and latency. One such policy is: EEDF. + * + * EEDF will pick that task which has the earliest deadline among those + * tasks that are eligible. 
By only taking from the eligible tasks it stays + * fair, by preferring that task which has the earliest deadline it strives + * to meet the latency goal. + * + * In order to find this task EEDF augments the normal vruntime tree. + * For each node 'n' it keeps: + * + * n.min_deadline = + * min(n.deadline, n.left.min_deadline, n.right.min_deadline) [5] + * + * By using this 'inverse heap' properly it is possible to find the entry with + * min_deadline for each sub-tree in O(log n). + * + */ + +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se); + +static void +sched_calc_deadline(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + struct rq *rq = container_of(cfs_r_rq, struct rq, cfs_root); + + se->deadline = rq->clock + sched_vslice_add(cfs_rq_of(se), se); +} + static void avg_vruntime_add(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) { s64 key = entity_key(cfs_r_rq, se); cfs_r_rq->avg_vruntime += key; cfs_r_rq->nr_queued++; + + /* + * should be done before inserting into the tree + */ + sched_calc_deadline(cfs_r_rq, se); } static void @@ -264,6 +369,222 @@ static u64 avg_vruntime(struct cfs_root_ return cfs_r_rq->min_vruntime + avg; } +static inline +int entity_eligible(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + s64 vruntime = entity_key(cfs_r_rq, se); + + return (vruntime * cfs_r_rq->nr_queued) <= cfs_r_rq->avg_vruntime; +} + +static inline +int deadline_expired(struct cfs_root_rq *cfs_r_rq, u64 deadline) +{ + struct rq *rq = container_of(cfs_r_rq, struct rq, cfs_root); + + return (s64)(deadline - rq->clock) <= 0; +} + +static inline +int entity_expired(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + return deadline_expired(cfs_r_rq, se->deadline); +} + +static inline struct sched_entity *se_of(struct rb_node *node) +{ + return rb_entry(node, struct sched_entity, run_node); +} + +static inline s64 deadline_key(struct cfs_root_rq *cfs_r_rq, u64 deadline) +{ + struct rq *rq = 
container_of(cfs_r_rq, struct rq, cfs_root); + + return (s64)(deadline - rq->clock); +} + +#define deadline_gt(cfs_r_rq, field, lnode, rnode) \ +({ \ + deadline_key(cfs_r_rq, se_of(lnode)->field) > \ + deadline_key(cfs_r_rq, se_of(rnode)->field); \ +}) + +/* + * For a given sub-tree find the entity with min_deadline + */ +static struct sched_entity * +___pick_next_edf(struct cfs_root_rq *cfs_r_rq, struct rb_node *tree) +{ + struct rb_node *node = tree; + + for (; node; ) { + if (se_of(tree)->min_deadline == se_of(node)->deadline) + return se_of(node); + + if (node->rb_left && (se_of(node)->min_deadline == + se_of(node->rb_left)->min_deadline)) + node = node->rb_left; + else + node = node->rb_right; + } + + BUG(); +} + +/* + * Earliest Eligible Deadline First + * + * Walk down the tree, dividing eligible (left) from non eligible (right). + * While walking the dividing line, keep track of two nodes, @path and @tree. + * @path is the entity with the smallest deadline on the dividing line, + * @tree is the left sub-tree with the smallest deadline. + * + * At the bottom, pick between @path and EDF(@tree). + */ +static struct sched_entity *__pick_next_eedf(struct cfs_root_rq *cfs_r_rq) +{ + struct rb_node *node = cfs_r_rq->tasks_timeline.rb_node; + struct rb_node *tree = NULL, *path = NULL; + + while (node) { + if (entity_eligible(cfs_r_rq, se_of(node))) { + if (!path || deadline_gt(cfs_r_rq, deadline, + path, node)) + path = node; + + if (!tree || (node->rb_left && + deadline_gt(cfs_r_rq, min_deadline, + tree, node->rb_left))) + tree = node->rb_left; + + node = node->rb_right; + } else + node = node->rb_left; + } + + if (!tree || deadline_gt(cfs_r_rq, min_deadline, tree, path)) + return se_of(path); + + return ___pick_next_edf(cfs_r_rq, tree); +} + +/* + * Get the Earliest Deadline of the whole tree. 
+ */ +static struct sched_entity *__pick_next_edf(struct cfs_root_rq *cfs_r_rq) +{ + return ___pick_next_edf(cfs_r_rq, cfs_r_rq->tasks_timeline.rb_node); +} + +static struct sched_entity *__pick_next_entity(struct cfs_root_rq *cfs_r_rq) +{ + if (!sched_feat(EEDF)) + return __pick_next_timeline(cfs_r_rq); + + return __pick_next_eedf(cfs_r_rq); +} + +/* + * se->min_deadline = min(se->min_deadline, node->min_deadline) + */ +static void update_min_deadline(struct cfs_root_rq *cfs_r_rq, + struct sched_entity *se, struct rb_node *node) +{ + if (node && deadline_gt(cfs_r_rq, min_deadline, &se->run_node, node)) + se->min_deadline = se_of(node)->min_deadline; +} + +/* + * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + */ +static void update_node(struct cfs_root_rq *cfs_r_rq, struct rb_node *node) +{ + struct sched_entity *se = rb_entry(node, + struct sched_entity, run_node); + + se->min_deadline = se->deadline; + update_min_deadline(cfs_r_rq, se, node->rb_right); + update_min_deadline(cfs_r_rq, se, node->rb_left); +} + +/* + * update min_deadline for all nodes that could have been affected by + * a rebalance pass up from @node. 
+ */ +static void update_tree(struct cfs_root_rq *cfs_r_rq, struct rb_node *node) +{ + struct rb_node *parent; +up: + update_node(cfs_r_rq, node); + + parent = rb_parent(node); + if (!parent) + return; + + if (node == parent->rb_left && parent->rb_right) + update_node(cfs_r_rq, parent->rb_right); + else if (parent->rb_left) + update_node(cfs_r_rq, parent->rb_left); + + node = parent; + goto up; +} + +/* + * after inserting @se into the tree, update min_deadline to account for + * both the new deadline and any damage done by rebalance + */ +static void +update_tree_enqueue(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + struct rb_node *node = &se->run_node; + + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + + update_tree(cfs_r_rq, node); +} + +/* + * before removing the node, find the deepest node on the rebalance path that + * will still be there after @se gets removed + */ +static struct rb_node * +update_tree_dequeue_begin(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + struct rb_node *deepest; + struct rb_node *node = &se->run_node; + + if (!node->rb_right && !node->rb_left) + deepest = rb_parent(node); + else if (!node->rb_right) + deepest = node->rb_left; + else if (!node->rb_left) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/* + * now that the entity got removed, update min_deadline to undo the missing + * deadline and any rebalance damage + */ +static void +update_tree_dequeue_end(struct cfs_root_rq *cfs_r_rq, struct rb_node *node) +{ + if (node) + update_tree(cfs_r_rq, node); +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline @@ -282,6 +603,28 @@ static inline void avg_vruntime_update(struct cfs_root_rq *cfs_r_rq, s64 delta) { } + +static inline +void update_tree_enqueue(struct cfs_root_rq *cfs_r_rq, struct 
sched_entity *se) +{ +} + +static struct rb_node * +update_tree_dequeue_begin(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) +{ + return NULL; +} + +static void +update_tree_dequeue_end(struct cfs_root_rq *cfs_r_rq, struct rb_node *node) +{ +} + +static inline +struct sched_entity *__pick_next_entity(struct cfs_root_rq *cfs_r_rq) +{ + return __pick_next_timeline(cfs_r_rq); +} #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -341,11 +684,15 @@ __enqueue_timeline(struct cfs_root_rq *c rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_r_rq->tasks_timeline); + + update_tree_enqueue(cfs_r_rq, se); } static void __dequeue_timeline(struct cfs_root_rq *cfs_r_rq, struct sched_entity *se) { + struct rb_node *node = update_tree_dequeue_begin(cfs_r_rq, se); + if (cfs_r_rq->rb_leftmost == &se->run_node) { struct rb_node *next_node; @@ -354,7 +701,7 @@ __dequeue_timeline(struct cfs_root_rq *c if (next_node) { update_min_vruntime(cfs_r_rq, rb_entry(next_node, - struct sched_entity, run_node)); + struct sched_entity, run_node)); } } @@ -362,6 +709,8 @@ __dequeue_timeline(struct cfs_root_rq *c cfs_r_rq->next = NULL; rb_erase(&se->run_node, &cfs_r_rq->tasks_timeline); + + update_tree_dequeue_end(cfs_r_rq, node); } /* @@ -394,27 +743,6 @@ static void __dequeue_entity(struct cfs_ avg_vruntime_sub(&rq_of(cfs_rq)->cfs_root, se); } -static inline struct rb_node *first_fair(struct cfs_root_rq *cfs_r_rq) -{ - return cfs_r_rq->rb_leftmost; -} - -static struct sched_entity *__pick_next_entity(struct cfs_root_rq *cfs_r_rq) -{ - return rb_entry(first_fair(cfs_r_rq), struct sched_entity, run_node); -} - -static inline -struct sched_entity *__pick_last_entity(struct cfs_root_rq *cfs_r_rq) -{ - struct rb_node *last = rb_last(&cfs_r_rq->tasks_timeline); - - if (!last) - return NULL; - - return rb_entry(last, struct sched_entity, run_node); -} - /************************************************************** * Scheduling class statistics methods: */ @@ -472,7 +800,7 
@@ calc_delta_fair(unsigned long delta, str * * p = (nr <= nl) ? l : l*nr/nl */ -static u64 __sched_period(unsigned long nr_running) +static inline u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; unsigned long nr_latency = sched_nr_latency; @@ -789,7 +1117,7 @@ place_entity(struct cfs_rq *cfs_rq, stru if (first_fair(cfs_r_rq)) { vruntime = min_vruntime(cfs_r_rq->min_vruntime, - __pick_next_entity(cfs_r_rq)->vruntime); + __pick_next_timeline(cfs_r_rq)->vruntime); } else vruntime = cfs_r_rq->min_vruntime; @@ -1094,7 +1422,7 @@ static void yield_task_fair(struct rq *r /* * Find the rightmost entry in the rbtree: */ - rightmost = __pick_last_entity(&rq->cfs_root); + rightmost = __pick_last_timeline(&rq->cfs_root); /* * Already in the rightmost position? */ Index: linux-2.6/kernel/sched_features.h =================================================================== --- linux-2.6.orig/kernel/sched_features.h +++ linux-2.6/kernel/sched_features.h @@ -8,6 +8,8 @@ SCHED_FEAT(LB_BREAK, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(NORMALIZED_SLEEPER, 1) -SCHED_FEAT(DEADLINE, 1) +#ifdef CONFIG_FAIR_GROUP_SCHED +SCHED_FEAT(EEDF, 1) +#endif SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 0)