From 0765933dae9efa84e41d993d18c7508c05c94446 Mon Sep 17 00:00:00 2001 From: mav Date: Mon, 27 Feb 2012 10:31:54 +0000 Subject: [PATCH 085/175] Rework CPU load balancing in SCHED_ULE: - In sched_pickcpu() be more careful taking previous CPU on SMT systems. Do it only if all other logical CPUs of that physical one are idle to avoid extra resource sharing. - In sched_pickcpu() change general logic of CPU selection. First look for idle CPU, sharing last level cache with previously used one, skipping SMT CPU groups. If none found, search all CPUs for the least loaded one, where the thread with its priority can run now. If none found, search just for the least loaded CPU. - Make cpu_search() compare lowest/highest CPU load when comparing CPU groups with equal load. That allows to differentiate 1+1 and 2+0 loads. - Make cpu_search() to prefer specified (previous) CPU or group if load is equal. This improves cache affinity for more complicated topologies. - Randomize CPU selection if above factors are equal. Previous code tend to prefer CPUs with lower IDs, causing unneeded collisions. - Rework periodic balancer in sched_balance_group(). With cpu_search() more intelligent now, make balansing process flat, removing recursion over the topology tree. That fixes double swap problem and makes load distribution more even and predictable. All together this gives 10-15% performance improvement in many tests on CPUs with SMT, such as Core i7, for number of threads is less then number of logical CPUs. In some tests it also gives positive effect to systems without SMT. Reviewed by: jeff Tested by: flo, hackers@ MFC after: 1 month Sponsored by: iXsystems, Inc. git-svn-id: http://svn.freebsd.org/base/head@232207 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f (cherry picked from commit 2838dc43b58f8619e988d643153e7dcc98cc6f24) Signed-off-by: Xin Li --- sys/kern/sched_ule.c | 330 ++++++++++++++++++++++++++++---------------------- 1 file changed, 182 insertions(+), 148 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 2065b9f..126c135 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -258,7 +258,6 @@ struct cpu_group *cpu_top; /* CPU topology */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int affinity; -static int steal_htt = 1; static int steal_idle = 1; static int steal_thresh = 2; @@ -268,6 +267,7 @@ static int steal_thresh = 2; static struct tdq tdq_cpu[MAXCPU]; static struct tdq *balance_tdq; static int balance_ticks; +static DPCPU_DEFINE(uint32_t, randomval); #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) @@ -553,9 +553,11 @@ tdq_setlowpri(struct tdq *tdq, struct thread *ctd) #ifdef SMP struct cpu_search { cpuset_t cs_mask; - u_int cs_load; - u_int cs_cpu; - int cs_limit; /* Min priority for low min load for high. */ + u_int cs_prefer; + int cs_pri; /* Min priority for low. */ + int cs_limit; /* Max load for low, min load for high. */ + int cs_cpu; + int cs_load; }; #define CPU_SEARCH_LOWEST 0x1 @@ -566,44 +568,14 @@ struct cpu_search { for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \ if (CPU_ISSET(cpu, &mask)) -static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low, +static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match); -int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low); -int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high); -int cpu_search_both(struct cpu_group *cg, struct cpu_search *low, +int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low); +int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high); +int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high); /* - * This routine compares according to the match argument and should be - * reduced in actual instantiations via constant propagation and dead code - * elimination. - */ -static __inline int -cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, - const int match) -{ - struct tdq *tdq; - - tdq = TDQ_CPU(cpu); - if (match & CPU_SEARCH_LOWEST) - if (CPU_ISSET(cpu, &low->cs_mask) && - tdq->tdq_load < low->cs_load && - tdq->tdq_lowpri > low->cs_limit) { - low->cs_cpu = cpu; - low->cs_load = tdq->tdq_load; - } - if (match & CPU_SEARCH_HIGHEST) - if (CPU_ISSET(cpu, &high->cs_mask) && - tdq->tdq_load >= high->cs_limit && - tdq->tdq_load > high->cs_load && - tdq->tdq_transferable) { - high->cs_cpu = cpu; - high->cs_load = tdq->tdq_load; - } - return (tdq->tdq_load); -} - -/* * Search the tree of cpu_groups for the lowest or highest loaded cpu * according to the match argument. This routine actually compares the * load on all paths through the tree and finds the least loaded cpu on @@ -615,33 +587,42 @@ cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, * also recursive to the depth of the tree. */ static __inline int -cpu_search(struct cpu_group *cg, struct cpu_search *low, +cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match) { - int total; + struct cpu_search lgroup; + struct cpu_search hgroup; + cpuset_t cpumask; + struct cpu_group *child; + struct tdq *tdq; + int cpu, i, hload, lload, load, total, rnd; total = 0; - if (cg->cg_children) { - struct cpu_search lgroup; - struct cpu_search hgroup; - struct cpu_group *child; - u_int lload; - int hload; - int load; - int i; - - lload = -1; + cpumask = cg->cg_mask; + if (match & CPU_SEARCH_LOWEST) { + lload = INT_MAX; + low->cs_load = INT_MAX; + lgroup = *low; + } + if (match & CPU_SEARCH_HIGHEST) { hload = -1; - for (i = 0; i < cg->cg_children; i++) { + high->cs_load = -1; + hgroup = *high; + } + + /* Iterate through the child CPU groups and then remaining CPUs. */ + for (i = 0, cpu = 0; i <= cg->cg_children; ) { + if (i >= cg->cg_children) { + while (cpu <= mp_maxid && !CPU_ISSET(cpu, &cpumask)) + cpu++; + if (cpu > mp_maxid) + break; + child = NULL; + } else child = &cg->cg_child[i]; - if (match & CPU_SEARCH_LOWEST) { - lgroup = *low; - lgroup.cs_load = -1; - } - if (match & CPU_SEARCH_HIGHEST) { - hgroup = *high; - lgroup.cs_load = 0; - } + + if (child) { /* Handle child CPU group. */ + CPU_NAND(&cpumask, &child->cg_mask); switch (match) { case CPU_SEARCH_LOWEST: load = cpu_search_lowest(child, &lgroup); @@ -653,23 +634,52 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low, load = cpu_search_both(child, &lgroup, &hgroup); break; } - total += load; - if (match & CPU_SEARCH_LOWEST) - if (load < lload || low->cs_cpu == -1) { - *low = lgroup; - lload = load; + } else { /* Handle child CPU. */ + tdq = TDQ_CPU(cpu); + load = tdq->tdq_load * 256; + rnd = DPCPU_SET(randomval, + DPCPU_GET(randomval) * 69069 + 5) >> 26; + if (match & CPU_SEARCH_LOWEST) { + if (cpu == low->cs_prefer) + load -= 64; + /* If that CPU is allowed and get data. */ + if (CPU_ISSET(cpu, &lgroup.cs_mask) && + tdq->tdq_lowpri > lgroup.cs_pri && + tdq->tdq_load <= lgroup.cs_limit) { + lgroup.cs_cpu = cpu; + lgroup.cs_load = load - rnd; } - if (match & CPU_SEARCH_HIGHEST) - if (load > hload || high->cs_cpu == -1) { - hload = load; - *high = hgroup; + } + if (match & CPU_SEARCH_HIGHEST) + if (CPU_ISSET(cpu, &hgroup.cs_mask) && + tdq->tdq_load >= hgroup.cs_limit && + tdq->tdq_transferable) { + hgroup.cs_cpu = cpu; + hgroup.cs_load = load - rnd; } } - } else { - int cpu; - - CPUSET_FOREACH(cpu, cg->cg_mask) - total += cpu_compare(cpu, low, high, match); + total += load; + + /* We have info about child item. Compare it. */ + if (match & CPU_SEARCH_LOWEST) { + if ((load < lload) || + (load == lload && lgroup.cs_load < low->cs_load)) { + lload = load; + low->cs_cpu = lgroup.cs_cpu; + low->cs_load = lgroup.cs_load; + } + } + if (match & CPU_SEARCH_HIGHEST) + if ((load > hload) || + (load == hload && hgroup.cs_load > high->cs_load)) { + hload = load; + high->cs_cpu = hgroup.cs_cpu; + high->cs_load = hgroup.cs_load; + } + if (child) + i++; + else + cpu++; } return (total); } @@ -679,19 +689,19 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low, * optimization. */ int -cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low) +cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low) { return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); } int -cpu_search_highest(struct cpu_group *cg, struct cpu_search *high) +cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high) { return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); } int -cpu_search_both(struct cpu_group *cg, struct cpu_search *low, +cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high) { return cpu_search(cg, low, high, CPU_SEARCH_BOTH); @@ -703,14 +713,16 @@ cpu_search_both(struct cpu_group *cg, struct cpu_search *low, * acceptable. */ static inline int -sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri) +sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload, + int prefer) { struct cpu_search low; low.cs_cpu = -1; - low.cs_load = -1; + low.cs_prefer = prefer; low.cs_mask = mask; - low.cs_limit = pri; + low.cs_pri = pri; + low.cs_limit = maxload; cpu_search_lowest(cg, &low); return low.cs_cpu; } @@ -719,12 +731,11 @@ sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri) * Find the cpu with the highest load via the highest loaded path. */ static inline int -sched_highest(struct cpu_group *cg, cpuset_t mask, int minload) +sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload) { struct cpu_search high; high.cs_cpu = -1; - high.cs_load = 0; high.cs_mask = mask; high.cs_limit = minload; cpu_search_highest(cg, &high); @@ -735,17 +746,17 @@ sched_highest(struct cpu_group *cg, cpuset_t mask, int minload) * Simultaneously find the highest and lowest loaded cpu reachable via * cg. */ -static inline void -sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu) +static inline void +sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu) { struct cpu_search high; struct cpu_search low; low.cs_cpu = -1; - low.cs_limit = -1; - low.cs_load = -1; + low.cs_prefer = -1; + low.cs_pri = -1; + low.cs_limit = INT_MAX; low.cs_mask = mask; - high.cs_load = 0; high.cs_cpu = -1; high.cs_limit = -1; high.cs_mask = mask; @@ -758,30 +769,45 @@ sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu) static void sched_balance_group(struct cpu_group *cg) { - cpuset_t mask; - int high; - int low; - int i; + cpuset_t hmask, lmask; + int high, low, anylow; - CPU_FILL(&mask); + CPU_FILL(&hmask); for (;;) { - sched_both(cg, mask, &low, &high); - if (low == high || low == -1 || high == -1) + high = sched_highest(cg, hmask, 1); + /* Stop if there is no more CPU with transferrable threads. */ + if (high == -1) break; - if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) + CPU_CLR(high, &hmask); + CPU_COPY(&hmask, &lmask); + /* Stop if there is no more CPU left for low. */ + if (CPU_EMPTY(&lmask)) break; - /* - * If we failed to move any threads determine which cpu - * to kick out of the set and try again. - */ - if (TDQ_CPU(high)->tdq_transferable == 0) - CPU_CLR(high, &mask); - else - CPU_CLR(low, &mask); + anylow = 1; +nextlow: + low = sched_lowest(cg, lmask, -1, + TDQ_CPU(high)->tdq_load - 1, high); + /* Stop if we looked well and found no less loaded CPU. */ + if (anylow && low == -1) + break; + /* Go to next high if we found no less loaded CPU. */ + if (low == -1) + continue; + /* Transfer thread from high to low. */ + if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) { + /* CPU that got thread can no longer be a donor. */ + CPU_CLR(low, &hmask); + } else { + /* + * If failed, then there is no threads on high + * that can run on this low. Drop low from low + * mask and look for different one. + */ + CPU_CLR(low, &lmask); + anylow = 0; + goto nextlow; + } } - - for (i = 0; i < cg->cg_children; i++) - sched_balance_group(&cg->cg_child[i]); } static void @@ -834,32 +860,17 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two) static int sched_balance_pair(struct tdq *high, struct tdq *low) { - int transferable; - int high_load; - int low_load; int moved; - int move; int cpu; - int diff; - int i; tdq_lock_pair(high, low); - transferable = high->tdq_transferable; - high_load = high->tdq_load; - low_load = low->tdq_load; moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). */ - if (transferable != 0) { - diff = high_load - low_load; - move = diff / 2; - if (diff & 0x1) - move++; - move = min(move, transferable); - for (i = 0; i < move; i++) - moved += tdq_move(high, low); + if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && + (moved = tdq_move(high, low)) > 0) { /* * In case the target isn't the current cpu IPI it to force a * reschedule with the new workload. @@ -1003,8 +1014,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; - struct thread *td; - int first; + struct thread *td, *first; int bit; int pri; int i; @@ -1012,7 +1022,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start) rqb = &rq->rq_status; bit = start & (RQB_BPW -1); pri = 0; - first = 0; + first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) @@ -1031,7 +1041,7 @@ again: if (first && THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); - first = 1; + first = td; } } if (start != 0) { @@ -1039,6 +1049,9 @@ again: goto again; } + if (first && THREAD_CAN_MIGRATE(first) && + THREAD_CAN_SCHED(first, cpu)) + return (first); return (NULL); } @@ -1140,13 +1153,11 @@ SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { - struct cpu_group *cg; + struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t mask; - int self; - int pri; - int cpu; + int cpu, pri, self; self = PCPU_GET(cpuid); ts = td->td_sched; @@ -1161,45 +1172,70 @@ sched_pickcpu(struct thread *td, int flags) * Prefer to run interrupt threads on the processors that generate * the interrupt. */ + pri = td->td_priority; if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level && ts->ts_cpu != self) { SCHED_STAT_INC(pickcpu_intrbind); ts->ts_cpu = self; + if (TDQ_CPU(self)->tdq_lowpri > pri) { + SCHED_STAT_INC(pickcpu_affinity); + return (ts->ts_cpu); + } } /* * If the thread can run on the last cpu and the affinity has not * expired or it is idle run it there. */ - pri = td->td_priority; tdq = TDQ_CPU(ts->ts_cpu); - if (THREAD_CAN_SCHED(td, ts->ts_cpu)) { - if (tdq->tdq_lowpri > PRI_MIN_IDLE) { + cg = tdq->tdq_cg; + if (THREAD_CAN_SCHED(td, ts->ts_cpu) && + tdq->tdq_lowpri >= PRI_MIN_IDLE && + SCHED_AFFINITY(ts, CG_SHARE_L2)) { + if (cg->cg_flags & CG_FLAG_THREAD) { + CPUSET_FOREACH(cpu, cg->cg_mask) { + if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) + break; + } + } else + cpu = INT_MAX; + if (cpu > mp_maxid) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } - if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) { - SCHED_STAT_INC(pickcpu_affinity); - return (ts->ts_cpu); - } } /* - * Search for the highest level in the tree that still has affinity. + * Search for the last level cache CPU group in the tree. + * Skip caches with expired affinity time and SMT groups. + * Affinity to higher level caches will be handled less aggressively. */ - cg = NULL; - for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent) - if (SCHED_AFFINITY(ts, cg->cg_level)) - break; + for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { + if (cg->cg_flags & CG_FLAG_THREAD) + continue; + if (!SCHED_AFFINITY(ts, cg->cg_level)) + continue; + ccg = cg; + } + if (ccg != NULL) + cg = ccg; cpu = -1; + /* Search the group for the less loaded idle CPU we can run now. */ mask = td->td_cpuset->cs_mask; - if (cg) - cpu = sched_lowest(cg, mask, pri); + if (cg != NULL && cg != cpu_top && + CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0) + cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE), + INT_MAX, ts->ts_cpu); + /* Search globally for the less loaded CPU we can run now. */ + if (cpu == -1) + cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); + /* Search globally for the less loaded CPU. */ if (cpu == -1) - cpu = sched_lowest(cpu_top, mask, -1); + cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); /* * Compare the lowest loaded cpu to current cpu. */ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri && - TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) { + TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE && + TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } else @@ -2750,8 +2786,6 @@ SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average frequency in stathz ticks to run the long-term balancer"); -SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, - "Steals work from another hyper-threaded core on idle"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, -- 1.7.9.4