From 0765933dae9efa84e41d993d18c7508c05c94446 Mon Sep 17 00:00:00 2001
From: mav <mav@ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f>
Date: Mon, 27 Feb 2012 10:31:54 +0000
Subject: [PATCH 085/175] Rework CPU load balancing in SCHED_ULE:  - In
 sched_pickcpu() be more careful taking previous CPU
 on SMT systems. Do it only if all other logical
 CPUs of that physical one are idle to avoid extra
 resource sharing.  - In sched_pickcpu() change
 general logic of CPU selection. First look for idle
 CPU, sharing last level cache with previously used
 one, skipping SMT CPU groups. If none found, search
 all CPUs for the least loaded one, where the thread
 with its priority can run now. If none found,
 search just for the least loaded CPU.  - Make
 cpu_search() compare lowest/highest CPU load when
 comparing CPU groups with equal load. That allows
 to differentiate 1+1 and 2+0 loads.  - Make
 cpu_search() to prefer specified (previous) CPU or
 group if load is equal. This improves cache
 affinity for more complicated topologies.  -
 Randomize CPU selection if above factors are equal.
 Previous code tend to prefer CPUs with lower IDs,
 causing unneeded collisions.  - Rework periodic
 balancer in sched_balance_group(). With
 cpu_search() more intelligent now, make balansing
 process flat, removing recursion over the topology
 tree. That fixes double swap problem and makes load
 distribution more even and predictable.

All together this gives 10-15% performance improvement in many tests on
CPUs with SMT, such as Core i7, for number of threads is less then number
of logical CPUs. In some tests it also gives positive effect to systems
without SMT.

Reviewed by:	jeff
Tested by:	flo, hackers@
MFC after:	1 month
Sponsored by:	iXsystems, Inc.

git-svn-id: http://svn.freebsd.org/base/head@232207 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f
(cherry picked from commit 2838dc43b58f8619e988d643153e7dcc98cc6f24)

Signed-off-by: Xin Li <d@delphij.net>
---
 sys/kern/sched_ule.c |  330 ++++++++++++++++++++++++++++----------------------
 1 file changed, 182 insertions(+), 148 deletions(-)

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 2065b9f..126c135 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -258,7 +258,6 @@ struct cpu_group *cpu_top;		/* CPU topology */
 static int rebalance = 1;
 static int balance_interval = 128;	/* Default set in sched_initticks(). */
 static int affinity;
-static int steal_htt = 1;
 static int steal_idle = 1;
 static int steal_thresh = 2;
 
@@ -268,6 +267,7 @@ static int steal_thresh = 2;
 static struct tdq	tdq_cpu[MAXCPU];
 static struct tdq	*balance_tdq;
 static int balance_ticks;
+static DPCPU_DEFINE(uint32_t, randomval);
 
 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
@@ -553,9 +553,11 @@ tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
 #ifdef SMP
 struct cpu_search {
 	cpuset_t cs_mask;
-	u_int	cs_load;
-	u_int	cs_cpu;
-	int	cs_limit;	/* Min priority for low min load for high. */
+	u_int	cs_prefer;
+	int	cs_pri;		/* Min priority for low. */
+	int	cs_limit;	/* Max load for low, min load for high. */
+	int	cs_cpu;
+	int	cs_load;
 };
 
 #define	CPU_SEARCH_LOWEST	0x1
@@ -566,44 +568,14 @@ struct cpu_search {
 	for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)		\
 		if (CPU_ISSET(cpu, &mask))
 
-static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match);
-int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
-int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
-int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high);
 
 /*
- * This routine compares according to the match argument and should be
- * reduced in actual instantiations via constant propagation and dead code
- * elimination.
- */ 
-static __inline int
-cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
-    const int match)
-{
-	struct tdq *tdq;
-
-	tdq = TDQ_CPU(cpu);
-	if (match & CPU_SEARCH_LOWEST)
-		if (CPU_ISSET(cpu, &low->cs_mask) &&
-		    tdq->tdq_load < low->cs_load &&
-		    tdq->tdq_lowpri > low->cs_limit) {
-			low->cs_cpu = cpu;
-			low->cs_load = tdq->tdq_load;
-		}
-	if (match & CPU_SEARCH_HIGHEST)
-		if (CPU_ISSET(cpu, &high->cs_mask) &&
-		    tdq->tdq_load >= high->cs_limit && 
-		    tdq->tdq_load > high->cs_load &&
-		    tdq->tdq_transferable) {
-			high->cs_cpu = cpu;
-			high->cs_load = tdq->tdq_load;
-		}
-	return (tdq->tdq_load);
-}
-
-/*
  * Search the tree of cpu_groups for the lowest or highest loaded cpu
  * according to the match argument.  This routine actually compares the
  * load on all paths through the tree and finds the least loaded cpu on
@@ -615,33 +587,42 @@ cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
  * also recursive to the depth of the tree.
  */
 static __inline int
-cpu_search(struct cpu_group *cg, struct cpu_search *low,
+cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match)
 {
-	int total;
+	struct cpu_search lgroup;
+	struct cpu_search hgroup;
+	cpuset_t cpumask;
+	struct cpu_group *child;
+	struct tdq *tdq;
+	int cpu, i, hload, lload, load, total, rnd;
 
 	total = 0;
-	if (cg->cg_children) {
-		struct cpu_search lgroup;
-		struct cpu_search hgroup;
-		struct cpu_group *child;
-		u_int lload;
-		int hload;
-		int load;
-		int i;
-
-		lload = -1;
+	cpumask = cg->cg_mask;
+	if (match & CPU_SEARCH_LOWEST) {
+		lload = INT_MAX;
+		low->cs_load = INT_MAX;
+		lgroup = *low;
+	}
+	if (match & CPU_SEARCH_HIGHEST) {
 		hload = -1;
-		for (i = 0; i < cg->cg_children; i++) {
+		high->cs_load = -1;
+		hgroup = *high;
+	}
+
+	/* Iterate through the child CPU groups and then remaining CPUs. */
+	for (i = 0, cpu = 0; i <= cg->cg_children; ) {
+		if (i >= cg->cg_children) {
+			while (cpu <= mp_maxid && !CPU_ISSET(cpu, &cpumask))
+				cpu++;
+			if (cpu > mp_maxid)
+				break;
+			child = NULL;
+		} else
 			child = &cg->cg_child[i];
-			if (match & CPU_SEARCH_LOWEST) {
-				lgroup = *low;
-				lgroup.cs_load = -1;
-			}
-			if (match & CPU_SEARCH_HIGHEST) {
-				hgroup = *high;
-				lgroup.cs_load = 0;
-			}
+
+		if (child) {			/* Handle child CPU group. */
+			CPU_NAND(&cpumask, &child->cg_mask);
 			switch (match) {
 			case CPU_SEARCH_LOWEST:
 				load = cpu_search_lowest(child, &lgroup);
@@ -653,23 +634,52 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
 				load = cpu_search_both(child, &lgroup, &hgroup);
 				break;
 			}
-			total += load;
-			if (match & CPU_SEARCH_LOWEST)
-				if (load < lload || low->cs_cpu == -1) {
-					*low = lgroup;
-					lload = load;
+		} else {			/* Handle child CPU. */
+			tdq = TDQ_CPU(cpu);
+			load = tdq->tdq_load * 256;
+			rnd = DPCPU_SET(randomval,
+			    DPCPU_GET(randomval) * 69069 + 5) >> 26;
+			if (match & CPU_SEARCH_LOWEST) {
+				if (cpu == low->cs_prefer)
+					load -= 64;
+				/* If that CPU is allowed and get data. */
+				if (CPU_ISSET(cpu, &lgroup.cs_mask) &&
+				    tdq->tdq_lowpri > lgroup.cs_pri &&
+				    tdq->tdq_load <= lgroup.cs_limit) {
+					lgroup.cs_cpu = cpu;
+					lgroup.cs_load = load - rnd;
 				}
-			if (match & CPU_SEARCH_HIGHEST) 
-				if (load > hload || high->cs_cpu == -1) {
-					hload = load;
-					*high = hgroup;
+			}
+			if (match & CPU_SEARCH_HIGHEST)
+				if (CPU_ISSET(cpu, &hgroup.cs_mask) &&
+				    tdq->tdq_load >= hgroup.cs_limit &&
+				    tdq->tdq_transferable) {
+					hgroup.cs_cpu = cpu;
+					hgroup.cs_load = load - rnd;
 				}
 		}
-	} else {
-		int cpu;
-
-		CPUSET_FOREACH(cpu, cg->cg_mask)
-			total += cpu_compare(cpu, low, high, match);
+		total += load;
+
+		/* We have info about child item. Compare it. */
+		if (match & CPU_SEARCH_LOWEST) {
+			if ((load < lload) ||
+			    (load == lload && lgroup.cs_load < low->cs_load)) {
+				lload = load;
+				low->cs_cpu = lgroup.cs_cpu;
+				low->cs_load = lgroup.cs_load;
+			}
+		}
+		if (match & CPU_SEARCH_HIGHEST)
+			if ((load > hload) ||
+			    (load == hload && hgroup.cs_load > high->cs_load)) {
+				hload = load;
+				high->cs_cpu = hgroup.cs_cpu;
+				high->cs_load = hgroup.cs_load;
+			}
+		if (child)
+			i++;
+		else
+			cpu++;
 	}
 	return (total);
 }
@@ -679,19 +689,19 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
  * optimization.
  */
 int
-cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
+cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
 {
 	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
 }
 
 int
-cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
 {
 	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
 }
 
 int
-cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high)
 {
 	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
@@ -703,14 +713,16 @@ cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
  * acceptable.
  */
 static inline int
-sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
+sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+    int prefer)
 {
 	struct cpu_search low;
 
 	low.cs_cpu = -1;
-	low.cs_load = -1;
+	low.cs_prefer = prefer;
 	low.cs_mask = mask;
-	low.cs_limit = pri;
+	low.cs_pri = pri;
+	low.cs_limit = maxload;
 	cpu_search_lowest(cg, &low);
 	return low.cs_cpu;
 }
@@ -719,12 +731,11 @@ sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
  * Find the cpu with the highest load via the highest loaded path.
  */
 static inline int
-sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
 {
 	struct cpu_search high;
 
 	high.cs_cpu = -1;
-	high.cs_load = 0;
 	high.cs_mask = mask;
 	high.cs_limit = minload;
 	cpu_search_highest(cg, &high);
@@ -735,17 +746,17 @@ sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
  * Simultaneously find the highest and lowest loaded cpu reachable via
  * cg.
  */
-static inline void 
-sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
+static inline void
+sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
 {
 	struct cpu_search high;
 	struct cpu_search low;
 
 	low.cs_cpu = -1;
-	low.cs_limit = -1;
-	low.cs_load = -1;
+	low.cs_prefer = -1;
+	low.cs_pri = -1;
+	low.cs_limit = INT_MAX;
 	low.cs_mask = mask;
-	high.cs_load = 0;
 	high.cs_cpu = -1;
 	high.cs_limit = -1;
 	high.cs_mask = mask;
@@ -758,30 +769,45 @@ sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
 static void
 sched_balance_group(struct cpu_group *cg)
 {
-	cpuset_t mask;
-	int high;
-	int low;
-	int i;
+	cpuset_t hmask, lmask;
+	int high, low, anylow;
 
-	CPU_FILL(&mask);
+	CPU_FILL(&hmask);
 	for (;;) {
-		sched_both(cg, mask, &low, &high);
-		if (low == high || low == -1 || high == -1)
+		high = sched_highest(cg, hmask, 1);
+		/* Stop if there is no more CPU with transferrable threads. */
+		if (high == -1)
 			break;
-		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
+		CPU_CLR(high, &hmask);
+		CPU_COPY(&hmask, &lmask);
+		/* Stop if there is no more CPU left for low. */
+		if (CPU_EMPTY(&lmask))
 			break;
-		/*
-		 * If we failed to move any threads determine which cpu
-		 * to kick out of the set and try again.
-	 	 */
-		if (TDQ_CPU(high)->tdq_transferable == 0)
-			CPU_CLR(high, &mask);
-		else
-			CPU_CLR(low, &mask);
+		anylow = 1;
+nextlow:
+		low = sched_lowest(cg, lmask, -1,
+		    TDQ_CPU(high)->tdq_load - 1, high);
+		/* Stop if we looked well and found no less loaded CPU. */
+		if (anylow && low == -1)
+			break;
+		/* Go to next high if we found no less loaded CPU. */
+		if (low == -1)
+			continue;
+		/* Transfer thread from high to low. */
+		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
+			/* CPU that got thread can no longer be a donor. */
+			CPU_CLR(low, &hmask);
+		} else {
+			/*
+			 * If failed, then there is no threads on high
+			 * that can run on this low. Drop low from low
+			 * mask and look for different one.
+			 */
+			CPU_CLR(low, &lmask);
+			anylow = 0;
+			goto nextlow;
+		}
 	}
-
-	for (i = 0; i < cg->cg_children; i++)
-		sched_balance_group(&cg->cg_child[i]);
 }
 
 static void
@@ -834,32 +860,17 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two)
 static int
 sched_balance_pair(struct tdq *high, struct tdq *low)
 {
-	int transferable;
-	int high_load;
-	int low_load;
 	int moved;
-	int move;
 	int cpu;
-	int diff;
-	int i;
 
 	tdq_lock_pair(high, low);
-	transferable = high->tdq_transferable;
-	high_load = high->tdq_load;
-	low_load = low->tdq_load;
 	moved = 0;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
 	 * threads we actually have to give up (transferable).
 	 */
-	if (transferable != 0) {
-		diff = high_load - low_load;
-		move = diff / 2;
-		if (diff & 0x1)
-			move++;
-		move = min(move, transferable);
-		for (i = 0; i < move; i++)
-			moved += tdq_move(high, low);
+	if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
+	    (moved = tdq_move(high, low)) > 0) {
 		/*
 		 * In case the target isn't the current cpu IPI it to force a
 		 * reschedule with the new workload.
@@ -1003,8 +1014,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
 {
 	struct rqbits *rqb;
 	struct rqhead *rqh;
-	struct thread *td;
-	int first;
+	struct thread *td, *first;
 	int bit;
 	int pri;
 	int i;
@@ -1012,7 +1022,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
 	rqb = &rq->rq_status;
 	bit = start & (RQB_BPW -1);
 	pri = 0;
-	first = 0;
+	first = NULL;
 again:
 	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
 		if (rqb->rqb_bits[i] == 0)
@@ -1031,7 +1041,7 @@ again:
 			if (first && THREAD_CAN_MIGRATE(td) &&
 			    THREAD_CAN_SCHED(td, cpu))
 				return (td);
-			first = 1;
+			first = td;
 		}
 	}
 	if (start != 0) {
@@ -1039,6 +1049,9 @@ again:
 		goto again;
 	}
 
+	if (first && THREAD_CAN_MIGRATE(first) &&
+	    THREAD_CAN_SCHED(first, cpu))
+		return (first);
 	return (NULL);
 }
 
@@ -1140,13 +1153,11 @@ SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
 static int
 sched_pickcpu(struct thread *td, int flags)
 {
-	struct cpu_group *cg;
+	struct cpu_group *cg, *ccg;
 	struct td_sched *ts;
 	struct tdq *tdq;
 	cpuset_t mask;
-	int self;
-	int pri;
-	int cpu;
+	int cpu, pri, self;
 
 	self = PCPU_GET(cpuid);
 	ts = td->td_sched;
@@ -1161,45 +1172,70 @@ sched_pickcpu(struct thread *td, int flags)
 	 * Prefer to run interrupt threads on the processors that generate
 	 * the interrupt.
 	 */
+	pri = td->td_priority;
 	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
 	    curthread->td_intr_nesting_level && ts->ts_cpu != self) {
 		SCHED_STAT_INC(pickcpu_intrbind);
 		ts->ts_cpu = self;
+		if (TDQ_CPU(self)->tdq_lowpri > pri) {
+			SCHED_STAT_INC(pickcpu_affinity);
+			return (ts->ts_cpu);
+		}
 	}
 	/*
 	 * If the thread can run on the last cpu and the affinity has not
 	 * expired or it is idle run it there.
 	 */
-	pri = td->td_priority;
 	tdq = TDQ_CPU(ts->ts_cpu);
-	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
-		if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
+	cg = tdq->tdq_cg;
+	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
+	    tdq->tdq_lowpri >= PRI_MIN_IDLE &&
+	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
+		if (cg->cg_flags & CG_FLAG_THREAD) {
+			CPUSET_FOREACH(cpu, cg->cg_mask) {
+				if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
+					break;
+			}
+		} else
+			cpu = INT_MAX;
+		if (cpu > mp_maxid) {
 			SCHED_STAT_INC(pickcpu_idle_affinity);
 			return (ts->ts_cpu);
 		}
-		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
-			SCHED_STAT_INC(pickcpu_affinity);
-			return (ts->ts_cpu);
-		}
 	}
 	/*
-	 * Search for the highest level in the tree that still has affinity.
+	 * Search for the last level cache CPU group in the tree.
+	 * Skip caches with expired affinity time and SMT groups.
+	 * Affinity to higher level caches will be handled less aggressively.
 	 */
-	cg = NULL;
-	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
-		if (SCHED_AFFINITY(ts, cg->cg_level))
-			break;
+	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
+		if (cg->cg_flags & CG_FLAG_THREAD)
+			continue;
+		if (!SCHED_AFFINITY(ts, cg->cg_level))
+			continue;
+		ccg = cg;
+	}
+	if (ccg != NULL)
+		cg = ccg;
 	cpu = -1;
+	/* Search the group for the less loaded idle CPU we can run now. */
 	mask = td->td_cpuset->cs_mask;
-	if (cg)
-		cpu = sched_lowest(cg, mask, pri);
+	if (cg != NULL && cg != cpu_top &&
+	    CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
+		cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
+		    INT_MAX, ts->ts_cpu);
+	/* Search globally for the less loaded CPU we can run now. */
+	if (cpu == -1)
+		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+	/* Search globally for the less loaded CPU. */
 	if (cpu == -1)
-		cpu = sched_lowest(cpu_top, mask, -1);
+		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
 	/*
 	 * Compare the lowest loaded cpu to current cpu.
 	 */
 	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
-	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
+	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+	    TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
 		SCHED_STAT_INC(pickcpu_local);
 		cpu = self;
 	} else
@@ -2750,8 +2786,6 @@ SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
     &balance_interval, 0,
     "Average frequency in stathz ticks to run the long-term balancer");
-SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
-    "Steals work from another hyper-threaded core on idle");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
     "Attempts to steal work from other cores before idling");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
-- 
1.7.9.4