mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. Signed-off-by: Mel Gorman <mgorman@suse.de>

commit: b8593bfda1652755136333cdd362de125b283a9c [log] [tgz]
author: Mel Gorman <mgorman@suse.de> Wed Nov 21 01:18:23 2012 +0000
committer: Mel Gorman <mgorman@suse.de> Tue Dec 11 14:42:55 2012 +0000
tree: c0395d9cf775fd9225e81b055fc8f5540a14333a
parent: e42c8ff2999de1239a57d434bfbd8e9f2a56e814 [diff]
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c5fffa2..e850a23 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h

@@ -410,6 +410,9 @@
 	 */
 	unsigned long numa_next_scan;
 
+	/* numa_next_reset is when the PTE scanner period will be reset */
+	unsigned long numa_next_reset;
+
 	/* Restart point for scanning and setting pte_numa */
 	unsigned long numa_scan_offset;
 

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d95a23..0f4ff2b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h

@@ -1562,9 +1562,9 @@
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
 #ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages);
+extern void task_numa_fault(int node, int pages, bool migrated);
 #else
-static inline void task_numa_fault(int node, int pages)
+static inline void task_numa_fault(int node, int pages, bool migrated)
 {
 }
 #endif
@@ -2009,6 +2009,7 @@
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
 extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_settle_count;
 

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbfc484..9d255bc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c

@@ -1537,6 +1537,7 @@
 #ifdef CONFIG_NUMA_BALANCING
 	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
 		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_next_reset = jiffies;
 		p->mm->numa_scan_seq = 0;
 	}
 

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd18087..4b57786 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c

@@ -784,7 +784,8 @@
  * numa task sample period in ms
  */
 unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*16;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -806,20 +807,19 @@
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int node, int pages)
+void task_numa_fault(int node, int pages, bool migrated)
 {
 	struct task_struct *p = current;
 
 	/* FIXME: Allocate task-specific structure for placement policy here */
 
 	/*
-	 * Assume that as faults occur that pages are getting properly placed
-	 * and fewer NUMA hints are required. Note that this is a big
-	 * assumption, it assumes processes reach a steady steady with no
-	 * further phase changes.
+	 * If pages are properly placed (did not migrate) then scan slower.
+	 * This is reset periodically in case of phase changes
 	 */
-	p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-				p->numa_scan_period + jiffies_to_msecs(2));
+        if (!migrated)
+		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+			p->numa_scan_period + jiffies_to_msecs(10));
 
 	task_numa_placement(p);
 }
@@ -858,6 +858,19 @@
 		return;
 
 	/*
+	 * Reset the scan period if enough time has gone by. Objective is that
+	 * scanning will be reduced if pages are properly placed. As tasks
+	 * can enter different phases this needs to be re-examined. Lacking
+	 * proper tracking of reference behaviour, this blunt hammer is used.
+	 */
+	migrate = mm->numa_next_reset;
+	if (time_after(now, migrate)) {
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+		xchg(&mm->numa_next_reset, next_scan);
+	}
+
+	/*
 	 * Enforce maximal scan/migration frequency..
 	 */
 	migrate = mm->numa_next_scan;

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 48a68cc..8906f90 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c

@@ -367,6 +367,13 @@
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "numa_balancing_scan_period_reset",
+		.data		= &sysctl_numa_balancing_scan_period_reset,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "numa_balancing_scan_period_max_ms",
 		.data		= &sysctl_numa_balancing_scan_period_max,
 		.maxlen		= sizeof(unsigned int),

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 79b9606..199b261 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c

@@ -1068,7 +1068,7 @@
 	spin_unlock(&mm->page_table_lock);
 	if (page) {
 		put_page(page);
-		task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+		task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
 	}
 	return 0;
 }

diff --git a/mm/memory.c b/mm/memory.c
index 84c6d9e..39edb11 100644
--- a/mm/memory.c
+++ b/mm/memory.c

@@ -3468,6 +3468,7 @@
 	spinlock_t *ptl;
 	int current_nid = -1;
 	int target_nid;
+	bool migrated = false;
 
 	/*
 	* The "pte" at this point cannot be used safely without
@@ -3509,12 +3510,13 @@
 	}
 
 	/* Migrate to the requested node */
-	if (migrate_misplaced_page(page, target_nid))
+	migrated = migrate_misplaced_page(page, target_nid);
+	if (migrated)
 		current_nid = target_nid;
 
 out:
 	if (current_nid != -1)
-		task_numa_fault(current_nid, 1);
+		task_numa_fault(current_nid, 1, migrated);
 	return 0;
 }
 
@@ -3554,6 +3556,7 @@
 		struct page *page;
 		int curr_nid = local_nid;
 		int target_nid;
+		bool migrated;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3590,9 +3593,10 @@
 
 		/* Migrate to the requested node */
 		pte_unmap_unlock(pte, ptl);
-		if (migrate_misplaced_page(page, target_nid))
+		migrated = migrate_misplaced_page(page, target_nid);
+		if (migrated)
 			curr_nid = target_nid;
-		task_numa_fault(curr_nid, 1);
+		task_numa_fault(curr_nid, 1, migrated);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
commit	b8593bfda1652755136333cdd362de125b283a9c	[log] [tgz]
author	Mel Gorman <mgorman@suse.de>	Wed Nov 21 01:18:23 2012 +0000
committer	Mel Gorman <mgorman@suse.de>	Tue Dec 11 14:42:55 2012 +0000
tree	c0395d9cf775fd9225e81b055fc8f5540a14333a
parent	e42c8ff2999de1239a57d434bfbd8e9f2a56e814 [diff]