linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [git pull] cpus4096 fixes
@ 2008-07-27 19:06 Ingo Molnar
  2008-07-27 20:15 ` Linus Torvalds
  2008-07-28  0:53 ` Rusty Russell
  0 siblings, 2 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-27 19:06 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton, Mike Travis, Rusty Russell

Linus,

Please pull the latest cpus4096-fixes git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git cpus4096

this fixes the cpumask_of_cpu API fallout described here:

   http://lkml.org/lkml/2008/7/23/76

... and the fix is wider than i'd like it to be, so close to -rc1 - but 
it's the cleanest one and it has Rusty's ack as well. These changes have 
been tested thoroughly on x86 in the past 2-3 days and caused no 
problems. Mike cross-compiled them on a lot of architectures:

   http://lkml.org/lkml/2008/7/24/483

or we could also delay it to after -rc1.

Thanks,

	Ingo

------------------>
Ingo Molnar (1):
      cpumask: export cpumask_of_cpu_map

Mike Travis (3):
      cpumask: make cpumask_of_cpu_map generic
      cpumask: put cpumask_of_cpu_map in the initdata section
      cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu


 arch/x86/kernel/acpi/cstate.c                    |    3 +-
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c       |   10 +--
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c        |   15 +--
 arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c |   12 +--
 arch/x86/kernel/cpu/cpufreq/speedstep-ich.c      |    3 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c            |    3 +-
 arch/x86/kernel/ldt.c                            |    6 +-
 arch/x86/kernel/microcode.c                      |   17 +---
 arch/x86/kernel/reboot.c                         |   11 +--
 arch/x86/kernel/setup_percpu.c                   |   10 +-
 drivers/acpi/processor_throttling.c              |   11 +--
 drivers/firmware/dcdbas.c                        |    3 +-
 drivers/misc/sgi-xp/xpc_main.c                   |    3 +-
 include/linux/cpumask.h                          |   41 +-------
 kernel/cpu.c                                     |  113 ++++++++++++++++++++++
 kernel/stop_machine.c                            |    3 +-
 kernel/time/tick-common.c                        |    8 +-
 kernel/trace/trace_sysprof.c                     |    4 +-
 lib/smp_processor_id.c                           |    5 +-
 net/sunrpc/svc.c                                 |    3 +-
 20 files changed, 159 insertions(+), 125 deletions(-)

diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 9220cf4..c2502eb 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -73,7 +73,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
 	cpumask_t saved_mask;
-	cpumask_of_cpu_ptr(new_mask, cpu);
 	int retval;
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int edx_part;
@@ -92,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 
 	/* Make sure we are running on right CPU */
 	saved_mask = current->cpus_allowed;
-	retval = set_cpus_allowed_ptr(current, new_mask);
+	retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 	if (retval)
 		return -1;
 
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ff2fff5..dd097b8 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -200,12 +200,10 @@ static void drv_read(struct drv_cmd *cmd)
 static void drv_write(struct drv_cmd *cmd)
 {
 	cpumask_t saved_mask = current->cpus_allowed;
-	cpumask_of_cpu_ptr_declare(cpu_mask);
 	unsigned int i;
 
 	for_each_cpu_mask_nr(i, cmd->mask) {
-		cpumask_of_cpu_ptr_next(cpu_mask, i);
-		set_cpus_allowed_ptr(current, cpu_mask);
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
 		do_drv_write(cmd);
 	}
 
@@ -269,12 +267,11 @@ static unsigned int get_measured_perf(unsigned int cpu)
 	} aperf_cur, mperf_cur;
 
 	cpumask_t saved_mask;
-	cpumask_of_cpu_ptr(cpu_mask, cpu);
 	unsigned int perf_percent;
 	unsigned int retval;
 
 	saved_mask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, cpu_mask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 	if (get_cpu() != cpu) {
 		/* We were not able to run on requested processor */
 		put_cpu();
@@ -340,7 +337,6 @@ static unsigned int get_measured_perf(unsigned int cpu)
 
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
-	cpumask_of_cpu_ptr(cpu_mask, cpu);
 	struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
 	unsigned int freq;
 	unsigned int cached_freq;
@@ -353,7 +349,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 	}
 
 	cached_freq = data->freq_table[data->acpi_data->state].frequency;
-	freq = extract_freq(get_cur_val(cpu_mask), data);
+	freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
 	if (freq != cached_freq) {
 		/*
 		 * The dreaded BIOS frequency change behind our back.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 53c7b69..c45ca6d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -479,12 +479,11 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
 static int check_supported_cpu(unsigned int cpu)
 {
 	cpumask_t oldmask;
-	cpumask_of_cpu_ptr(cpu_mask, cpu);
 	u32 eax, ebx, ecx, edx;
 	unsigned int rc = 0;
 
 	oldmask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, cpu_mask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 
 	if (smp_processor_id() != cpu) {
 		printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -1017,7 +1016,6 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
 static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
 {
 	cpumask_t oldmask;
-	cpumask_of_cpu_ptr(cpu_mask, pol->cpu);
 	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
 	u32 checkfid;
 	u32 checkvid;
@@ -1032,7 +1030,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
 
 	/* only run on specific CPU from here on */
 	oldmask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, cpu_mask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
 
 	if (smp_processor_id() != pol->cpu) {
 		printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1107,7 +1105,6 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 {
 	struct powernow_k8_data *data;
 	cpumask_t oldmask;
-	cpumask_of_cpu_ptr_declare(newmask);
 	int rc;
 
 	if (!cpu_online(pol->cpu))
@@ -1159,8 +1156,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 
 	/* only run on specific CPU from here on */
 	oldmask = current->cpus_allowed;
-	cpumask_of_cpu_ptr_next(newmask, pol->cpu);
-	set_cpus_allowed_ptr(current, newmask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
 
 	if (smp_processor_id() != pol->cpu) {
 		printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1182,7 +1178,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	set_cpus_allowed_ptr(current, &oldmask);
 
 	if (cpu_family == CPU_HW_PSTATE)
-		pol->cpus = *newmask;
+		pol->cpus = cpumask_of_cpu(pol->cpu);
 	else
 		pol->cpus = per_cpu(cpu_core_map, pol->cpu);
 	data->available_cores = &(pol->cpus);
@@ -1248,7 +1244,6 @@ static unsigned int powernowk8_get (unsigned int cpu)
 {
 	struct powernow_k8_data *data;
 	cpumask_t oldmask = current->cpus_allowed;
-	cpumask_of_cpu_ptr(newmask, cpu);
 	unsigned int khz = 0;
 	unsigned int first;
 
@@ -1258,7 +1253,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
 	if (!data)
 		return -EINVAL;
 
-	set_cpus_allowed_ptr(current, newmask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 	if (smp_processor_id() != cpu) {
 		printk(KERN_ERR PFX
 			"limiting to CPU %d failed in powernowk8_get\n", cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index ca2ac13..15e13c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -324,10 +324,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
 	unsigned l, h;
 	unsigned clock_freq;
 	cpumask_t saved_mask;
-	cpumask_of_cpu_ptr(new_mask, cpu);
 
 	saved_mask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, new_mask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 	if (smp_processor_id() != cpu)
 		return 0;
 
@@ -585,15 +584,12 @@ static int centrino_target (struct cpufreq_policy *policy,
 		 * Best effort undo..
 		 */
 
-		if (!cpus_empty(*covered_cpus)) {
-			cpumask_of_cpu_ptr_declare(new_mask);
-
+		if (!cpus_empty(*covered_cpus))
 			for_each_cpu_mask_nr(j, *covered_cpus) {
-				cpumask_of_cpu_ptr_next(new_mask, j);
-				set_cpus_allowed_ptr(current, new_mask);
+				set_cpus_allowed_ptr(current,
+						     &cpumask_of_cpu(j));
 				wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
 			}
-		}
 
 		tmp = freqs.new;
 		freqs.new = freqs.old;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2f3728d..191f726 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -244,8 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
 
 static unsigned int speedstep_get(unsigned int cpu)
 {
-	cpumask_of_cpu_ptr(newmask, cpu);
-	return _speedstep_get(newmask);
+	return _speedstep_get(&cpumask_of_cpu(cpu));
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 650d40f..6b0a10b 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -516,7 +516,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 	unsigned long		j;
 	int			retval;
 	cpumask_t		oldmask;
-	cpumask_of_cpu_ptr(newmask, cpu);
 
 	if (num_cache_leaves == 0)
 		return -ENOENT;
@@ -527,7 +526,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 		return -ENOMEM;
 
 	oldmask = current->cpus_allowed;
-	retval = set_cpus_allowed_ptr(current, newmask);
+	retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 	if (retval)
 		goto out;
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 3fee2aa..b68e21f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -62,12 +62,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 
 	if (reload) {
 #ifdef CONFIG_SMP
-		cpumask_of_cpu_ptr_declare(mask);
-
 		preempt_disable();
 		load_LDT(pc);
-		cpumask_of_cpu_ptr_next(mask, smp_processor_id());
-		if (!cpus_equal(current->mm->cpu_vm_mask, *mask))
+		if (!cpus_equal(current->mm->cpu_vm_mask,
+				cpumask_of_cpu(smp_processor_id())))
 			smp_call_function(flush_ldt, current->mm, 1);
 		preempt_enable();
 #else
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 6994c75..652fa5c 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -388,7 +388,6 @@ static int do_microcode_update (void)
 	void *new_mc = NULL;
 	int cpu;
 	cpumask_t old;
-	cpumask_of_cpu_ptr_declare(newmask);
 
 	old = current->cpus_allowed;
 
@@ -405,8 +404,7 @@ static int do_microcode_update (void)
 
 			if (!uci->valid)
 				continue;
-			cpumask_of_cpu_ptr_next(newmask, cpu);
-			set_cpus_allowed_ptr(current, newmask);
+			set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 			error = get_maching_microcode(new_mc, cpu);
 			if (error < 0)
 				goto out;
@@ -576,7 +574,6 @@ static int apply_microcode_check_cpu(int cpu)
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	cpumask_t old;
-	cpumask_of_cpu_ptr(newmask, cpu);
 	unsigned int val[2];
 	int err = 0;
 
@@ -585,7 +582,7 @@ static int apply_microcode_check_cpu(int cpu)
 		return 0;
 
 	old = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, newmask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 
 	/* Check if the microcode we have in memory matches the CPU */
 	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -623,12 +620,11 @@ static int apply_microcode_check_cpu(int cpu)
 static void microcode_init_cpu(int cpu, int resume)
 {
 	cpumask_t old;
-	cpumask_of_cpu_ptr(newmask, cpu);
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 	old = current->cpus_allowed;
 
-	set_cpus_allowed_ptr(current, newmask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 	mutex_lock(&microcode_mutex);
 	collect_cpu_info(cpu);
 	if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
@@ -661,13 +657,10 @@ static ssize_t reload_store(struct sys_device *dev,
 	if (end == buf)
 		return -EINVAL;
 	if (val == 1) {
-		cpumask_t old;
-		cpumask_of_cpu_ptr(newmask, cpu);
-
-		old = current->cpus_allowed;
+		cpumask_t old = current->cpus_allowed;
 
 		get_online_cpus();
-		set_cpus_allowed_ptr(current, newmask);
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 
 		mutex_lock(&microcode_mutex);
 		if (uci->valid)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 06a9f64..724adfc 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -414,25 +414,20 @@ void native_machine_shutdown(void)
 
 	/* The boot cpu is always logical cpu 0 */
 	int reboot_cpu_id = 0;
-	cpumask_of_cpu_ptr(newmask, reboot_cpu_id);
 
 #ifdef CONFIG_X86_32
 	/* See if there has been given a command line override */
 	if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
-		cpu_online(reboot_cpu)) {
+		cpu_online(reboot_cpu))
 		reboot_cpu_id = reboot_cpu;
-		cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
-	}
 #endif
 
 	/* Make certain the cpu I'm about to reboot on is online */
-	if (!cpu_online(reboot_cpu_id)) {
+	if (!cpu_online(reboot_cpu_id))
 		reboot_cpu_id = smp_processor_id();
-		cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
-	}
 
 	/* Make certain I only run on the appropriate processor */
-	set_cpus_allowed_ptr(current, newmask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
 
 	/* O.K Now that I'm on the appropriate processor,
 	 * stop all of the others.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index f7745f9..1cd53df 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -81,10 +81,12 @@ static void __init setup_per_cpu_maps(void)
 }
 
 #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
-
-/* requires nr_cpu_ids to be initialized */
+/*
+ * Replace static cpumask_of_cpu_map in the initdata section,
+ * with one that's allocated sized by the possible number of cpus.
+ *
+ * (requires nr_cpu_ids to be initialized)
+ */
 static void __init setup_cpumask_of_cpu(void)
 {
 	int i;
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index a2c3f9c..a56fc6c 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -827,7 +827,6 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
 static int acpi_processor_get_throttling(struct acpi_processor *pr)
 {
 	cpumask_t saved_mask;
-	cpumask_of_cpu_ptr_declare(new_mask);
 	int ret;
 
 	if (!pr)
@@ -839,8 +838,7 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr)
 	 * Migrate task to the cpu pointed by pr.
 	 */
 	saved_mask = current->cpus_allowed;
-	cpumask_of_cpu_ptr_next(new_mask, pr->id);
-	set_cpus_allowed_ptr(current, new_mask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id));
 	ret = pr->throttling.acpi_processor_get_throttling(pr);
 	/* restore the previous state */
 	set_cpus_allowed_ptr(current, &saved_mask);
@@ -989,7 +987,6 @@ static int acpi_processor_set_throttling_ptc(struct acpi_processor *pr,
 int acpi_processor_set_throttling(struct acpi_processor *pr, int state)
 {
 	cpumask_t saved_mask;
-	cpumask_of_cpu_ptr_declare(new_mask);
 	int ret = 0;
 	unsigned int i;
 	struct acpi_processor *match_pr;
@@ -1028,8 +1025,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state)
 	 * it can be called only for the cpu pointed by pr.
 	 */
 	if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) {
-		cpumask_of_cpu_ptr_next(new_mask, pr->id);
-		set_cpus_allowed_ptr(current, new_mask);
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id));
 		ret = p_throttling->acpi_processor_set_throttling(pr,
 						t_state.target_state);
 	} else {
@@ -1060,8 +1056,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state)
 				continue;
 			}
 			t_state.cpu = i;
-			cpumask_of_cpu_ptr_next(new_mask, i);
-			set_cpus_allowed_ptr(current, new_mask);
+			set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
 			ret = match_pr->throttling.
 				acpi_processor_set_throttling(
 				match_pr, t_state.target_state);
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index c66817e..50a071f 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -245,7 +245,6 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
 static int smi_request(struct smi_cmd *smi_cmd)
 {
 	cpumask_t old_mask;
-	cpumask_of_cpu_ptr(new_mask, 0);
 	int ret = 0;
 
 	if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@ -256,7 +255,7 @@ static int smi_request(struct smi_cmd *smi_cmd)
 
 	/* SMI requires CPU 0 */
 	old_mask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, new_mask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(0));
 	if (smp_processor_id() != 0) {
 		dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
 			__func__);
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 579b01f..c3b4227 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -229,11 +229,10 @@ xpc_hb_checker(void *ignore)
 	int last_IRQ_count = 0;
 	int new_IRQ_count;
 	int force_IRQ = 0;
-	cpumask_of_cpu_ptr(cpumask, XPC_HB_CHECK_CPU);
 
 	/* this thread was marked active by xpc_hb_init() */
 
-	set_cpus_allowed_ptr(current, cpumask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU));
 
 	/* set our heartbeating to other partitions into motion */
 	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1b5c98e..8fa3b6d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -62,15 +62,7 @@
  * int next_cpu_nr(cpu, mask)		Next cpu past 'cpu', or nr_cpu_ids
  *
  * cpumask_t cpumask_of_cpu(cpu)	Return cpumask with bit 'cpu' set
- *ifdef CONFIG_HAS_CPUMASK_OF_CPU
- * cpumask_of_cpu_ptr_declare(v)	Declares cpumask_t *v
- * cpumask_of_cpu_ptr_next(v, cpu)	Sets v = &cpumask_of_cpu_map[cpu]
- * cpumask_of_cpu_ptr(v, cpu)		Combines above two operations
- *else
- * cpumask_of_cpu_ptr_declare(v)	Declares cpumask_t _v and *v = &_v
- * cpumask_of_cpu_ptr_next(v, cpu)	Sets _v = cpumask_of_cpu(cpu)
- * cpumask_of_cpu_ptr(v, cpu)		Combines above two operations
- *endif
+ *					(can be used as an lvalue)
  * CPU_MASK_ALL				Initializer - all bits set
  * CPU_MASK_NONE			Initializer - no bits set
  * unsigned long *cpus_addr(mask)	Array of unsigned long's in mask
@@ -274,36 +266,9 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 }
 
 
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-extern cpumask_t *cpumask_of_cpu_map;
+/* cpumask_of_cpu_map[] is in kernel/cpu.c */
+extern const cpumask_t *cpumask_of_cpu_map;
 #define cpumask_of_cpu(cpu)	(cpumask_of_cpu_map[cpu])
-#define	cpumask_of_cpu_ptr(v, cpu)					\
-		const cpumask_t *v = &cpumask_of_cpu(cpu)
-#define	cpumask_of_cpu_ptr_declare(v)					\
-		const cpumask_t *v
-#define cpumask_of_cpu_ptr_next(v, cpu)					\
-					v = &cpumask_of_cpu(cpu)
-#else
-#define cpumask_of_cpu(cpu)						\
-({									\
-	typeof(_unused_cpumask_arg_) m;					\
-	if (sizeof(m) == sizeof(unsigned long)) {			\
-		m.bits[0] = 1UL<<(cpu);					\
-	} else {							\
-		cpus_clear(m);						\
-		cpu_set((cpu), m);					\
-	}								\
-	m;								\
-})
-#define	cpumask_of_cpu_ptr(v, cpu) 					\
-		cpumask_t _##v = cpumask_of_cpu(cpu);			\
-		const cpumask_t *v = &_##v
-#define	cpumask_of_cpu_ptr_declare(v)					\
-		cpumask_t _##v;						\
-		const cpumask_t *v = &_##v
-#define cpumask_of_cpu_ptr_next(v, cpu)					\
-					_##v = cpumask_of_cpu(cpu)
-#endif
 
 #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 10ba5f1..a35d899 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -461,3 +461,116 @@ out:
 #endif /* CONFIG_PM_SLEEP_SMP */
 
 #endif /* CONFIG_SMP */
+
+/* 64 bits of zeros, for initializers. */
+#if BITS_PER_LONG == 32
+#define Z64 0, 0
+#else
+#define Z64 0
+#endif
+
+/* Initializer macros. */
+#define CMI0(n) { .bits = { 1UL << (n) } }
+#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } }
+
+#define CMI8(n, ...)						\
+	CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__),		\
+	CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__),	\
+	CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__),	\
+	CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__)
+
+#if BITS_PER_LONG == 32
+#define CMI64(n, ...)							\
+	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
+	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
+	CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__),	\
+	CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__)
+#else
+#define CMI64(n, ...)							\
+	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
+	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
+	CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__),	\
+	CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__)
+#endif
+
+#define CMI256(n, ...)							\
+	CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__),	\
+	CMI64((n)+128, Z64, Z64, __VA_ARGS__),				\
+	CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__)
+#define Z256 Z64, Z64, Z64, Z64
+
+#define CMI1024(n, ...)					\
+	CMI256((n), __VA_ARGS__),			\
+	CMI256((n)+256, Z256, __VA_ARGS__),		\
+	CMI256((n)+512, Z256, Z256, __VA_ARGS__),	\
+	CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__)
+#define Z1024 Z256, Z256, Z256, Z256
+
+/* We want this statically initialized, just to be safe.  We try not
+ * to waste too much space, either. */
+static const cpumask_t cpumask_map[]
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+__initdata
+#endif
+= {
+	CMI0(0), CMI0(1), CMI0(2), CMI0(3),
+#if NR_CPUS > 4
+	CMI0(4), CMI0(5), CMI0(6), CMI0(7),
+#endif
+#if NR_CPUS > 8
+	CMI0(8), CMI0(9), CMI0(10), CMI0(11),
+	CMI0(12), CMI0(13), CMI0(14), CMI0(15),
+#endif
+#if NR_CPUS > 16
+	CMI0(16), CMI0(17), CMI0(18), CMI0(19),
+	CMI0(20), CMI0(21), CMI0(22), CMI0(23),
+	CMI0(24), CMI0(25), CMI0(26), CMI0(27),
+	CMI0(28), CMI0(29), CMI0(30), CMI0(31),
+#endif
+#if NR_CPUS > 32
+#if BITS_PER_LONG == 32
+	CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0),
+	CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0),
+	CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0),
+	CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0),
+	CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0),
+	CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0),
+	CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0),
+	CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0),
+#else
+	CMI0(32), CMI0(33), CMI0(34), CMI0(35),
+	CMI0(36), CMI0(37), CMI0(38), CMI0(39),
+	CMI0(40), CMI0(41), CMI0(42), CMI0(43),
+	CMI0(44), CMI0(45), CMI0(46), CMI0(47),
+	CMI0(48), CMI0(49), CMI0(50), CMI0(51),
+	CMI0(52), CMI0(53), CMI0(54), CMI0(55),
+	CMI0(56), CMI0(57), CMI0(58), CMI0(59),
+	CMI0(60), CMI0(61), CMI0(62), CMI0(63),
+#endif /* BITS_PER_LONG == 64 */
+#endif
+#if NR_CPUS > 64
+	CMI64(64, Z64),
+#endif
+#if NR_CPUS > 128
+	CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64),
+#endif
+#if NR_CPUS > 256
+	CMI256(256, Z256),
+#endif
+#if NR_CPUS > 512
+	CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256),
+#endif
+#if NR_CPUS > 1024
+	CMI1024(1024, Z1024),
+#endif
+#if NR_CPUS > 2048
+	CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024),
+#endif
+#if NR_CPUS > 4096
+#error NR_CPUS too big.  Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+#endif
+};
+
+const cpumask_t *cpumask_of_cpu_map = cpumask_map;
+
+EXPORT_SYMBOL_GPL(cpumask_of_cpu_map);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 738b411..ba9b205 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -33,9 +33,8 @@ static int stopmachine(void *cpu)
 {
 	int irqs_disabled = 0;
 	int prepared = 0;
-	cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
 
-	set_cpus_allowed_ptr(current, cpumask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
 
 	/* Ack: we are alive */
 	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bf43284..80c4336 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -196,12 +196,10 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	struct tick_device *td;
 	int cpu, ret = NOTIFY_OK;
 	unsigned long flags;
-	cpumask_of_cpu_ptr_declare(cpumask);
 
 	spin_lock_irqsave(&tick_device_lock, flags);
 
 	cpu = smp_processor_id();
-	cpumask_of_cpu_ptr_next(cpumask, cpu);
 	if (!cpu_isset(cpu, newdev->cpumask))
 		goto out_bc;
 
@@ -209,7 +207,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	curdev = td->evtdev;
 
 	/* cpu local device ? */
-	if (!cpus_equal(newdev->cpumask, *cpumask)) {
+	if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
 
 		/*
 		 * If the cpu affinity of the device interrupt can not
@@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		 * If we have a cpu local device already, do not replace it
 		 * by a non cpu local device
 		 */
-		if (curdev && cpus_equal(curdev->cpumask, *cpumask))
+		if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
 			goto out_bc;
 	}
 
@@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		curdev = NULL;
 	}
 	clockevents_exchange_device(curdev, newdev);
-	tick_setup_device(td, newdev, cpu, cpumask);
+	tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
 	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
 		tick_oneshot_notify();
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index ce2d723..bb948e5 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -213,9 +213,7 @@ static void start_stack_timers(void)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		cpumask_of_cpu_ptr(new_mask, cpu);
-
-		set_cpus_allowed_ptr(current, new_mask);
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 		start_stack_timer(cpu);
 	}
 	set_cpus_allowed_ptr(current, &saved_mask);
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index c4381d9..0f8fc22 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -11,7 +11,6 @@ notrace unsigned int debug_smp_processor_id(void)
 {
 	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
-	cpumask_of_cpu_ptr_declare(this_mask);
 
 	if (likely(preempt_count))
 		goto out;
@@ -23,9 +22,7 @@ notrace unsigned int debug_smp_processor_id(void)
 	 * Kernel threads bound to a single CPU can safely use
 	 * smp_processor_id():
 	 */
-	cpumask_of_cpu_ptr_next(this_mask, this_cpu);
-
-	if (cpus_equal(current->cpus_allowed, *this_mask))
+	if (cpus_equal(current->cpus_allowed, cpumask_of_cpu(this_cpu)))
 		goto out;
 
 	/*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 835d274..5a32cb7 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -310,8 +310,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
 	switch (m->mode) {
 	case SVC_POOL_PERCPU:
 	{
-		cpumask_of_cpu_ptr(cpumask, node);
-		set_cpus_allowed_ptr(task, cpumask);
+		set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
 		break;
 	}
 	case SVC_POOL_PERNODE:

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 19:06 [git pull] cpus4096 fixes Ingo Molnar
@ 2008-07-27 20:15 ` Linus Torvalds
  2008-07-27 21:03   ` Ingo Molnar
                     ` (2 more replies)
  2008-07-28  0:53 ` Rusty Russell
  1 sibling, 3 replies; 43+ messages in thread
From: Linus Torvalds @ 2008-07-27 20:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linux Kernel Mailing List, Andrew Morton, Mike Travis, Rusty Russell



On Sun, 27 Jul 2008, Ingo Molnar wrote:
> 
> Please pull the latest cpus4096-fixes git tree from:

No. Not without explanations.

Quite frankly, this "fix" looks like a huge stinking pile of sh*t.

I can't follow that thread on lkml.org (horrible web interface with 
hard-to-follow threading), and I'm too lazy to bother to look in my lkml 
email archives, but whoever said

  "The simple version is just a static array of [NR_CPUS] cpumask_t's."

and then implemented this piece of shit is a complete and utter moron.

I'm sorry, but guys, I really expect people to have better taste than 
this, and also expect people to be able to _think_ better than this. 

Am I right, and all you want is NR_CPU constant bitmasks that have just a 
single big set in each (for that single CPU)?

And I further right, adn you are so STUPID that you cannot see that you 
can share all the zero words?

In other words, on a 64-bit architecture, you only ever need 64 of these 
arrays - with a different bit set in ONE SINGLE WORD (with enough zero 
words around it so that you can create any bitmask by just offsetting in 
that big array). And then you just put enough zeroes around it that you 
can point _every_single_cpumask_ to be one of those things.

So when you have 4k CPU's, instead of having 4k arrays (of 4k bits each, 
with one bit set in each array - 2MB memory total), you have exactly 64 
arrays instead, each 8k bits in size (64kB total). 

And then you just point cpumask(n) to the right position (which you can 
calculate dynamically). Once you have the right arrays, getting 
"cpumask(n)" ends up being something like

	static const cpumask_t *cpumask_of_cpu(int cpu)
	{
		/* Get the array with the right bit set */
		unsigned long *p = array[cpu % BITS_PER_LONG];

		/* Offset it so that it's in the right word */
		p += (NR_CPUS-n)/BITS_PER_LONG;

		/* Return it as a cpumask_t */
		return (cpumask_t) p;
	}

And once you're not being a total idiot about wasting memory that is just 
filled with a single bit in various different places, you don't need all 
those games to re-create the arrays in some dense format, because they're 
already going to be dense enough. If you compile a kernel for up to 4k 
CPU's, "wasting" that 64kB of memory is a non-issue (especially since by 
doing this "overlapping" trick you probbaly get better cache behaviour 
anyway).

Ok, so now that I've insulted you and your pets (they're ugly!), show me 
wrong, and then call me a d*ckhead. ("Linus - you're a d*ckhead, and you 
didn't understand the problem, so you're a _stupid_ d*ckhead. And my 
pet may be ugly, but yours _smells_ bad!").

Or say "Uh, yeah, we're morons, and here's the much better patch, and we 
won't do that again".

			Linus

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 20:15 ` Linus Torvalds
@ 2008-07-27 21:03   ` Ingo Molnar
  2008-07-28 18:42     ` Mike Travis
  2008-07-27 21:05   ` Al Viro
  2008-07-28  0:42   ` Rusty Russell
  2 siblings, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-27 21:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Andrew Morton, Mike Travis, Rusty Russell


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, 27 Jul 2008, Ingo Molnar wrote:
> > 
> > Please pull the latest cpus4096-fixes git tree from:
> 
> No. Not without explanations.
> 
> Quite frankly, this "fix" looks like a huge stinking pile of sh*t.

Agreed :-/

NUMA-locality might have been a valid argument in favor of the massive 
array of constant cpumasks (common usage is to use it for the current 
cpu), if it wasnt all stupidly allocated on the boot node:

        cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
        for (i = 0; i < nr_cpu_ids; i++)
                cpu_set(i, cpumask_of_cpu_map[i]);

> And I further right, and you are so STUPID that you cannot see that 
> you can share all the zero words?

That's fair to say :-/ I cannot talk for the others but it certainly 
didnt occur to me and i should have caught it.

( To me it didnt occur because we rarely have the opportunity for such 
  rather clever optimizations, in most cases there's really just two 
  choices in practice: either maximally global, or maximally per cpu. 
  Here we can do something inbetween and overlap. I dont think we do any 
  comparable compression of constants in the kernel. That boxed in my
  imagination i guess. )

Sorry about this - please ignore these patches, we'll rework them along 
the lines you suggest. Your suggested data structure will indeed be both 
simpler, smaller and more cache-efficient.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 20:15 ` Linus Torvalds
  2008-07-27 21:03   ` Ingo Molnar
@ 2008-07-27 21:05   ` Al Viro
  2008-07-27 22:17     ` Linus Torvalds
  2008-07-28  0:42   ` Rusty Russell
  2 siblings, 1 reply; 43+ messages in thread
From: Al Viro @ 2008-07-27 21:05 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Mike Travis, Rusty Russell

On Sun, Jul 27, 2008 at 01:15:26PM -0700, Linus Torvalds wrote:

> So when you have 4k CPU's, instead of having 4k arrays (of 4k bits each, 
> with one bit set in each array - 2MB memory total), you have exactly 64 
> arrays instead, each 8k bits in size (64kB total). 

> And once you're not being a total idiot about wasting memory that is just 
> filled with a single bit in various different places, you don't need all 
> those games to re-create the arrays in some dense format, because they're 
> already going to be dense enough. If you compile a kernel for up to 4k 
> CPU's, "wasting" that 64kB of memory is a non-issue (especially since by 
> doing this "overlapping" trick you probbaly get better cache behaviour 
> anyway).
> 
> Ok, so now that I've insulted you and your pets (they're ugly!), show me 
> wrong, and then call me a d*ckhead. ("Linus - you're a d*ckhead, and you 
> didn't understand the problem, so you're a _stupid_ d*ckhead. And my 
> pet may be ugly, but yours _smells_ bad!").
> 
> Or say "Uh, yeah, we're morons, and here's the much better patch, and we 
> won't do that again".

ITYM "one 32.5kB array" -
(u64[65][64]){[1][0] = 1, [2][0] = 2, [3][0] = 4, ..., [64][0] = 1ULL<<63}
would work just fine.  You were saying...?


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 21:05   ` Al Viro
@ 2008-07-27 22:17     ` Linus Torvalds
  0 siblings, 0 replies; 43+ messages in thread
From: Linus Torvalds @ 2008-07-27 22:17 UTC (permalink / raw)
  To: Al Viro
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Mike Travis, Rusty Russell



On Sun, 27 Jul 2008, Al Viro wrote:
> 
> ITYM "one 32.5kB array" -
> (u64[65][64]){[1][0] = 1, [2][0] = 2, [3][0] = 4, ..., [64][0] = 1ULL<<63}
> would work just fine.  You were saying...?

Yeah, you can optimize it even more, I agree. But even the _trivial_ one 
gets you to linear memory use (rather than something that is O(n^2) in 
number of CPU's) and makes the thing a non-issue. But yes, there's another 
almost-factor-of-two that you can get by being clever.

But in fact, I think your optimization would not just use less memory, but 
yes, it's also easier to write the initializer for (with just a couple of 
levels of macros to avoid having to do 64 entries by hand).

So yes, make it so. Please.

		Linus



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 20:15 ` Linus Torvalds
  2008-07-27 21:03   ` Ingo Molnar
  2008-07-27 21:05   ` Al Viro
@ 2008-07-28  0:42   ` Rusty Russell
  2008-07-28  3:06     ` Andrew Morton
                       ` (3 more replies)
  2 siblings, 4 replies; 43+ messages in thread
From: Rusty Russell @ 2008-07-28  0:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton, Mike Travis

On Monday 28 July 2008 06:15:26 Linus Torvalds wrote:
> On Sun, 27 Jul 2008, Ingo Molnar wrote:
> > Please pull the latest cpus4096-fixes git tree from:
>
> No. Not without explanations.
>
> Quite frankly, this "fix" looks like a huge stinking pile of sh*t.
>
> I can't follow that thread on lkml.org (horrible web interface with
> hard-to-follow threading), and I'm too lazy to bother to look in my lkml
> email archives, but whoever said
>
>   "The simple version is just a static array of [NR_CPUS] cpumask_t's."
>
> and then implemented this piece of shit is a complete and utter moron.

How awesome, Linus is talking about me!

Your idea is clever, thanks.  This patch was a bandaid to save us from the 
unbearably fugly "cpumask_of_cpu_ptr_declare(cpu_mask)".  I was going to rip 
this code out as soon as possible, but with your trick we should keep it.

The 4k CPU patches have been sliding in without review up until now.  We need 
a serious think about how to handle cpumask_t that doesn't fit on the stack.  

Hey, since Mike put his Signed-off-by above mine and didn't put a From line 
when he took my patch, WFT am I taking responsibility? :)

Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 19:06 [git pull] cpus4096 fixes Ingo Molnar
  2008-07-27 20:15 ` Linus Torvalds
@ 2008-07-28  0:53 ` Rusty Russell
  2008-07-28  8:16   ` Ingo Molnar
  2008-07-28  8:43   ` Ingo Molnar
  1 sibling, 2 replies; 43+ messages in thread
From: Rusty Russell @ 2008-07-28  0:53 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, Andrew Morton, Mike Travis

On Monday 28 July 2008 05:06:01 Ingo Molnar wrote:
> Linus,
>
> Please pull the latest cpus4096-fixes git tree from:
>
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
> cpus4096
>
> this fixes the cpumask_of_cpu API fallout described here:
>
>    http://lkml.org/lkml/2008/7/23/76
>
> ... and the fix is wider than i'd like it to be, so close to -rc1

Sorry, it's wider because you pushed the last stupid Mike patch.  Over my 
(obviously too-polite) objections.  Most of this is reverting that, just 
without actually admitting it.

> - but 
> it's the cleanest one and it has Rusty's ack as well.

Not really.  As authored, I intended it as a bandaid for systems with small 
CPU numbers.  Mike made it always on (tho __initdata on large x86 systems), 
which IMHO is insane (2MB of initdata?).

Mike: I now think the right long-term answer is Linus' dense cpumap idea + a 
convenience allocator for cpumasks.  We sweep the kernel for all on-stack 
vars and replace them with one or the other.  Thoughts?

Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  0:42   ` Rusty Russell
@ 2008-07-28  3:06     ` Andrew Morton
  2008-07-28  6:34       ` Rusty Russell
  2008-07-28  8:33     ` Ingo Molnar
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 43+ messages in thread
From: Andrew Morton @ 2008-07-28  3:06 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Ingo Molnar, Linux Kernel Mailing List, Mike Travis

On Mon, 28 Jul 2008 10:42:12 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> The 4k CPU patches have been sliding in without review up until now.

wot?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  3:06     ` Andrew Morton
@ 2008-07-28  6:34       ` Rusty Russell
  2008-07-28  6:58         ` Nick Piggin
                           ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Rusty Russell @ 2008-07-28  6:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, Ingo Molnar, Linux Kernel Mailing List, Mike Travis

On Monday 28 July 2008 13:06:36 Andrew Morton wrote:
> On Mon, 28 Jul 2008 10:42:12 +1000 Rusty Russell <rusty@rustcorp.com.au> 
wrote:
> > The 4k CPU patches have been sliding in without review up until now.
>
> wot?

This surprises you?  I stumbled across the cpumask_of_cpu() bug because I 
happened to want it for stop_machine and read the damned code.  But it lead 
me to the surrounding code, which is pretty questionable.  An arch-specific 
map, rather than depending on NR_CPUS?  Adding set_cpus_allowed_ptr() instead 
of changing set_cpus_allowed()?  Macros which declare things and may or may 
not do an allocation/free?  Finally a patch so horrifically ugly that it 
can't be ignored any more gets all the way to Linus.

Overall, it seems like an attempt to sneak in gradual workarounds for cpumasks 
on the stack, rather than a coherent plan.  I understand the temptation to 
avoid an "are we prepared to pay this price for large NR_CPUS?" discussion, 
but we need it anyway.

And that's what I call "review".
Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  6:34       ` Rusty Russell
@ 2008-07-28  6:58         ` Nick Piggin
  2008-07-28  7:56         ` Ingo Molnar
  2008-07-28 18:12         ` Mike Travis
  2 siblings, 0 replies; 43+ messages in thread
From: Nick Piggin @ 2008-07-28  6:58 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Linus Torvalds, Ingo Molnar,
	Linux Kernel Mailing List, Mike Travis

On Monday 28 July 2008 16:34, Rusty Russell wrote:
> On Monday 28 July 2008 13:06:36 Andrew Morton wrote:
> > On Mon, 28 Jul 2008 10:42:12 +1000 Rusty Russell <rusty@rustcorp.com.au>
>
> wrote:
> > > The 4k CPU patches have been sliding in without review up until now.
> >
> > wot?
>
> This surprises you?  I stumbled across the cpumask_of_cpu() bug because I
> happened to want it for stop_machine and read the damned code.  But it lead
> me to the surrounding code, which is pretty questionable.  An arch-specific
> map, rather than depending on NR_CPUS?  Adding set_cpus_allowed_ptr()
> instead of changing set_cpus_allowed()?  Macros which declare things and
> may or may not do an allocation/free?  Finally a patch so horrifically ugly
> that it can't be ignored any more gets all the way to Linus.
>
> Overall, it seems like an attempt to sneak in gradual workarounds for
> cpumasks on the stack, rather than a coherent plan.  I understand the
> temptation to avoid an "are we prepared to pay this price for large
> NR_CPUS?" discussion, but we need it anyway.
>
> And that's what I call "review".

Well, I'm not talking about this case specifically, but lots of
people I see these days tend to "review" code without doing a lot
of high level critical thinking (is the overall design any good,
could it be better or completely different, are the tradeoffs
worthwhile etc).

Don't get me wrong, the lower level spot-the-bug and coding style
etc reviews are indespensible too...

Unfortunately the development process isn't exactly conducive to
that higher level kind of review... there are pros and cons to
this, it's not all bad...


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  6:34       ` Rusty Russell
  2008-07-28  6:58         ` Nick Piggin
@ 2008-07-28  7:56         ` Ingo Molnar
  2008-07-28 18:12         ` Mike Travis
  2 siblings, 0 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28  7:56 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Linus Torvalds, Linux Kernel Mailing List, Mike Travis


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> On Monday 28 July 2008 13:06:36 Andrew Morton wrote:
> > On Mon, 28 Jul 2008 10:42:12 +1000 Rusty Russell <rusty@rustcorp.com.au> 
> wrote:
> > > The 4k CPU patches have been sliding in without review up until now.
> >
> > wot?
> 
> This surprises you? [...]

you should check many of the earliest iterations (it's all on lkml), and 
the bits we rejected in review/testing. You'll be surprised how much 
questionable and fragile stuff was filtered out.

But your intuition is right in a sense, this whole topic _feels_ ugly, 
and there's a good reason for it and i doubt you'll like it:

Much of it derives from the ugly fact that cpumasks were designed to be 
word-size-ish and are used as such in hundreds of places in the kernel, 
while with 4K CPUs they become half a _kilobyte_.

That causes the basic conceptual friction. That fundamental unease is 
what caused me to split these patches off into a completly separate 
topic, so that they can be NAK-ed individually without blocking other 
subsystem changes. Mike will be able to tell you how many bits were 
rejected and rewritten - it's been one of the most iterated topics.

Unless you know some good way around that basic "0.5K cpumask" problem 
[besides the 'dont try to do it at all then, stupid' solution] Mike's 
painful year-long, multi-release, all-on-lkml effort to bootstrap a 4K 
CPUs kernel, to track down dozens of early boot crashes, to look at 
stack sizes in zillions of functions, to write a ton of patches to 
evolve the APIs to cope with it better (all of this was done out in the 
open on lkml for all to see) looks like quite close to what _can_ be 
done.

128/256/512/1024 CPU support (which has been upstream for years and 
built into enterprise distros, etc.) already turned cpumasks into rather 
static objects in practice and their proliferation into hotpaths stopped 
- so maybe we could just turn them into non-stack objects from now on.

( with perhaps some nice wrappers that turns then into on-stack objects
  to not slow down the common case. Mike tried to do something like 
  that. )

Help and more cleanup patches welcome. Mike & co did most of the hard 
work already, latest -git does boot with 4K cpus built into the kernel. 
We can iterate this stuff a _lot_ easier now. Turn on CONFIG_MAXSMP=y on 
x86 and you can boot it on your PC.

> [...]  I stumbled across the cpumask_of_cpu() bug because I happened 
> to want it for stop_machine and read the damned code.  But it lead me 
> to the surrounding code, which is pretty questionable.  An 
> arch-specific map, rather than depending on NR_CPUS?  Adding 
> set_cpus_allowed_ptr() instead of changing set_cpus_allowed()? [...]

the set_cpus_allowed_ptr() change too was done due to review feedback, 
to reduce the friction with other tree, to make for smoother migration. 
Breaking an existing API is a far too rude technique for a long-lived 
topic like this. (it's been going on for nearly a year or so)

> [...] Macros which declare things and may or may not do an 
> allocation/free?  Finally a patch so horrifically ugly that it can't 
> be ignored any more gets all the way to Linus.

[ hey, is that your suggested solution you are talking about? ;-) ]

> Overall, it seems like an attempt to sneak in gradual workarounds for 
> cpumasks on the stack, rather than a coherent plan.  I understand the 
> temptation to avoid an "are we prepared to pay this price for large 
> NR_CPUS?" discussion, but we need it anyway.

sure. From a practical standpoint 4096 CPUs support looks pretty stable 
and functional. I boot a 4K cpus kernel every couple of minutes:

 config-Sun_Jul_27_09_15_47_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_09_27_00_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_09_29_39_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_09_36_41_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_09_40_22_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_09_59_33_CEST_2008.good:CONFIG_MAXSMP=y

 config-Sun_Jul_27_22_14_47_CEST_2008.good:CONFIG_NR_CPUS=8
 config-Sun_Jul_27_22_20_09_CEST_2008.good:CONFIG_NR_CPUS=8
 config-Sun_Jul_27_22_25_32_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_22_25_32_CEST_2008.good:CONFIG_NR_CPUS=4096
 config-Sun_Jul_27_22_36_52_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_22_36_52_CEST_2008.good:CONFIG_NR_CPUS=4096
 config-Sun_Jul_27_22_42_19_CEST_2008.good:CONFIG_MAXSMP=y
 config-Sun_Jul_27_22_42_19_CEST_2008.good:CONFIG_NR_CPUS=4096
 config-Sun_Jul_27_22_47_28_CEST_2008.good:CONFIG_NR_CPUS=32
 config-Sun_Jul_27_22_52_47_CEST_2008.good:CONFIG_NR_CPUS=32
 config-Sun_Jul_27_22_57_59_CEST_2008.good:CONFIG_NR_CPUS=32

The last difficult regression has been months ago. So this stuff is 
hackable in practice and you can try out the end result if you are 
interested in it.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  0:53 ` Rusty Russell
@ 2008-07-28  8:16   ` Ingo Molnar
  2008-07-28 13:21     ` Rusty Russell
  2008-07-28  8:43   ` Ingo Molnar
  1 sibling, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28  8:16 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Linus Torvalds, linux-kernel, Andrew Morton, Mike Travis


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> Mike: I now think the right long-term answer is Linus' dense cpumap 
> idea + a convenience allocator for cpumasks.  We sweep the kernel for 
> all on-stack vars and replace them with one or the other.  Thoughts?

The dense cpumap for constant cpumasks is OK as it's clever, compact and 
static.

All-dynamic allocator for on-stack cpumasks ... is a less obvious 
choice.

I dont fundamentally oppose it, but doing that we might be throwing the 
baby out with the bathwater. In many cases having 0.5K on the kernel 
stack is OK. (especially for high-level functions.)

We could do all-dynamic allocations, and it's simpler in many ways (less 
ugly dual-use initialization macros), but it's intrusive for the common 
case. I kind of liked Mike's approach to push the dynamic overhead to 
the less common case - although i dont like all of the end result. 
That's been the driving principle behind the existing patches: dont hurt 
the common case.

Anyway, Mike's solution gives us the option to go in both directions now 
as we've essentially annotated the most difficult on-stack users and 
added a dynamic alloc/free path to them and tested it. Making them all 
dynamic will be easy as it just gets rid of the small-mask 
special-cases.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  0:42   ` Rusty Russell
  2008-07-28  3:06     ` Andrew Morton
@ 2008-07-28  8:33     ` Ingo Molnar
  2008-07-28 18:07       ` Mike Travis
  2008-07-28 17:50     ` Mike Travis
  2008-07-28 18:46     ` [git pull] cpus4096 fixes Mike Travis
  3 siblings, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28  8:33 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Mike Travis


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> Hey, since Mike put his Signed-off-by above mine and didn't put a From 
> line when he took my patch, WFT am I taking responsibility? :)

hm, Mike, please dont do that. You changed Rusty's patch but the way we 
do that is to preserve the original change and then do a delta patch. If 
you dont want to split a commit you should keep the original From and 
mention your own changes in the changelog. I know this was all in the 
rush to meet -rc1, but still.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  0:53 ` Rusty Russell
  2008-07-28  8:16   ` Ingo Molnar
@ 2008-07-28  8:43   ` Ingo Molnar
  1 sibling, 0 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28  8:43 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Linus Torvalds, linux-kernel, Andrew Morton, Mike Travis


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> On Monday 28 July 2008 05:06:01 Ingo Molnar wrote:
> > Linus,
> >
> > Please pull the latest cpus4096-fixes git tree from:
> >
> >    git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
> > cpus4096
> >
> > this fixes the cpumask_of_cpu API fallout described here:
> >
> >    http://lkml.org/lkml/2008/7/23/76
> >
> > ... and the fix is wider than i'd like it to be, so close to -rc1
> 
> Sorry, it's wider because you pushed the last stupid Mike patch.  Over my 
> (obviously too-polite) objections.  Most of this is reverting that, just 
> without actually admitting it.
> 
> > - but it's the cleanest one and it has Rusty's ack as well.
> 
> Not really.  As authored, I intended it as a bandaid for systems with 
> small CPU numbers.  Mike made it always on (tho __initdata on large 
> x86 systems), which IMHO is insane (2MB of initdata?).

well, firstly, please understand that nobody is (and the least i am) 
blaming you for anything. I was the one who picked up the patch from 
Mike and sent it Linuswards so i'm to blame and it's me where the buck 
stops.

I have picked up a patch that you wrote originally that solved a problem 
that was first noticed due to crossing commits in the middle of -rc1, by 
drivers/net/sfc/efx.c aa6ef27ea9 which was authored 10 days ago and sent 
upstream 4 days after that - so there was no linux-next exposure where 
we could have noticed it. The cpumask API changes were in the works for 
months literally, and in linux-next for a long time.
 
That particular cpumask API breakage was dormant (read: nobody in-tree 
used the API as an lvalue up to commit aa6ef27ea9) and only got 
unearthed by that crossing change that made such (valid) use of the API. 
I'd have delayed it all and would have waited for the discussion to play 
out, hadnt it been for that (small) build breakage.
   
The 4 commits which were objected to by Linus were literally scrambled 
together in haste within 4 days, to solve that lvalue problem. And i 
Cc:-ed you on the pull request. If you disagree with or are unsure about 
patches you send out then please do not put a Signed-off-by into them 
but a Not-Yet-Signed-off-by. I _will_ notice that - and even if i dont 
notice it, my scripts will ;)

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  8:16   ` Ingo Molnar
@ 2008-07-28 13:21     ` Rusty Russell
  2008-07-28 18:23       ` Mike Travis
  2008-07-31 10:30       ` Ingo Molnar
  0 siblings, 2 replies; 43+ messages in thread
From: Rusty Russell @ 2008-07-28 13:21 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, Andrew Morton, Mike Travis

On Monday 28 July 2008 18:16:39 Ingo Molnar wrote:
> * Rusty Russell <rusty@rustcorp.com.au> wrote:
> > Mike: I now think the right long-term answer is Linus' dense cpumap
> > idea + a convenience allocator for cpumasks.  We sweep the kernel for
> > all on-stack vars and replace them with one or the other.  Thoughts?
>
> The dense cpumap for constant cpumasks is OK as it's clever, compact and
> static.
>
> All-dynamic allocator for on-stack cpumasks ... is a less obvious
> choice.

Sorry, I was unclear.  "long-term" == "more than 4096 CPUs", since I thought 
that was Mike's aim.  If we only want to hack up 4k CPUS and stop, then I 
understand the current approach.

If we want huge cpu numbers, I think cpumask_alloc/free gives the clearest 
code.  So our approach is backwards: let's do that *then* put ugly hacks in 
if it's really too slow.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  0:42   ` Rusty Russell
  2008-07-28  3:06     ` Andrew Morton
  2008-07-28  8:33     ` Ingo Molnar
@ 2008-07-28 17:50     ` Mike Travis
  2008-07-28 18:32       ` Linus Torvalds
  2008-07-28 18:46     ` [git pull] cpus4096 fixes Mike Travis
  3 siblings, 1 reply; 43+ messages in thread
From: Mike Travis @ 2008-07-28 17:50 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Ingo Molnar, Linux Kernel Mailing List, Andrew Morton

Rusty Russell wrote:
> On Monday 28 July 2008 06:15:26 Linus Torvalds wrote:
>> On Sun, 27 Jul 2008, Ingo Molnar wrote:
>>> Please pull the latest cpus4096-fixes git tree from:
>> No. Not without explanations.
>>
>> Quite frankly, this "fix" looks like a huge stinking pile of sh*t.
>>
>> I can't follow that thread on lkml.org (horrible web interface with
>> hard-to-follow threading), and I'm too lazy to bother to look in my lkml
>> email archives, but whoever said
>>
>>   "The simple version is just a static array of [NR_CPUS] cpumask_t's."
>>
>> and then implemented this piece of shit is a complete and utter moron.
> 
> How awesome, Linus is talking about me!
> 
> Your idea is clever, thanks.  This patch was a bandaid to save us from the 
> unbearably fugly "cpumask_of_cpu_ptr_declare(cpu_mask)".  I was going to rip 
> this code out as soon as possible, but with your trick we should keep it.
> 
> The 4k CPU patches have been sliding in without review up until now.  We need 
> a serious think about how to handle cpumask_t that doesn't fit on the stack.  
> 
> Hey, since Mike put his Signed-off-by above mine and didn't put a From line 
> when he took my patch, WFT am I taking responsibility? :)
> 
> Rusty.

Sorry, I didn't know that was the protocol.  And yes, the clever idea of
compacting the memory is a good one (wish I would have thought of it... ;-)
But, and it's a big but, if you really have 4096 cpus present (not NR_CPUS,
but nr_cpu_ids), then 2MB is pretty much chump change.

But I'll redo the patch again.  And look for more space saving areas.  As
for the goal of these patches, it was to lessen the impact as much as possible
on smaller systems so that the distros can set the default NR_CPUS=4096 in
their binary (validated, certified, licensable) kernels.  This is a huge
priority to our customers.  Please keep telling me where I'm going wrong at
this.

Thanks for the comments and feedback.
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  8:33     ` Ingo Molnar
@ 2008-07-28 18:07       ` Mike Travis
  0 siblings, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 18:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Rusty Russell, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton

Ingo Molnar wrote:
> * Rusty Russell <rusty@rustcorp.com.au> wrote:
> 
>> Hey, since Mike put his Signed-off-by above mine and didn't put a From 
>> line when he took my patch, WFT am I taking responsibility? :)
> 
> hm, Mike, please dont do that. You changed Rusty's patch but the way we 
> do that is to preserve the original change and then do a delta patch. If 
> you dont want to split a commit you should keep the original From and 
> mention your own changes in the changelog. I know this was all in the 
> rush to meet -rc1, but still.
> 
> 	Ingo


Yes, sorry, as I mentioned I wasn't aware of the correct thing to do.  And
as you've mentioned, a lot of this has been under pressure because 2.6.27
will be the kernel that the distros use when this system comes online.  So
time is the one thing we (SGI) do not have.  That's not to say that doing it
wrong is ok, it's just that sometimes what I might think is trivial (ie.,
builds and functions correctly) might not be to others, and I welcome comments,
suggestions, reviews, criticism, ideas, whatever.  As I've mentioned I am
new to this "arena" of development.

And again, thanks for the comments and feedback, it has been an enlightening
10 months. ;-)

Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  6:34       ` Rusty Russell
  2008-07-28  6:58         ` Nick Piggin
  2008-07-28  7:56         ` Ingo Molnar
@ 2008-07-28 18:12         ` Mike Travis
  2 siblings, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 18:12 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Linus Torvalds, Ingo Molnar, Linux Kernel Mailing List

Rusty Russell wrote:
> On Monday 28 July 2008 13:06:36 Andrew Morton wrote:
>> On Mon, 28 Jul 2008 10:42:12 +1000 Rusty Russell <rusty@rustcorp.com.au> 
> wrote:
>>> The 4k CPU patches have been sliding in without review up until now.
>> wot?
> 
> This surprises you?  I stumbled across the cpumask_of_cpu() bug because I 
> happened to want it for stop_machine and read the damned code.  But it lead 
> me to the surrounding code, which is pretty questionable.  An arch-specific 
> map, rather than depending on NR_CPUS?  Adding set_cpus_allowed_ptr() instead 
> of changing set_cpus_allowed()?  Macros which declare things and may or may 
> not do an allocation/free?  Finally a patch so horrifically ugly that it 
> can't be ignored any more gets all the way to Linus.
> 
> Overall, it seems like an attempt to sneak in gradual workarounds for cpumasks 
> on the stack, rather than a coherent plan.  I understand the temptation to 
> avoid an "are we prepared to pay this price for large NR_CPUS?" discussion, 
> but we need it anyway.
> 
> And that's what I call "review".
> Rusty.


I'm not sure I can respond to all, but some of this was brought up in discussions
previously, and I always took the advice and objections that came up.  I don't
think anything went in that wasn't (at least in general) agreed upon by those that
reviewed any of my changes.  If I did some things wrong, I apologize and I'll take
full blame ("rookie mistakes?" ;-).

Thanks,
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 13:21     ` Rusty Russell
@ 2008-07-28 18:23       ` Mike Travis
  2008-07-31 10:30       ` Ingo Molnar
  1 sibling, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 18:23 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, Linus Torvalds, linux-kernel, Andrew Morton

Rusty Russell wrote:
> On Monday 28 July 2008 18:16:39 Ingo Molnar wrote:
>> * Rusty Russell <rusty@rustcorp.com.au> wrote:
>>> Mike: I now think the right long-term answer is Linus' dense cpumap
>>> idea + a convenience allocator for cpumasks.  We sweep the kernel for
>>> all on-stack vars and replace them with one or the other.  Thoughts?
>> The dense cpumap for constant cpumasks is OK as it's clever, compact and
>> static.
>>
>> All-dynamic allocator for on-stack cpumasks ... is a less obvious
>> choice.
> 
> Sorry, I was unclear.  "long-term" == "more than 4096 CPUs", since I thought 
> that was Mike's aim.  If we only want to hack up 4k CPUS and stop, then I 
> understand the current approach.
> 
> If we want huge cpu numbers, I think cpumask_alloc/free gives the clearest 
> code.  So our approach is backwards: let's do that *then* put ugly hacks in 
> if it's really too slow.
> 
> Cheers,
> Rusty.

Well, yes, "long-term" is not really that long and the system will be capable
of supporting 16k cpus.  With the limit on clock scalability, core count is
going through the roof.  Fortunately, we have a whole new release cycle to
rethink some basic ideas.

I did bring up a number of suggestions on how to replace cpumask_t, but they
all seemed to hamper small systems in one way or another.  And the goal, again
was to minimize impact for 99.99% of the systems that won't have a thousand
or more cpus.  (Though it only takes 8 Larrabee chips to attain that.)

Thanks,
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 17:50     ` Mike Travis
@ 2008-07-28 18:32       ` Linus Torvalds
  2008-07-28 18:37         ` Linus Torvalds
                           ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Linus Torvalds @ 2008-07-28 18:32 UTC (permalink / raw)
  To: Mike Travis
  Cc: Rusty Russell, Ingo Molnar, Linux Kernel Mailing List,
	Andrew Morton, Al Viro



On Mon, 28 Jul 2008, Mike Travis wrote:
> 
> Sorry, I didn't know that was the protocol.  And yes, the clever idea of
> compacting the memory is a good one (wish I would have thought of it... ;-)
> But, and it's a big but, if you really have 4096 cpus present (not NR_CPUS,
> but nr_cpu_ids), then 2MB is pretty much chump change.

Umm. Yes, it's chump change, but if you compile a kernel to be generic, 
and you actually only have a few CPU's, it's no longer chump change.

> But I'll redo the patch again.

Here's a trivial setup, that is even tested. It's _small_ too.

	/* cpu_bit_bitmap[0] is empty - so we can back into it */
	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
	#if BITS_PER_LONG > 32
		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
	#endif
	};

	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
	{
		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
		p -= nr / BITS_PER_LONG;
		return (const cpumask_t *)p;
	}

that should be all you need to do.

Honesty in advertizing: my "testing" was some trivial user-space harness, 
maybe I had some bug in it. But at least it's not _horribly_ wrong.

And yes, this has the added optimization from Viro of overlapping the 
cpumask_t's internally too, rather than making them twice the size. So 
with 4096 CPU's, this should result 32.5kB of static const data.

			Linus

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 18:32       ` Linus Torvalds
@ 2008-07-28 18:37         ` Linus Torvalds
  2008-07-28 18:51           ` Ingo Molnar
  2008-07-28 19:04         ` Mike Travis
  2008-07-28 20:57         ` [rfc git pull] cpus4096 fixes, take 2 Ingo Molnar
  2 siblings, 1 reply; 43+ messages in thread
From: Linus Torvalds @ 2008-07-28 18:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Rusty Russell, Mike Travis, Linux Kernel Mailing List,
	Andrew Morton, Al Viro



On Mon, 28 Jul 2008, Linus Torvalds wrote:
> 
> Here's a trivial setup, that is even tested. It's _small_ too.
> 
> 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
> 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
> 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
> 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
> 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
> 
> 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
> 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
> 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
> 	#if BITS_PER_LONG > 32
> 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
> 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
> 	#endif
> 	};
> 
> 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
> 	{
> 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
> 		p -= nr / BITS_PER_LONG;
> 		return (const cpumask_t *)p;
> 	}

Btw, Ingo, can we get this issue resolved asap, please?

I was planning on doing -rc1 today, but this kind of hangs over me. The 
above three lines of code (and more lines of macro initializers) obviously 
does need some more testing, but if you hook it into the other changes you 
already had, maybe we can get it done.

Hmm?

			Linus

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-27 21:03   ` Ingo Molnar
@ 2008-07-28 18:42     ` Mike Travis
  0 siblings, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 18:42 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Rusty Russell

Ingo Molnar wrote:
...
> NUMA-locality might have been a valid argument in favor of the massive 
> array of constant cpumasks (common usage is to use it for the current 
> cpu), if it wasnt all stupidly allocated on the boot node:
> 
>         cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
>         for (i = 0; i < nr_cpu_ids; i++)
>                 cpu_set(i, cpumask_of_cpu_map[i]);

The alternate method was for each cpu to contribute a cpumask_of_cpu
for itself in the percpu area.  I don't recall why that idea was rejected.

But yes, the set_cpus_allowed is the biggest receiver of cpumask_t's with
a single bit set.  I had also proposed a different interface allowing just
that, but I think it was rejected as being redundant.

Thanks,
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28  0:42   ` Rusty Russell
                       ` (2 preceding siblings ...)
  2008-07-28 17:50     ` Mike Travis
@ 2008-07-28 18:46     ` Mike Travis
  2008-07-28 19:13       ` Ingo Molnar
  2008-07-29  1:33       ` Rusty Russell
  3 siblings, 2 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 18:46 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Ingo Molnar, Linux Kernel Mailing List, Andrew Morton

Rusty Russell wrote:
> On Monday 28 July 2008 06:15:26 Linus Torvalds wrote:
>> On Sun, 27 Jul 2008, Ingo Molnar wrote:
>>> Please pull the latest cpus4096-fixes git tree from:
>> No. Not without explanations.
>>
>> Quite frankly, this "fix" looks like a huge stinking pile of sh*t.
>>
>> I can't follow that thread on lkml.org (horrible web interface with
>> hard-to-follow threading), and I'm too lazy to bother to look in my lkml
>> email archives, but whoever said
>>
>>   "The simple version is just a static array of [NR_CPUS] cpumask_t's."
>>
>> and then implemented this piece of shit is a complete and utter moron.
> 
> How awesome, Linus is talking about me!
> 
> Your idea is clever, thanks.  This patch was a bandaid to save us from the 
> unbearably fugly "cpumask_of_cpu_ptr_declare(cpu_mask)".  I was going to rip 
> this code out as soon as possible, but with your trick we should keep it.
> 
> The 4k CPU patches have been sliding in without review up until now.  We need 
> a serious think about how to handle cpumask_t that doesn't fit on the stack.  
> 
> Hey, since Mike put his Signed-off-by above mine and didn't put a From line 
> when he took my patch, WFT am I taking responsibility? :)
> 
> Rusty.


Just so I don't make the same mistake, this:


Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>


Should have been this:

From: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>

?

Thanks,
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 18:37         ` Linus Torvalds
@ 2008-07-28 18:51           ` Ingo Molnar
  2008-07-28 19:22             ` Mike Travis
  0 siblings, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 18:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Rusty Russell, Mike Travis, Linux Kernel Mailing List,
	Andrew Morton, Al Viro


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Mon, 28 Jul 2008, Linus Torvalds wrote:
> > 
> > Here's a trivial setup, that is even tested. It's _small_ too.
> > 
> > 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
> > 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
> > 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
> > 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
> > 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
> > 
> > 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
> > 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
> > 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
> > 	#if BITS_PER_LONG > 32
> > 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
> > 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
> > 	#endif
> > 	};
> > 
> > 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
> > 	{
> > 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
> > 		p -= nr / BITS_PER_LONG;
> > 		return (const cpumask_t *)p;
> > 	}
> 
> Btw, Ingo, can we get this issue resolved asap, please?
> 
> I was planning on doing -rc1 today, but this kind of hangs over me. 
> The above three lines of code (and more lines of macro initializers) 
> obviously does need some more testing, but if you hook it into the 
> other changes you already had, maybe we can get it done.
> 
> Hmm?

yeah, i'm on it. If everything goes well hopefully i'll have something 
pullable in 1-2 hours, if that's ok as a timeframe.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 18:32       ` Linus Torvalds
  2008-07-28 18:37         ` Linus Torvalds
@ 2008-07-28 19:04         ` Mike Travis
  2008-07-28 20:57         ` [rfc git pull] cpus4096 fixes, take 2 Ingo Molnar
  2 siblings, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 19:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Rusty Russell, Ingo Molnar, Linux Kernel Mailing List,
	Andrew Morton, Al Viro

Linus Torvalds wrote:
> 
> On Mon, 28 Jul 2008, Mike Travis wrote:
>> Sorry, I didn't know that was the protocol.  And yes, the clever idea of
>> compacting the memory is a good one (wish I would have thought of it... ;-)
>> But, and it's a big but, if you really have 4096 cpus present (not NR_CPUS,
>> but nr_cpu_ids), then 2MB is pretty much chump change.
> 
> Umm. Yes, it's chump change, but if you compile a kernel to be generic, 
> and you actually only have a few CPU's, it's no longer chump change.

The 2Mb's of initdata is released, I just meant that if you really have 4k
cpus in the system, you'll probably have 4k * [2 .. 32 Gb (or more?)] of
memory.  The Nahalem memory limit is (iirc) 44 bits.

Originally, I only had the constant for cpu(0) but since it _was_ originally
a constant (alibi, rvalue only), then it might be thought that it's valid to
use any cpu# before setup_per_cpu_areas is called.

> 
>> But I'll redo the patch again.
> 
> Here's a trivial setup, that is even tested. It's _small_ too.
> 
> 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
> 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
> 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
> 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
> 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
> 
> 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
> 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
> 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
> 	#if BITS_PER_LONG > 32
> 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
> 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
> 	#endif
> 	};
> 
> 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
> 	{
> 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
> 		p -= nr / BITS_PER_LONG;
> 		return (const cpumask_t *)p;
> 	}
> 
> that should be all you need to do.

Very cool, thanks!!

> 
> Honesty in advertizing: my "testing" was some trivial user-space harness, 
> maybe I had some bug in it. But at least it's not _horribly_ wrong.
> 
> And yes, this has the added optimization from Viro of overlapping the 
> cpumask_t's internally too, rather than making them twice the size. So 
> with 4096 CPU's, this should result 32.5kB of static const data.
> 
> 			Linus

Don't worry, I'll beat it to death... ;-)  [and try not to screw up the
acknowledgments this time... ;-)]

Thanks again,
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 18:46     ` [git pull] cpus4096 fixes Mike Travis
@ 2008-07-28 19:13       ` Ingo Molnar
  2008-07-29  1:33       ` Rusty Russell
  1 sibling, 0 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 19:13 UTC (permalink / raw)
  To: Mike Travis
  Cc: Rusty Russell, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton


* Mike Travis <travis@sgi.com> wrote:

> Just so I don't make the same mistake, this:
> 
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Mike Travis <travis@sgi.com>
> 
> 
> Should have been this:
> 
> From: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Mike Travis <travis@sgi.com>
> 
> ?

yes:

  email-header:
    From: Mike Travis <travis@sgi.com>
  email-body:
    From: Rusty Russell <rusty@rustcorp.com.au>
    [...]
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Mike Travis <travis@sgi.com>

gets turned into such a commit:

    Author: Rusty Russell <rusty@rustcorp.com.au>
    [...]
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Mike Travis <travis@sgi.com>

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 18:51           ` Ingo Molnar
@ 2008-07-28 19:22             ` Mike Travis
  2008-07-28 19:31               ` Mike Travis
  0 siblings, 1 reply; 43+ messages in thread
From: Mike Travis @ 2008-07-28 19:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro

Ingo Molnar wrote:
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>>
>> On Mon, 28 Jul 2008, Linus Torvalds wrote:
>>> Here's a trivial setup, that is even tested. It's _small_ too.
>>>
>>> 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
>>> 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
>>> 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
>>> 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
>>> 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
>>>
>>> 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
>>> 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
>>> 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
>>> 	#if BITS_PER_LONG > 32
>>> 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
>>> 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
>>> 	#endif
>>> 	};
>>>
>>> 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
>>> 	{
>>> 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
>>> 		p -= nr / BITS_PER_LONG;
>>> 		return (const cpumask_t *)p;
>>> 	}
>> Btw, Ingo, can we get this issue resolved asap, please?
>>
>> I was planning on doing -rc1 today, but this kind of hangs over me. 
>> The above three lines of code (and more lines of macro initializers) 
>> obviously does need some more testing, but if you hook it into the 
>> other changes you already had, maybe we can get it done.
>>
>> Hmm?
> 
> yeah, i'm on it. If everything goes well hopefully i'll have something 
> pullable in 1-2 hours, if that's ok as a timeframe.
> 
> 	Ingo

One problem is that the current api is cpumask_t cpumask_of_cpu(x), so
all ref's would have to be changed.  (Unless there's some clever way of
defining "&cpumask_of_cpu(x)" to be get_cpu_mask(x) ...?)

Do you want me to do these changes?

Thanks,
Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 19:22             ` Mike Travis
@ 2008-07-28 19:31               ` Mike Travis
  0 siblings, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 19:31 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro

Mike Travis wrote:
> Ingo Molnar wrote:
>> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>
>>> On Mon, 28 Jul 2008, Linus Torvalds wrote:
>>>> Here's a trivial setup, that is even tested. It's _small_ too.
>>>>
>>>> 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
>>>> 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
>>>> 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
>>>> 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
>>>> 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
>>>>
>>>> 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
>>>> 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
>>>> 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
>>>> 	#if BITS_PER_LONG > 32
>>>> 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
>>>> 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
>>>> 	#endif
>>>> 	};
>>>>
>>>> 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
>>>> 	{
>>>> 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
>>>> 		p -= nr / BITS_PER_LONG;
>>>> 		return (const cpumask_t *)p;
>>>> 	}
>>> Btw, Ingo, can we get this issue resolved asap, please?
>>>
>>> I was planning on doing -rc1 today, but this kind of hangs over me. 
>>> The above three lines of code (and more lines of macro initializers) 
>>> obviously does need some more testing, but if you hook it into the 
>>> other changes you already had, maybe we can get it done.
>>>
>>> Hmm?
>> yeah, i'm on it. If everything goes well hopefully i'll have something 
>> pullable in 1-2 hours, if that's ok as a timeframe.
>>
>> 	Ingo
> 
> One problem is that the current api is cpumask_t cpumask_of_cpu(x), so
> all ref's would have to be changed.  (Unless there's some clever way of
> defining "&cpumask_of_cpu(x)" to be get_cpu_mask(x) ...?)
> 
> Do you want me to do these changes?
> 
> Thanks,
> Mike

Please ignore my stupidity...  I was not thinking very clearly.

Mike

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [rfc git pull] cpus4096 fixes, take 2
  2008-07-28 18:32       ` Linus Torvalds
  2008-07-28 18:37         ` Linus Torvalds
  2008-07-28 19:04         ` Mike Travis
@ 2008-07-28 20:57         ` Ingo Molnar
  2008-07-28 21:35           ` Ingo Molnar
                             ` (2 more replies)
  2 siblings, 3 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 20:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mike Travis, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> > But I'll redo the patch again.
> 
> Here's a trivial setup, that is even tested. It's _small_ too.
> 
> 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
> 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
> 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
> 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
> 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
> 
> 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
> 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
> 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
> 	#if BITS_PER_LONG > 32
> 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
> 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
> 	#endif
> 	};
> 
> 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
> 	{
> 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
> 		p -= nr / BITS_PER_LONG;
> 		return (const cpumask_t *)p;
> 	}
> 
> that should be all you need to do.
> 
> Honesty in advertizing: my "testing" was some trivial user-space 
> harness, maybe I had some bug in it. But at least it's not _horribly_ 
> wrong.

Amazing! Your code, once plugged into the kernel proper, booted fine on 
5 different x86 testsystems, it booted fine an allyesconfig kernel with 
MAXSMP and NR_CPUS=4096, it booted fine on allnoconfig as well (and 
allmodconfig and on a good number of randconfigs as well).

> And yes, this has the added optimization from Viro of overlapping the 
> cpumask_t's internally too, rather than making them twice the size. So 
> with 4096 CPU's, this should result 32.5kB of static const data.

What do you think about the commit below?

I've put your fix into:

  # e56b3bc: cpu masks: optimize and clean up cpumask_of_cpu()

It's quoted fully below after the pull request diffstat. Have i missed 
anything obvious?

What i'm unsure about are other architectures. (Small detail i just 
noticed: HAVE_CPUMASK_OF_CPU_MAP is now orphaned in arch/x86/Kconfig, 
will get rid of that in a separate change, i dont want to upset the test 
results.)

( And ... because v1 of your code was so frustratingly and
  mind-blowingly stable in testing (breaking a long track record of v1 
  patches in this area of kernel), and because the perfect patch does 
  not exist by definition, i thought i'd mention that after a long 
  search i found and fixed a serious showstopper bug in your code: you 
  used "1ul" in your macros, instead of the more proper "1UL" style. The
  ratio between the use of 1ul versus 1UL is 1:30 in the tree, so your 
  choice of integer literals type suffix capitalization was deemed 
  un-Linuxish, and was fixed up for good. )

	Ingo

---------------->
Linus,

Please pull the latest cpus4096 git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git cpus4096

 Thanks,

	Ingo

------------------>
Ingo Molnar (1):
      cpumask: export cpumask_of_cpu_map

Linus Torvalds (1):
      cpu masks: optimize and clean up cpumask_of_cpu()

Mike Travis (3):
      cpumask: make cpumask_of_cpu_map generic
      cpumask: put cpumask_of_cpu_map in the initdata section
      cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu


 arch/x86/kernel/acpi/cstate.c                    |    3 +-
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c       |   10 +---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c        |   15 ++----
 arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c |   12 ++---
 arch/x86/kernel/cpu/cpufreq/speedstep-ich.c      |    3 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c            |    3 +-
 arch/x86/kernel/ldt.c                            |    6 +--
 arch/x86/kernel/microcode.c                      |   17 ++----
 arch/x86/kernel/reboot.c                         |   11 +---
 arch/x86/kernel/setup_percpu.c                   |   21 -------
 drivers/acpi/processor_throttling.c              |   11 +---
 drivers/firmware/dcdbas.c                        |    3 +-
 drivers/misc/sgi-xp/xpc_main.c                   |    3 +-
 include/linux/cpumask.h                          |   63 ++++++++-------------
 kernel/cpu.c                                     |   25 +++++++++
 kernel/stop_machine.c                            |    3 +-
 kernel/time/tick-common.c                        |    8 +--
 kernel/trace/trace_sysprof.c                     |    4 +-
 lib/smp_processor_id.c                           |    5 +--
 net/sunrpc/svc.c                                 |    3 +-
 20 files changed, 86 insertions(+), 143 deletions(-)

---->

  # e56b3bc: cpu masks: optimize and clean up cpumask_of_cpu()

>From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 28 Jul 2008 11:32:33 -0700
Subject: [PATCH] cpu masks: optimize and clean up cpumask_of_cpu()

Clean up and optimize cpumask_of_cpu(), by sharing all the zero words.

Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns
creating a huge array of constant bitmasks, realize that the zero words
can be shared.

In other words, on a 64-bit architecture, we only ever need 64 of these
arrays - with a different bit set in one single world (with enough zero
words around it so that we can create any bitmask by just offsetting in
that big array). And then we just put enough zeroes around it that we
can point every single cpumask to be one of those things.

So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each,
with one bit set in each array - 2MB memory total), we have exactly 64
arrays instead, each 8k bits in size (64kB total).

And then we just point cpumask(n) to the right position (which we can
calculate dynamically). Once we have the right arrays, getting
"cpumask(n)" ends up being:

  static inline const cpumask_t *get_cpu_mask(unsigned int cpu)
  {
          const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
          p -= cpu / BITS_PER_LONG;
          return (const cpumask_t *)p;
  }

This brings other advantages and simplifications as well:

 - we are not wasting memory that is just filled with a single bit in
   various different places

 - we don't need all those games to re-create the arrays in some dense
   format, because they're already going to be dense enough.

if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory
is a non-issue (especially since by doing this "overlapping" trick we
probably get better cache behaviour anyway).

[ mingo@elte.hu:

  Converted Linus's mails into a commit. See:

     http://lkml.org/lkml/2008/7/27/156
     http://lkml.org/lkml/2008/7/28/320

  Also applied a family filter - which also has the side-effect of leaving
  out the bits where Linus calls me an idio... Oh, never mind ;-)
]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c |   23 -------
 include/linux/cpumask.h        |   26 +++++++-
 kernel/cpu.c                   |  128 ++++++---------------------------------
 3 files changed, 43 insertions(+), 134 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 1cd53df..76e305e 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,26 +80,6 @@ static void __init setup_per_cpu_maps(void)
 #endif
 }
 
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-/*
- * Replace static cpumask_of_cpu_map in the initdata section,
- * with one that's allocated sized by the possible number of cpus.
- *
- * (requires nr_cpu_ids to be initialized)
- */
-static void __init setup_cpumask_of_cpu(void)
-{
-	int i;
-
-	/* alloc_bootmem zeroes memory */
-	cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
-	for (i = 0; i < nr_cpu_ids; i++)
-		cpu_set(i, cpumask_of_cpu_map[i]);
-}
-#else
-static inline void setup_cpumask_of_cpu(void) { }
-#endif
-
 #ifdef CONFIG_X86_32
 /*
  * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -199,9 +179,6 @@ void __init setup_per_cpu_areas(void)
 
 	/* Setup node to cpumask map */
 	setup_node_to_cpumask_map();
-
-	/* Setup cpumask_of_cpu map */
-	setup_cpumask_of_cpu();
 }
 
 #endif
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 8fa3b6d..96d0509 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -265,10 +265,30 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
 }
 
+/*
+ * Special-case data structure for "single bit set only" constant CPU masks.
+ *
+ * We pre-generate all the 64 (or 32) possible bit positions, with enough
+ * padding to the left and the right, and return the constant pointer
+ * appropriately offset.
+ */
+extern const unsigned long
+	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
+
+static inline const cpumask_t *get_cpu_mask(unsigned int cpu)
+{
+	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
+	p -= cpu / BITS_PER_LONG;
+	return (const cpumask_t *)p;
+}
+
+/*
+ * In cases where we take the address of the cpumask immediately,
+ * gcc optimizes it out (it's a constant) and there's no huge stack
+ * variable created:
+ */
+#define cpumask_of_cpu(cpu) ({ *get_cpu_mask(cpu); })
 
-/* cpumask_of_cpu_map[] is in kernel/cpu.c */
-extern const cpumask_t *cpumask_of_cpu_map;
-#define cpumask_of_cpu(cpu)	(cpumask_of_cpu_map[cpu])
 
 #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a35d899..06a8358 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,115 +462,27 @@ out:
 
 #endif /* CONFIG_SMP */
 
-/* 64 bits of zeros, for initializers. */
-#if BITS_PER_LONG == 32
-#define Z64 0, 0
-#else
-#define Z64 0
-#endif
+/*
+ * cpu_bit_bitmap[] is a special, "compressed" data structure that
+ * represents all NR_CPUS bits binary values of 1<<nr.
+ *
+ * It is used by cpumask_of_cpu() to get a constant address to a CPU
+ * mask value that has a single bit set only.
+ */
 
-/* Initializer macros. */
-#define CMI0(n) { .bits = { 1UL << (n) } }
-#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } }
-
-#define CMI8(n, ...)						\
-	CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__),		\
-	CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__),	\
-	CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__),	\
-	CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__)
-
-#if BITS_PER_LONG == 32
-#define CMI64(n, ...)							\
-	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
-	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
-	CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__),	\
-	CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__)
-#else
-#define CMI64(n, ...)							\
-	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
-	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
-	CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__),	\
-	CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__)
-#endif
+/* cpu_bit_bitmap[0] is empty - so we can back into it */
+#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
+#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
+#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
+#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
 
-#define CMI256(n, ...)							\
-	CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__),	\
-	CMI64((n)+128, Z64, Z64, __VA_ARGS__),				\
-	CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__)
-#define Z256 Z64, Z64, Z64, Z64
-
-#define CMI1024(n, ...)					\
-	CMI256((n), __VA_ARGS__),			\
-	CMI256((n)+256, Z256, __VA_ARGS__),		\
-	CMI256((n)+512, Z256, Z256, __VA_ARGS__),	\
-	CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__)
-#define Z1024 Z256, Z256, Z256, Z256
-
-/* We want this statically initialized, just to be safe.  We try not
- * to waste too much space, either. */
-static const cpumask_t cpumask_map[]
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-__initdata
-#endif
-= {
-	CMI0(0), CMI0(1), CMI0(2), CMI0(3),
-#if NR_CPUS > 4
-	CMI0(4), CMI0(5), CMI0(6), CMI0(7),
-#endif
-#if NR_CPUS > 8
-	CMI0(8), CMI0(9), CMI0(10), CMI0(11),
-	CMI0(12), CMI0(13), CMI0(14), CMI0(15),
-#endif
-#if NR_CPUS > 16
-	CMI0(16), CMI0(17), CMI0(18), CMI0(19),
-	CMI0(20), CMI0(21), CMI0(22), CMI0(23),
-	CMI0(24), CMI0(25), CMI0(26), CMI0(27),
-	CMI0(28), CMI0(29), CMI0(30), CMI0(31),
-#endif
-#if NR_CPUS > 32
-#if BITS_PER_LONG == 32
-	CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0),
-	CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0),
-	CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0),
-	CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0),
-	CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0),
-	CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0),
-	CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0),
-	CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0),
-#else
-	CMI0(32), CMI0(33), CMI0(34), CMI0(35),
-	CMI0(36), CMI0(37), CMI0(38), CMI0(39),
-	CMI0(40), CMI0(41), CMI0(42), CMI0(43),
-	CMI0(44), CMI0(45), CMI0(46), CMI0(47),
-	CMI0(48), CMI0(49), CMI0(50), CMI0(51),
-	CMI0(52), CMI0(53), CMI0(54), CMI0(55),
-	CMI0(56), CMI0(57), CMI0(58), CMI0(59),
-	CMI0(60), CMI0(61), CMI0(62), CMI0(63),
-#endif /* BITS_PER_LONG == 64 */
-#endif
-#if NR_CPUS > 64
-	CMI64(64, Z64),
-#endif
-#if NR_CPUS > 128
-	CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64),
-#endif
-#if NR_CPUS > 256
-	CMI256(256, Z256),
-#endif
-#if NR_CPUS > 512
-	CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256),
-#endif
-#if NR_CPUS > 1024
-	CMI1024(1024, Z1024),
-#endif
-#if NR_CPUS > 2048
-	CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024),
-#endif
-#if NR_CPUS > 4096
-#error NR_CPUS too big.  Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
+
+	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
+	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
+#if BITS_PER_LONG > 32
+	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
+	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
 #endif
 };
-
-const cpumask_t *cpumask_of_cpu_map = cpumask_map;
-
-EXPORT_SYMBOL_GPL(cpumask_of_cpu_map);
+EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [rfc git pull] cpus4096 fixes, take 2
  2008-07-28 20:57         ` [rfc git pull] cpus4096 fixes, take 2 Ingo Molnar
@ 2008-07-28 21:35           ` Ingo Molnar
  2008-07-28 21:41             ` [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network' Ingo Molnar
  2008-07-28 21:36           ` [rfc git pull] cpus4096 fixes, take 2 Mike Travis
  2008-07-29  1:45           ` Rusty Russell
  2 siblings, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 21:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mike Travis, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro


* Ingo Molnar <mingo@elte.hu> wrote:

> Linus,
> 
> Please pull the latest cpus4096 git tree from:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git cpus4096

This has a trivial conflict with the latest tree you just pushed out. 
I've pushed the resolution out into the cpus4096-v2 branch, which you 
can pull from:

  git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git cpus4096-v2

all testsystems are still OK.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [rfc git pull] cpus4096 fixes, take 2
  2008-07-28 20:57         ` [rfc git pull] cpus4096 fixes, take 2 Ingo Molnar
  2008-07-28 21:35           ` Ingo Molnar
@ 2008-07-28 21:36           ` Mike Travis
  2008-07-29  1:45           ` Rusty Russell
  2 siblings, 0 replies; 43+ messages in thread
From: Mike Travis @ 2008-07-28 21:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro

Great work, thanks to both Linus and you!  (And of course Rusty who noticed the
original failure.)

I've also tested a few more configs incl both NR_CPUS=4096 & !SMP 

Acked-by: Mike Travis <travis@sgi.com>

Ingo Molnar wrote:
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>>> But I'll redo the patch again.
>> Here's a trivial setup, that is even tested. It's _small_ too.
>>
>> 	/* cpu_bit_bitmap[0] is empty - so we can back into it */
>> 	#define MASK_DECLARE_1(x) [x+1][0] = 1ul << (x)
>> 	#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
>> 	#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
>> 	#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
>>
>> 	static const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
>> 		MASK_DECLARE_8(0), MASK_DECLARE_8(8),
>> 		MASK_DECLARE_8(16), MASK_DECLARE_8(24),
>> 	#if BITS_PER_LONG > 32
>> 		MASK_DECLARE_8(32), MASK_DECLARE_8(40),
>> 		MASK_DECLARE_8(48), MASK_DECLARE_8(56),
>> 	#endif
>> 	};
>>
>> 	static inline const cpumask_t *get_cpu_mask(unsigned int nr)
>> 	{
>> 		const unsigned long *p = cpu_bit_bitmap[1 + nr % BITS_PER_LONG];
>> 		p -= nr / BITS_PER_LONG;
>> 		return (const cpumask_t *)p;
>> 	}
>>
>> that should be all you need to do.
>>
>> Honesty in advertizing: my "testing" was some trivial user-space 
>> harness, maybe I had some bug in it. But at least it's not _horribly_ 
>> wrong.
> 
> Amazing! Your code, once plugged into the kernel proper, booted fine on 
> 5 different x86 testsystems, it booted fine an allyesconfig kernel with 
> MAXSMP and NR_CPUS=4096, it booted fine on allnoconfig as well (and 
> allmodconfig and on a good number of randconfigs as well).
> 
>> And yes, this has the added optimization from Viro of overlapping the 
>> cpumask_t's internally too, rather than making them twice the size. So 
>> with 4096 CPU's, this should result 32.5kB of static const data.
> 
> What do you think about the commit below?
> 
> I've put your fix into:
> 
>   # e56b3bc: cpu masks: optimize and clean up cpumask_of_cpu()
> 
> It's quoted fully below after the pull request diffstat. Have i missed 
> anything obvious?
> 
> What i'm unsure about are other architectures. (Small detail i just 
> noticed: HAVE_CPUMASK_OF_CPU_MAP is now orphaned in arch/x86/Kconfig, 
> will get rid of that in a separate change, i dont want to upset the test 
> results.)
> 
> ( And ... because v1 of your code was so frustratingly and
>   mind-blowingly stable in testing (breaking a long track record of v1 
>   patches in this area of kernel), and because the perfect patch does 
>   not exist by definition, i thought i'd mention that after a long 
>   search i found and fixed a serious showstopper bug in your code: you 
>   used "1ul" in your macros, instead of the more proper "1UL" style. The
>   ratio between the use of 1ul versus 1UL is 1:30 in the tree, so your 
>   choice of integer literals type suffix capitalization was deemed 
>   un-Linuxish, and was fixed up for good. )
> 
> 	Ingo
> 
> ---------------->
> Linus,
> 
> Please pull the latest cpus4096 git tree from:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git cpus4096
> 
>  Thanks,
> 
> 	Ingo
> 
> ------------------>
> Ingo Molnar (1):
>       cpumask: export cpumask_of_cpu_map
> 
> Linus Torvalds (1):
>       cpu masks: optimize and clean up cpumask_of_cpu()
> 
> Mike Travis (3):
>       cpumask: make cpumask_of_cpu_map generic
>       cpumask: put cpumask_of_cpu_map in the initdata section
>       cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu
> 
> 
>  arch/x86/kernel/acpi/cstate.c                    |    3 +-
>  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c       |   10 +---
>  arch/x86/kernel/cpu/cpufreq/powernow-k8.c        |   15 ++----
>  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c |   12 ++---
>  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c      |    3 +-
>  arch/x86/kernel/cpu/intel_cacheinfo.c            |    3 +-
>  arch/x86/kernel/ldt.c                            |    6 +--
>  arch/x86/kernel/microcode.c                      |   17 ++----
>  arch/x86/kernel/reboot.c                         |   11 +---
>  arch/x86/kernel/setup_percpu.c                   |   21 -------
>  drivers/acpi/processor_throttling.c              |   11 +---
>  drivers/firmware/dcdbas.c                        |    3 +-
>  drivers/misc/sgi-xp/xpc_main.c                   |    3 +-
>  include/linux/cpumask.h                          |   63 ++++++++-------------
>  kernel/cpu.c                                     |   25 +++++++++
>  kernel/stop_machine.c                            |    3 +-
>  kernel/time/tick-common.c                        |    8 +--
>  kernel/trace/trace_sysprof.c                     |    4 +-
>  lib/smp_processor_id.c                           |    5 +--
>  net/sunrpc/svc.c                                 |    3 +-
>  20 files changed, 86 insertions(+), 143 deletions(-)
> 
> ---->
> 
>   # e56b3bc: cpu masks: optimize and clean up cpumask_of_cpu()
> 
>>From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Mon, 28 Jul 2008 11:32:33 -0700
> Subject: [PATCH] cpu masks: optimize and clean up cpumask_of_cpu()
> 
> Clean up and optimize cpumask_of_cpu(), by sharing all the zero words.
> 
> Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns
> creating a huge array of constant bitmasks, realize that the zero words
> can be shared.
> 
> In other words, on a 64-bit architecture, we only ever need 64 of these
> arrays - with a different bit set in one single world (with enough zero
> words around it so that we can create any bitmask by just offsetting in
> that big array). And then we just put enough zeroes around it that we
> can point every single cpumask to be one of those things.
> 
> So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each,
> with one bit set in each array - 2MB memory total), we have exactly 64
> arrays instead, each 8k bits in size (64kB total).
> 
> And then we just point cpumask(n) to the right position (which we can
> calculate dynamically). Once we have the right arrays, getting
> "cpumask(n)" ends up being:
> 
>   static inline const cpumask_t *get_cpu_mask(unsigned int cpu)
>   {
>           const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
>           p -= cpu / BITS_PER_LONG;
>           return (const cpumask_t *)p;
>   }
> 
> This brings other advantages and simplifications as well:
> 
>  - we are not wasting memory that is just filled with a single bit in
>    various different places
> 
>  - we don't need all those games to re-create the arrays in some dense
>    format, because they're already going to be dense enough.
> 
> if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory
> is a non-issue (especially since by doing this "overlapping" trick we
> probably get better cache behaviour anyway).
> 
> [ mingo@elte.hu:
> 
>   Converted Linus's mails into a commit. See:
> 
>      http://lkml.org/lkml/2008/7/27/156
>      http://lkml.org/lkml/2008/7/28/320
> 
>   Also applied a family filter - which also has the side-effect of leaving
>   out the bits where Linus calls me an idio... Oh, never mind ;-)
> ]
> 
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Al Viro <viro@ZenIV.linux.org.uk>
> Cc: Mike Travis <travis@sgi.com>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> ---
>  arch/x86/kernel/setup_percpu.c |   23 -------
>  include/linux/cpumask.h        |   26 +++++++-
>  kernel/cpu.c                   |  128 ++++++---------------------------------
>  3 files changed, 43 insertions(+), 134 deletions(-)
> 
> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> index 1cd53df..76e305e 100644
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -80,26 +80,6 @@ static void __init setup_per_cpu_maps(void)
>  #endif
>  }
>  
> -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
> -/*
> - * Replace static cpumask_of_cpu_map in the initdata section,
> - * with one that's allocated sized by the possible number of cpus.
> - *
> - * (requires nr_cpu_ids to be initialized)
> - */
> -static void __init setup_cpumask_of_cpu(void)
> -{
> -	int i;
> -
> -	/* alloc_bootmem zeroes memory */
> -	cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
> -	for (i = 0; i < nr_cpu_ids; i++)
> -		cpu_set(i, cpumask_of_cpu_map[i]);
> -}
> -#else
> -static inline void setup_cpumask_of_cpu(void) { }
> -#endif
> -
>  #ifdef CONFIG_X86_32
>  /*
>   * Great future not-so-futuristic plan: make i386 and x86_64 do it
> @@ -199,9 +179,6 @@ void __init setup_per_cpu_areas(void)
>  
>  	/* Setup node to cpumask map */
>  	setup_node_to_cpumask_map();
> -
> -	/* Setup cpumask_of_cpu map */
> -	setup_cpumask_of_cpu();
>  }
>  
>  #endif
> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> index 8fa3b6d..96d0509 100644
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -265,10 +265,30 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
>  	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
>  }
>  
> +/*
> + * Special-case data structure for "single bit set only" constant CPU masks.
> + *
> + * We pre-generate all the 64 (or 32) possible bit positions, with enough
> + * padding to the left and the right, and return the constant pointer
> + * appropriately offset.
> + */
> +extern const unsigned long
> +	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
> +
> +static inline const cpumask_t *get_cpu_mask(unsigned int cpu)
> +{
> +	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
> +	p -= cpu / BITS_PER_LONG;
> +	return (const cpumask_t *)p;
> +}
> +
> +/*
> + * In cases where we take the address of the cpumask immediately,
> + * gcc optimizes it out (it's a constant) and there's no huge stack
> + * variable created:
> + */
> +#define cpumask_of_cpu(cpu) ({ *get_cpu_mask(cpu); })
>  
> -/* cpumask_of_cpu_map[] is in kernel/cpu.c */
> -extern const cpumask_t *cpumask_of_cpu_map;
> -#define cpumask_of_cpu(cpu)	(cpumask_of_cpu_map[cpu])
>  
>  #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
>  
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index a35d899..06a8358 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -462,115 +462,27 @@ out:
>  
>  #endif /* CONFIG_SMP */
>  
> -/* 64 bits of zeros, for initializers. */
> -#if BITS_PER_LONG == 32
> -#define Z64 0, 0
> -#else
> -#define Z64 0
> -#endif
> +/*
> + * cpu_bit_bitmap[] is a special, "compressed" data structure that
> + * represents all NR_CPUS bits binary values of 1<<nr.
> + *
> + * It is used by cpumask_of_cpu() to get a constant address to a CPU
> + * mask value that has a single bit set only.
> + */
>  
> -/* Initializer macros. */
> -#define CMI0(n) { .bits = { 1UL << (n) } }
> -#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } }
> -
> -#define CMI8(n, ...)						\
> -	CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__),		\
> -	CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__),	\
> -	CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__),	\
> -	CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__)
> -
> -#if BITS_PER_LONG == 32
> -#define CMI64(n, ...)							\
> -	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
> -	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
> -	CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__),	\
> -	CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__)
> -#else
> -#define CMI64(n, ...)							\
> -	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
> -	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
> -	CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__),	\
> -	CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__)
> -#endif
> +/* cpu_bit_bitmap[0] is empty - so we can back into it */
> +#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
> +#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
> +#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
> +#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
>  
> -#define CMI256(n, ...)							\
> -	CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__),	\
> -	CMI64((n)+128, Z64, Z64, __VA_ARGS__),				\
> -	CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__)
> -#define Z256 Z64, Z64, Z64, Z64
> -
> -#define CMI1024(n, ...)					\
> -	CMI256((n), __VA_ARGS__),			\
> -	CMI256((n)+256, Z256, __VA_ARGS__),		\
> -	CMI256((n)+512, Z256, Z256, __VA_ARGS__),	\
> -	CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__)
> -#define Z1024 Z256, Z256, Z256, Z256
> -
> -/* We want this statically initialized, just to be safe.  We try not
> - * to waste too much space, either. */
> -static const cpumask_t cpumask_map[]
> -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
> -__initdata
> -#endif
> -= {
> -	CMI0(0), CMI0(1), CMI0(2), CMI0(3),
> -#if NR_CPUS > 4
> -	CMI0(4), CMI0(5), CMI0(6), CMI0(7),
> -#endif
> -#if NR_CPUS > 8
> -	CMI0(8), CMI0(9), CMI0(10), CMI0(11),
> -	CMI0(12), CMI0(13), CMI0(14), CMI0(15),
> -#endif
> -#if NR_CPUS > 16
> -	CMI0(16), CMI0(17), CMI0(18), CMI0(19),
> -	CMI0(20), CMI0(21), CMI0(22), CMI0(23),
> -	CMI0(24), CMI0(25), CMI0(26), CMI0(27),
> -	CMI0(28), CMI0(29), CMI0(30), CMI0(31),
> -#endif
> -#if NR_CPUS > 32
> -#if BITS_PER_LONG == 32
> -	CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0),
> -	CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0),
> -	CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0),
> -	CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0),
> -	CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0),
> -	CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0),
> -	CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0),
> -	CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0),
> -#else
> -	CMI0(32), CMI0(33), CMI0(34), CMI0(35),
> -	CMI0(36), CMI0(37), CMI0(38), CMI0(39),
> -	CMI0(40), CMI0(41), CMI0(42), CMI0(43),
> -	CMI0(44), CMI0(45), CMI0(46), CMI0(47),
> -	CMI0(48), CMI0(49), CMI0(50), CMI0(51),
> -	CMI0(52), CMI0(53), CMI0(54), CMI0(55),
> -	CMI0(56), CMI0(57), CMI0(58), CMI0(59),
> -	CMI0(60), CMI0(61), CMI0(62), CMI0(63),
> -#endif /* BITS_PER_LONG == 64 */
> -#endif
> -#if NR_CPUS > 64
> -	CMI64(64, Z64),
> -#endif
> -#if NR_CPUS > 128
> -	CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64),
> -#endif
> -#if NR_CPUS > 256
> -	CMI256(256, Z256),
> -#endif
> -#if NR_CPUS > 512
> -	CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256),
> -#endif
> -#if NR_CPUS > 1024
> -	CMI1024(1024, Z1024),
> -#endif
> -#if NR_CPUS > 2048
> -	CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024),
> -#endif
> -#if NR_CPUS > 4096
> -#error NR_CPUS too big.  Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP
> +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
> +
> +	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
> +	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
> +#if BITS_PER_LONG > 32
> +	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
> +	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
>  #endif
>  };
> -
> -const cpumask_t *cpumask_of_cpu_map = cpumask_map;
> -
> -EXPORT_SYMBOL_GPL(cpumask_of_cpu_map);
> +EXPORT_SYMBOL_GPL(cpu_bit_bitmap);


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-28 21:35           ` Ingo Molnar
@ 2008-07-28 21:41             ` Ingo Molnar
  2008-07-28 22:06               ` Ingo Molnar
  2008-07-30 14:59               ` David Sterba
  0 siblings, 2 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 21:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mike Travis, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro, David Sterba, Jiri Kosina


* Ingo Molnar <mingo@elte.hu> wrote:

> This has a trivial conflict with the latest tree you just pushed out. 

btw., if you intend that for -rc1, that tree has a common looking build 
failure:

drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of 
undefined type 'struct ipw_network'

http://redhat.com/~mingo/misc/config-Mon_Jul_28_23_36_20_CEST_2008.bad

probably due to a string of commits to that file from today.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-28 21:41             ` [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network' Ingo Molnar
@ 2008-07-28 22:06               ` Ingo Molnar
  2008-07-28 22:20                 ` Andrew Morton
  2008-07-30 14:59               ` David Sterba
  1 sibling, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 22:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mike Travis, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Al Viro, David Sterba, Jiri Kosina


* Ingo Molnar <mingo@elte.hu> wrote:

> drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of 
> undefined type 'struct ipw_network'
> 
> http://redhat.com/~mingo/misc/config-Mon_Jul_28_23_36_20_CEST_2008.bad
> 
> probably due to a string of commits to that file from today.

hm, has this version of the driver ever been built successfully? Because 
struct ipw_network is defined in network.c [only], and then used in 
hardware.c.

It could be changed to void * if the structure wasnt relied on by:

                        const int min_capacity =
                                ipwireless_ppp_mru(hw->network + 2);

so changing it to void * would break this part of the code.

furthermore, what does that "hw->network + 2" mean? It points into 
la-la-land AFAICS, because it's initialized as:

        struct ipw_network *network =
                kzalloc(sizeof(struct ipw_network), GFP_ATOMIC);

and then written into hw->network via:

        ipwireless_associate_network(hw, network);

it's getting late here, so i might be missing some really obvious 
solution (and Jiri is asleep i suspect), so below is a temporary patch 
that marks the driver CONFIG_BROKEN until this is resolved. This gets 
allyesconfig going on x86.

	Ingo

--------------->
>From 56df93b08edb33310aaf5c17c48be098d134cc07 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 29 Jul 2008 00:06:05 +0200
Subject: [PATCH] ipwireless: disable for now
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit

mark it CONFIG_BROKEN for now, it breaks the build:

drivers/char/pcmcia/ipwireless/hardware.c: In function ‘pool_allocate':
drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type ‘struct ipw_network'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/char/pcmcia/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig
index ffa0efc..8ee7bbd 100644
--- a/drivers/char/pcmcia/Kconfig
+++ b/drivers/char/pcmcia/Kconfig
@@ -46,6 +46,7 @@ config CARDMAN_4040
 config IPWIRELESS
 	tristate "IPWireless 3G UMTS PCMCIA card support"
 	depends on PCMCIA && NETDEVICES
+	depends on BROKEN
 	select PPP
 	help
 	  This is a driver for 3G UMTS PCMCIA card from IPWireless company. In

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-28 22:06               ` Ingo Molnar
@ 2008-07-28 22:20                 ` Andrew Morton
  2008-07-28 22:29                   ` Ingo Molnar
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Morton @ 2008-07-28 22:20 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: torvalds, travis, rusty, linux-kernel, viro, dsterba, jkosina

On Tue, 29 Jul 2008 00:06:31 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Ingo Molnar <mingo@elte.hu> wrote:
> 
> > drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of 
> > undefined type 'struct ipw_network'
> > 
> > http://redhat.com/~mingo/misc/config-Mon_Jul_28_23_36_20_CEST_2008.bad
> > 
> > probably due to a string of commits to that file from today.
> 
> hm, has this version of the driver ever been built successfully? Because 
> struct ipw_network is defined in network.c [only], and then used in 
> hardware.c.
> 
> It could be changed to void * if the structure wasnt relied on by:
> 
>                         const int min_capacity =
>                                 ipwireless_ppp_mru(hw->network + 2);
> 
> so changing it to void * would break this part of the code.
> 
> furthermore, what does that "hw->network + 2" mean? It points into 
> la-la-land AFAICS, because it's initialized as:
> 
>         struct ipw_network *network =
>                 kzalloc(sizeof(struct ipw_network), GFP_ATOMIC);
> 
> and then written into hw->network via:
> 
>         ipwireless_associate_network(hw, network);
> 
> it's getting late here, so i might be missing some really obvious 
> solution (and Jiri is asleep i suspect), so below is a temporary patch 
> that marks the driver CONFIG_BROKEN until this is resolved. This gets 
> allyesconfig going on x86.

I suspect that this:

--- a/drivers/char/pcmcia/ipwireless/hardware.c~a
+++ a/drivers/char/pcmcia/ipwireless/hardware.c
@@ -568,7 +568,7 @@ static struct ipw_rx_packet *pool_alloca
 			list_del(&packet->queue);
 		} else {
 			const int min_capacity =
-				ipwireless_ppp_mru(hw->network + 2);
+				ipwireless_ppp_mru(hw->network) + 2;
 			int new_capacity;
 
 			spin_unlock_irqrestore(&hw->lock, flags);
_

was intended.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-28 22:20                 ` Andrew Morton
@ 2008-07-28 22:29                   ` Ingo Molnar
  0 siblings, 0 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-28 22:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: torvalds, travis, rusty, linux-kernel, viro, dsterba, jkosina


* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Tue, 29 Jul 2008 00:06:31 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > * Ingo Molnar <mingo@elte.hu> wrote:
> > 
> > > drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of 
> > > undefined type 'struct ipw_network'
> > > 
> > > http://redhat.com/~mingo/misc/config-Mon_Jul_28_23_36_20_CEST_2008.bad
> > > 
> > > probably due to a string of commits to that file from today.
> > 
> > hm, has this version of the driver ever been built successfully? Because 
> > struct ipw_network is defined in network.c [only], and then used in 
> > hardware.c.
> > 
> > It could be changed to void * if the structure wasnt relied on by:
> > 
> >                         const int min_capacity =
> >                                 ipwireless_ppp_mru(hw->network + 2);
> > 
> > so changing it to void * would break this part of the code.
> > 
> > furthermore, what does that "hw->network + 2" mean? It points into 
> > la-la-land AFAICS, because it's initialized as:
> > 
> >         struct ipw_network *network =
> >                 kzalloc(sizeof(struct ipw_network), GFP_ATOMIC);
> > 
> > and then written into hw->network via:
> > 
> >         ipwireless_associate_network(hw, network);
> > 
> > it's getting late here, so i might be missing some really obvious 
> > solution (and Jiri is asleep i suspect), so below is a temporary patch 
> > that marks the driver CONFIG_BROKEN until this is resolved. This gets 
> > allyesconfig going on x86.
> 
> I suspect that this:
> 
> --- a/drivers/char/pcmcia/ipwireless/hardware.c~a
> +++ a/drivers/char/pcmcia/ipwireless/hardware.c
> @@ -568,7 +568,7 @@ static struct ipw_rx_packet *pool_alloca
>  			list_del(&packet->queue);
>  		} else {
>  			const int min_capacity =
> -				ipwireless_ppp_mru(hw->network + 2);
> +				ipwireless_ppp_mru(hw->network) + 2;
>  			int new_capacity;
>  
>  			spin_unlock_irqrestore(&hw->lock, flags);
> _
> 
> was intended.

ah, indeed. I see, this came from the 12/12 patch:

-                       static int min_capacity = 256;
+                       const int min_capacity =
+                               ipwireless_ppp_mru(hw->network + 2);

which was probably a last-minute change. So instead of turning the 
driver into something (while the author is sleeping) that was never 
tested before, i'd rather suggest to revert commit 
a01386924874c4d6d67f8a34e66f04452c2abb69, which seems a non-essential 
optimization to the driver and which apparently was not build tested, so 
quite likely not functionality tested either. It reverts cleanly here.

OTOH ... your change looks correct enough too.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 18:46     ` [git pull] cpus4096 fixes Mike Travis
  2008-07-28 19:13       ` Ingo Molnar
@ 2008-07-29  1:33       ` Rusty Russell
  1 sibling, 0 replies; 43+ messages in thread
From: Rusty Russell @ 2008-07-29  1:33 UTC (permalink / raw)
  To: Mike Travis
  Cc: Linus Torvalds, Ingo Molnar, Linux Kernel Mailing List, Andrew Morton

On Tuesday 29 July 2008 04:46:35 Mike Travis wrote:
> Just so I don't make the same mistake, this:
>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Mike Travis <travis@sgi.com>
>
> Should have been this:
>
> From: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Mike Travis <travis@sgi.com>

Well yes, but it's clear you knew Linus would have a flaming hissy fit, so you 
deliberately stepped into the line of fire...

Heroic, really.
Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [rfc git pull] cpus4096 fixes, take 2
  2008-07-28 20:57         ` [rfc git pull] cpus4096 fixes, take 2 Ingo Molnar
  2008-07-28 21:35           ` Ingo Molnar
  2008-07-28 21:36           ` [rfc git pull] cpus4096 fixes, take 2 Mike Travis
@ 2008-07-29  1:45           ` Rusty Russell
  2008-07-29 12:11             ` Ingo Molnar
  2 siblings, 1 reply; 43+ messages in thread
From: Rusty Russell @ 2008-07-29  1:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Mike Travis, Linux Kernel Mailing List,
	Andrew Morton, Al Viro

On Tuesday 29 July 2008 06:57:00 Ingo Molnar wrote:
> +/*
> + * In cases where we take the address of the cpumask immediately,
> + * gcc optimizes it out (it's a constant) and there's no huge stack
> + * variable created:
> + */
> +#define cpumask_of_cpu(cpu) ({ *get_cpu_mask(cpu); })

Why use a statement expression here?  Isn't (*get_cpu_mask(cpu)) sufficient?

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [rfc git pull] cpus4096 fixes, take 2
  2008-07-29  1:45           ` Rusty Russell
@ 2008-07-29 12:11             ` Ingo Molnar
  2008-07-30  0:15               ` Rusty Russell
  0 siblings, 1 reply; 43+ messages in thread
From: Ingo Molnar @ 2008-07-29 12:11 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Mike Travis, Linux Kernel Mailing List,
	Andrew Morton, Al Viro


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> On Tuesday 29 July 2008 06:57:00 Ingo Molnar wrote:
> > +/*
> > + * In cases where we take the address of the cpumask immediately,
> > + * gcc optimizes it out (it's a constant) and there's no huge stack
> > + * variable created:
> > + */
> > +#define cpumask_of_cpu(cpu) ({ *get_cpu_mask(cpu); })
> 
> Why use a statement expression here?  Isn't (*get_cpu_mask(cpu)) 
> sufficient?

Yeah, it's sufficient - no strong reasons - it felt a slightly bit more 
correct to isolate the read-only data structure. Not that it makes any 
real difference in practice - it's still possible to take the address of 
get_cpu_mask() and abuse that - gcc will only issue a warning.

But gcc 3.4.5 apparently craps out on this valid use of a gcc extension, 
see the report and the fix at:

  http://lkml.org/lkml/2008/7/29/154

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [rfc git pull] cpus4096 fixes, take 2
  2008-07-29 12:11             ` Ingo Molnar
@ 2008-07-30  0:15               ` Rusty Russell
  0 siblings, 0 replies; 43+ messages in thread
From: Rusty Russell @ 2008-07-30  0:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Mike Travis, Linux Kernel Mailing List,
	Andrew Morton, Al Viro

On Tuesday 29 July 2008 22:11:56 Ingo Molnar wrote:
> But gcc 3.4.5 apparently craps out on this valid use of a gcc extension,
> see the report and the fix at:

Hmm.  I can't find anything in the docs, but I'd assumed that a statement 
expression isn't an lvalue.  Trying "int x; &({ x; });" fails as expected 
with gcc 3.4, 4.1, 4.2 and 4.3:

	foo.c:3: error: lvalue required as unary ‘&’ operand

I'd say your code should, too...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-28 21:41             ` [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network' Ingo Molnar
  2008-07-28 22:06               ` Ingo Molnar
@ 2008-07-30 14:59               ` David Sterba
  2008-07-30 15:11                 ` James Bottomley
  1 sibling, 1 reply; 43+ messages in thread
From: David Sterba @ 2008-07-30 14:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Mike Travis, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Al Viro, David Sterba,
	Jiri Kosina, James.Bottomley

Hi,

I'm sorry for the build error and thank you all for prompt fix.

It was not a last-moment change, the patches were waiting a
few weeks for the merge window. Hm, I probably did not test this
one, will do better next time.

Dave


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-30 14:59               ` David Sterba
@ 2008-07-30 15:11                 ` James Bottomley
  2008-07-30 15:14                   ` Jiri Kosina
  0 siblings, 1 reply; 43+ messages in thread
From: James Bottomley @ 2008-07-30 15:11 UTC (permalink / raw)
  To: dsterba
  Cc: Ingo Molnar, Linus Torvalds, Mike Travis, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Al Viro, Jiri Kosina

On Wed, 2008-07-30 at 16:59 +0200, David Sterba wrote:
> Hi,
> 
> I'm sorry for the build error and thank you all for prompt fix.

You're welcome, But ...

> It was not a last-moment change, the patches were waiting a
> few weeks for the merge window. Hm, I probably did not test this
> one, will do better next time.

I see the patches came in via a straight email sequence.  They really
need to be in a tree somewhere that's pulled into linux-next so this
type of thing gets noticed.  Since they're wireless patches, why not
send them through John Linville's wireless tree?

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network'
  2008-07-30 15:11                 ` James Bottomley
@ 2008-07-30 15:14                   ` Jiri Kosina
  0 siblings, 0 replies; 43+ messages in thread
From: Jiri Kosina @ 2008-07-30 15:14 UTC (permalink / raw)
  To: James Bottomley
  Cc: dsterba, Ingo Molnar, Linus Torvalds, Mike Travis, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Al Viro

On Wed, 30 Jul 2008, James Bottomley wrote:

> I see the patches came in via a straight email sequence.  They really 
> need to be in a tree somewhere that's pulled into linux-next so this 
> type of thing gets noticed.  Since they're wireless patches, why not 
> send them through John Linville's wireless tree?

Well, the 'wireless' in the name of the driver is misleading, it's not 
'wireless' device at all, it's just a serial modem (character device), 
with 'wireless' in its product name.

I used to have git tree for this driver some time ago, but after the 
driver has been merged, I have dropped it, as I didn't expect any massive 
further development of it. Apparently, David is going on with improving 
the driver (thanks!), so I'll probably resurrect it and ask Stephen to 
pull it into -next again.

Thanks,

-- 
Jiri Kosina

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [git pull] cpus4096 fixes
  2008-07-28 13:21     ` Rusty Russell
  2008-07-28 18:23       ` Mike Travis
@ 2008-07-31 10:30       ` Ingo Molnar
  1 sibling, 0 replies; 43+ messages in thread
From: Ingo Molnar @ 2008-07-31 10:30 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Linus Torvalds, linux-kernel, Andrew Morton, Mike Travis


* Rusty Russell <rusty@rustcorp.com.au> wrote:

> On Monday 28 July 2008 18:16:39 Ingo Molnar wrote:
> > * Rusty Russell <rusty@rustcorp.com.au> wrote:
> > > Mike: I now think the right long-term answer is Linus' dense cpumap
> > > idea + a convenience allocator for cpumasks.  We sweep the kernel for
> > > all on-stack vars and replace them with one or the other.  Thoughts?
> >
> > The dense cpumap for constant cpumasks is OK as it's clever, compact and
> > static.
> >
> > All-dynamic allocator for on-stack cpumasks ... is a less obvious
> > choice.
> 
> Sorry, I was unclear.  "long-term" == "more than 4096 CPUs", since I 
> thought that was Mike's aim.  If we only want to hack up 4k CPUS and 
> stop, then I understand the current approach.
> 
> If we want huge cpu numbers, I think cpumask_alloc/free gives the 
> clearest code.  So our approach is backwards: let's do that *then* put 
> ugly hacks in if it's really too slow.

My only worry with that principle is that the "does it really hurt" fact 
is seldom really provable on a standalone basis.

Creeping bloat and creeping slowdowns are the hardest to catch. A cycle 
here, a byte there, and it mounts up quickly. Coupled with faster but 
less deterministic CPUs it's pretty hard to prove a slowdown even with 
very careful profiling. We only catch the truly egregious cases that 
manage to shine through the general haze of other changes - and the haze 
is thickening every year.

I dont fundamentally disagree with turning cpumask into standalone 
objects on large machines though. I just think that our profiling 
methods are simply not good enough at the moment to truly trace small 
slowdowns back to their source commits fast enough. So the "we wont do 
it if it hurts" notion, while i agree with it, does not fulfill its 
promise in practice.

[ We might need something like a simulated reference CPU where various 
  "reference" performance tests are 100% repeatable and slowdowns are 
  thus 100% provable and bisectable. That CPU would simulate a cache and 
  would be modern in most aspects, etc. - just that the results it 
  produces would be fully deterministic in virtual time.

  Problem is, hw is not fast enough for that kind of simulation yet IMO
  (tools exist but it would not be fun at all to work in such a
  simulated environment in practice - hence kernel developers would
  generally ignore it) - so there will be a few years of uncertainty
  still. ]

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2008-07-31 10:30 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-27 19:06 [git pull] cpus4096 fixes Ingo Molnar
2008-07-27 20:15 ` Linus Torvalds
2008-07-27 21:03   ` Ingo Molnar
2008-07-28 18:42     ` Mike Travis
2008-07-27 21:05   ` Al Viro
2008-07-27 22:17     ` Linus Torvalds
2008-07-28  0:42   ` Rusty Russell
2008-07-28  3:06     ` Andrew Morton
2008-07-28  6:34       ` Rusty Russell
2008-07-28  6:58         ` Nick Piggin
2008-07-28  7:56         ` Ingo Molnar
2008-07-28 18:12         ` Mike Travis
2008-07-28  8:33     ` Ingo Molnar
2008-07-28 18:07       ` Mike Travis
2008-07-28 17:50     ` Mike Travis
2008-07-28 18:32       ` Linus Torvalds
2008-07-28 18:37         ` Linus Torvalds
2008-07-28 18:51           ` Ingo Molnar
2008-07-28 19:22             ` Mike Travis
2008-07-28 19:31               ` Mike Travis
2008-07-28 19:04         ` Mike Travis
2008-07-28 20:57         ` [rfc git pull] cpus4096 fixes, take 2 Ingo Molnar
2008-07-28 21:35           ` Ingo Molnar
2008-07-28 21:41             ` [build error] drivers/char/pcmcia/ipwireless/hardware.c:571: error: invalid use of undefined type 'struct ipw_network' Ingo Molnar
2008-07-28 22:06               ` Ingo Molnar
2008-07-28 22:20                 ` Andrew Morton
2008-07-28 22:29                   ` Ingo Molnar
2008-07-30 14:59               ` David Sterba
2008-07-30 15:11                 ` James Bottomley
2008-07-30 15:14                   ` Jiri Kosina
2008-07-28 21:36           ` [rfc git pull] cpus4096 fixes, take 2 Mike Travis
2008-07-29  1:45           ` Rusty Russell
2008-07-29 12:11             ` Ingo Molnar
2008-07-30  0:15               ` Rusty Russell
2008-07-28 18:46     ` [git pull] cpus4096 fixes Mike Travis
2008-07-28 19:13       ` Ingo Molnar
2008-07-29  1:33       ` Rusty Russell
2008-07-28  0:53 ` Rusty Russell
2008-07-28  8:16   ` Ingo Molnar
2008-07-28 13:21     ` Rusty Russell
2008-07-28 18:23       ` Mike Travis
2008-07-31 10:30       ` Ingo Molnar
2008-07-28  8:43   ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).