ae01f84b93
Now that we dynamically allocate the paca array, it takes an extra load
whenever we want to access another cpu's paca. One place we do that a lot
is per cpu variables. A simple example:

DEFINE_PER_CPU(unsigned long, vara);
unsigned long test4(int cpu)
{
	return per_cpu(vara, cpu);
}

This takes 4 loads, 5 if you include the actual load of the per cpu variable:

    ld r11,-32760(r30)  # load address of paca pointer
    ld r9,-32768(r30)   # load link address of percpu variable
    sldi r3,r29,9       # get offset into paca (each entry is 512 bytes)
    ld r0,0(r11)        # load paca pointer
    add r3,r0,r3        # paca + offset
    ld r11,64(r3)       # load paca[cpu].data_offset
    ldx r3,r9,r11       # load per cpu variable

If we remove the ppc64 specific per_cpu_offset(), we get the generic one
which indexes into a statically allocated array. This removes one load and
one add:

    ld r11,-32760(r30)  # load address of __per_cpu_offset
    ld r9,-32768(r30)   # load link address of percpu variable
    sldi r3,r29,3       # get offset into __per_cpu_offset (each entry 8 bytes)
    ldx r11,r11,r3      # load __per_cpu_offset[cpu]
    ldx r3,r9,r11       # load per cpu variable

Having all the offsets in one array also helps when iterating over a per cpu
variable across a number of cpus, such as in the scheduler. Before, we would
need to load one paca cacheline when calculating each per cpu offset. Now we
have 16 (128 / sizeof(long)) per cpu offsets in each cacheline.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
22 lines
429 B
C
22 lines
429 B
C
#ifndef _ASM_POWERPC_PERCPU_H_
#define _ASM_POWERPC_PERCPU_H_
#ifdef __powerpc64__

/*
 * Same as asm-generic/percpu.h, except that on 64-bit SMP builds
 * __my_cpu_offset is read from the paca (local_paca->data_offset).
 * Based on the x86-64 implementation.
 *
 * Only __my_cpu_offset is overridden here. per_cpu_offset(cpu) is
 * deliberately not defined in this file, so the generic definition
 * (an index into a statically allocated offset array) is used when
 * looking up another cpu's offset.
 */

#ifdef CONFIG_SMP

#include <asm/paca.h>

/* Must be defined before asm-generic/percpu.h is included below. */
#define __my_cpu_offset local_paca->data_offset

#endif /* CONFIG_SMP */
#endif /* __powerpc64__ */

/* Supplies every per cpu definition not overridden above. */
#include <asm-generic/percpu.h>

#endif /* _ASM_POWERPC_PERCPU_H_ */