kernel-ark/include/asm-powerpc/paca.h
Anton Blanchard 7a0268fa1a [PATCH] powerpc/64: per cpu data optimisations
The current ppc64 per cpu data implementation is quite slow. eg:

        lhz 11,18(13)           /* smp_processor_id() */
        ld 9,.LC63-.LCTOC1(30)  /* per_cpu__variable_name */
        ld 8,.LC61-.LCTOC1(30)  /* __per_cpu_offset */
        sldi 11,11,3            /* form index into __per_cpu_offset */
        mr 10,9
        ldx 9,11,8              /* __per_cpu_offset[smp_processor_id()] */
        ldx 0,10,9              /* load per cpu data */

5 loads for something that is supposed to be fast, pretty awful. One
reason for the large number of loads is that we have to synthesize 2
64bit constants (per_cpu__variable_name and __per_cpu_offset).

By putting __per_cpu_offset into the paca we can avoid the 2 loads
associated with it:

        ld 11,56(13)            /* paca->data_offset */
        ld 9,.LC59-.LCTOC1(30)  /* per_cpu__variable_name */
        ldx 0,9,11              /* load per cpu data */

Longer term we should be able to do even better than 3 loads.
If per_cpu__variable_name wasn't a 64bit constant and paca->data_offset
was in a register we could cut it down to one load. A suggestion from
Rusty is to use gcc's __thread extension here. In order to do this we
would need to free up r13 (the __thread register and where the paca
currently is). So far I've had a few unsuccessful attempts at doing that :)

The patch also allocates per cpu memory node local on NUMA machines.
This patch from Rusty has been sitting in my queue _forever_ but stalled
when I hit the compiler bug. Sorry about that.

Finally I also only allocate per cpu data for possible cpus, which comes
straight out of the x86-64 port. On a pseries kernel (with NR_CPUS == 128)
and 4 possible cpus we see some nice gains:

             total       used       free     shared    buffers cached
Mem:       4012228     212860    3799368          0          0 162424

             total       used       free     shared    buffers cached
Mem:       4016200     212984    3803216          0          0 162424

A saving of 3.75MB. Quite nice for smaller machines. Note: we now have
to be careful of per cpu users that touch data for !possible cpus.

At this stage it might be worth making the NUMA and possible cpu
optimisations generic, but per cpu init is done so early we have to be
careful that all architectures have their possible map setup correctly.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-01-11 14:49:45 +11:00

117 lines
3.7 KiB
C

/*
* include/asm-powerpc/paca.h
*
* This control block defines the PACA which defines the processor
* specific data for each logical processor on the system.
* There are some pointers defined that are utilized by PLIC.
*
* C 2001 PPC 64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_POWERPC_PACA_H
#define _ASM_POWERPC_PACA_H
#ifdef __KERNEL__
#include <linux/config.h>
#include <asm/types.h>
#include <asm/lppaca.h>
#include <asm/mmu.h>
/*
* Global register variable: the paca pointer is bound to r13, so
* every reference to local_paca is a reference to that register.
*/
register struct paca_struct *local_paca asm("r13");
/* Accessor for the paca pointer; expands to the r13-bound local_paca. */
#define get_paca() local_paca
/*
* Forward declaration only: the paca stores a pointer to task_struct
* (see __current), so the full definition is not needed here.
*/
struct task_struct;
/*
* Defines the layout of the paca, the per-logical-processor control
* block.
*
* This structure is not directly accessed by firmware or the service
* processor except for the first two pointers that point to the
* lppaca area and the ItLpRegSave area for this CPU. The lppaca
* object is currently contained within the PACA but it doesn't need
* to be.
*
* Field order, size and alignment are significant: read the MAGIC
* notes below before moving, resizing or inserting any field.
*/
struct paca_struct {
/*
* Because hw_cpu_id, unlike other paca fields, is accessed
* routinely from other CPUs (from the IRQ code), we stick to
* read-only (after boot) fields in the first cacheline to
* avoid cacheline bouncing.
*/
/*
* MAGIC: These first two pointers can't be moved - they're
* accessed by the firmware
*/
struct lppaca *lppaca_ptr; /* Pointer to LpPaca for PLIC */
#ifdef CONFIG_PPC_ISERIES
/* Second firmware-visible pointer; only compiled in for iSeries. */
void *reg_save_ptr; /* Pointer to LpRegSave for PLIC */
#endif /* CONFIG_PPC_ISERIES */
/*
* MAGIC: the spinlock functions in arch/ppc64/lib/locks.c
* load lock_token and paca_index with a single lwz
* instruction. They must travel together and be properly
* aligned.
* The two u16 fields must therefore stay adjacent, in this order,
* so that together they form the one 32-bit word that lwz reads.
*/
u16 lock_token; /* Constant 0x8000, used in locks */
u16 paca_index; /* Logical processor number */
u64 kernel_toc; /* Kernel TOC address */
u64 stab_real; /* Absolute address of segment table */
u64 stab_addr; /* Virtual address of segment table */
void *emergency_sp; /* pointer to emergency stack */
u64 data_offset; /* per cpu data offset */
s16 hw_cpu_id; /* Physical processor number */
u8 cpu_start; /* At startup, processor spins until */
/* this becomes non-zero. */
/*
* Now, starting in cacheline 2, the exception save areas
*/
/*
* used for most interrupts/exceptions
* The aligned(0x80) attribute places exgen on a 128-byte boundary,
* which is what separates it from the read-mostly fields above.
*/
u64 exgen[10] __attribute__((aligned(0x80)));
u64 exmc[10]; /* used for machine checks */
u64 exslb[10]; /* used for SLB/segment table misses
* on the linear mapping */
#ifdef CONFIG_PPC_64K_PAGES
/*
* Page directory pointer; only compiled in with 64K pages.
* NOTE(review): presumably the pgd of the current context - confirm.
*/
pgd_t *pgdir;
#endif /* CONFIG_PPC_64K_PAGES */
/* NOTE(review): presumably the MMU context currently in use on this CPU - confirm. */
mm_context_t context;
/*
* NOTE(review): looks like a small record of SLB entries with
* slb_cache_ptr as its fill index - confirm against the SLB code.
*/
u16 slb_cache[SLB_CACHE_ENTRIES];
u16 slb_cache_ptr;
/*
* then miscellaneous read-write fields
*/
struct task_struct *__current; /* Pointer to current */
u64 kstack; /* Saved Kernel stack addr */
u64 stab_rr; /* stab/slb round-robin counter */
u64 saved_r1; /* r1 save for RTAS calls */
u64 saved_msr; /* MSR saved here by enter_rtas */
u8 proc_enabled; /* irq soft-enable flag */
/*
* iSeries structure which the hypervisor knows about -
* this structure should not cross a page boundary.
* The vpa_init/register_vpa call is now known to fail if the
* lppaca structure crosses a page boundary.
* The lppaca is also used on POWER5 pSeries boxes.
* The lppaca is 640 bytes long, and cannot readily change
* since the hypervisor knows its layout, so a 1kB
* alignment will suffice to ensure that it doesn't
* cross a page boundary.
* (0x400 below is that 1kB alignment.)
*/
struct lppaca lppaca __attribute__((__aligned__(0x400)));
};
/*
* The array of pacas; the definition lives outside this header.
* NOTE(review): presumably one entry per logical CPU, indexed by
* logical processor number (paca_index) - confirm against the definition.
*/
extern struct paca_struct paca[];
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_PACA_H */