/*
 * Memory zone (struct zone) and node (pg_data_t) descriptors, plus the
 * helpers used to walk them.
 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifdef __KERNEL__
#ifndef __ASSEMBLY__

#include <linux/config.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <asm/atomic.h>

/* Free memory management - zoned buddy allocator.  */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif

/* Free-list bookkeeping; struct zone holds MAX_ORDER of these. */
struct free_area {
	struct list_head	free_list;
	unsigned long		nr_free;
};

struct pglist_data;

/*
 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
 * So add a wild amount of padding here to ensure that they fall into separate
 * cachelines.  There are very few zone structures in the machine, so space
 * consumption is not a concern here.
 */
#if defined(CONFIG_SMP)
struct zone_padding {
	char x[0];
} ____cacheline_maxaligned_in_smp;
#define ZONE_PADDING(name)	struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif

struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int low;		/* low watermark, refill needed */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */
	struct list_head list;	/* the list of pages */
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
#ifdef CONFIG_NUMA
	unsigned long numa_hit;		/* allocated in intended node */
	unsigned long numa_miss;	/* allocated in non intended node */
	unsigned long numa_foreign;	/* was intended here, hit elsewhere */
	unsigned long interleave_hit;	/* interleaver preferred this zone */
	unsigned long local_node;	/* allocation from local node */
	unsigned long other_node;	/* allocation from other node */
#endif
} ____cacheline_aligned_in_smp;

#define ZONE_DMA		0
#define ZONE_NORMAL		1
#define ZONE_HIGHMEM		2

#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */


/*
 * When a memory allocation must conform to specific limitations (such
 * as being suitable for DMA) the caller will pass in hints to the
 * allocator in the gfp_mask, in the zone modifier bits.  These bits
 * are used to select a priority ordered list of memory zones which
 * match the requested limits.  GFP_ZONEMASK defines which bits within
 * the gfp_mask should be considered as zone modifiers.  Each valid
 * combination of the zone modifier bits has a corresponding list
 * of zones (in node_zonelists).  Thus for two zone modifiers there
 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
 * be 8 (2 ** 3) zonelists.  GFP_ZONETYPES defines the number of possible
 * combinations of zone modifiers in "zone modifier space".
 */
#define GFP_ZONEMASK	0x03
/*
 * As an optimisation any zone modifier bits which are only valid when
 * no other zone modifier bits are set (loners) should be placed in
 * the highest order bits of this field.  This allows us to reduce the
 * extent of the zonelists thus saving space.  For example in the case
 * of three zone modifier bits, we could require up to eight zonelists.
 * If the left most zone modifier is a "loner" then the highest valid
 * zonelist would be four allowing us to allocate only five zonelists.
 * Use the first form when the left most bit is not a "loner", otherwise
 * use the second.
 */
/* #define GFP_ZONETYPES	(GFP_ZONEMASK + 1) */		/* Non-loner */
#define GFP_ZONETYPES	((GFP_ZONEMASK + 1) / 2 + 1)		/* Loner */

/*
 * On machines where it is needed (eg PCs) we divide physical memory
 * into multiple physical zones. On a PC we have 3 zones:
 *
 * ZONE_DMA	  < 16 MB	ISA DMA capable memory
 * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
 * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
 */

struct zone {
	/* Fields commonly accessed by the page allocator */
	unsigned long		free_pages;
	unsigned long		pages_min, pages_low, pages_high;
	/*
	 * We don't know if the memory that we're going to allocate will be freeable
	 * or/and it will be released eventually, so to avoid totally wasting several
	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
	 * to run OOM on the lower zones despite there's tons of freeable ram
	 * on the higher zones). This array is recalculated at runtime if the
	 * sysctl_lowmem_reserve_ratio sysctl changes.
	 */
	unsigned long		lowmem_reserve[MAX_NR_ZONES];

	struct per_cpu_pageset	pageset[NR_CPUS];

	/*
	 * free areas of different sizes
	 */
	spinlock_t		lock;
	struct free_area	free_area[MAX_ORDER];


	ZONE_PADDING(_pad1_)

	/* Fields commonly accessed by the page reclaim scanner */
	spinlock_t		lru_lock;
	struct list_head	active_list;
	struct list_head	inactive_list;
	unsigned long		nr_scan_active;
	unsigned long		nr_scan_inactive;
	unsigned long		nr_active;
	unsigned long		nr_inactive;
	unsigned long		pages_scanned;	   /* since last reclaim */
	int			all_unreclaimable; /* All pages pinned */

	/*
	 * prev_priority holds the scanning priority for this zone.  It is
	 * defined as the scanning priority at which we achieved our reclaim
	 * target at the previous try_to_free_pages() or balance_pgdat()
	 * invocation.
	 *
	 * We use prev_priority as a measure of how much stress page reclaim is
	 * under - it drives the swappiness decision: whether to unmap mapped
	 * pages.
	 *
	 * temp_priority is used to remember the scanning priority at which
	 * this zone was successfully refilled to free_pages == pages_high.
	 *
	 * Access to both these fields is quite racy even on uniprocessor.  But
	 * it is expected to average out OK.
	 */
	int temp_priority;
	int prev_priority;


	ZONE_PADDING(_pad2_)
	/* Rarely used or read-mostly fields */

	/*
	 * wait_table		-- the array holding the hash table
	 * wait_table_size	-- the size of the hash table array
	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
	 *
	 * The purpose of all these is to keep track of the people
	 * waiting for a page to become available and make them
	 * runnable again when possible. The trouble is that this
	 * consumes a lot of space, especially when so few things
	 * wait on pages at a given time. So instead of using
	 * per-page waitqueues, we use a waitqueue hash table.
	 *
	 * The bucket discipline is to sleep on the same queue when
	 * colliding and wake all in that wait queue when removing.
	 * When something wakes, it must check to be sure its page is
	 * truly available, a la thundering herd. The cost of a
	 * collision is great, but given the expected load of the
	 * table, they should be so rare as to be outweighed by the
	 * benefits from the saved space.
	 *
	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
	 * primary users of these fields, and in mm/page_alloc.c
	 * free_area_init_core() performs the initialization of them.
	 */
	wait_queue_head_t	* wait_table;
	unsigned long		wait_table_size;
	unsigned long		wait_table_bits;

	/*
	 * Discontig memory support fields.
	 */
	struct pglist_data	*zone_pgdat;
	struct page		*zone_mem_map;
	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	unsigned long		spanned_pages;	/* total size, including holes */
	unsigned long		present_pages;	/* amount of memory (excluding holes) */

	/*
	 * rarely used fields:
	 */
	char			*name;
} ____cacheline_maxaligned_in_smp;


/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * Right now a zonelist takes up less than a cacheline. We never
 * modify it apart from boot-up, and only a few indices are used,
 * so despite the zonelist table being relatively big, the cache
 * footprint of this construct is very small.
 */
struct zonelist {
	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; /* NULL delimited */
};


/*
 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
 * (mostly NUMA machines?) to denote a higher-level memory zone than the
 * zone denotes.
 *
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
struct bootmem_data;
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];
	struct zonelist node_zonelists[GFP_ZONETYPES];
	int nr_zones;
	struct page *node_mem_map;
	struct bootmem_data *bdata;
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	struct pglist_data *pgdat_next;
	wait_queue_head_t kswapd_wait;
	struct task_struct *kswapd;
	int kswapd_max_order;
} pg_data_t;

/* Per-node page counts, looked up through NODE_DATA(nid). */
#define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)

/* Head of the node list walked by for_each_pgdat() and for_each_zone(). */
extern struct pglist_data *pgdat_list;

/* Declarations only; the definitions live outside this header. */
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat);
void get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		int alloc_type, int can_try_harder, int gfp_high);

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */
#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)

/**
 * for_each_pgdat - helper macro to iterate over all nodes
 * @pgdat - pointer to a pg_data_t variable
 *
 * Meant to help with common loops of the form
 * pgdat = pgdat_list;
 * while(pgdat) {
 * 	...
 * 	pgdat = pgdat->pgdat_next;
 * }
 *
 * The argument is evaluated several times, so it must be a side-effect
 * free lvalue.  It is parenthesized at every use so that expressions such
 * as *pp are handled correctly.
 */
#define for_each_pgdat(pgdat) \
	for ((pgdat) = pgdat_list; (pgdat); (pgdat) = (pgdat)->pgdat_next)

/*
 * next_zone - helper magic for for_each_zone()
 * Thanks to William Lee Irwin III for this piece of ingenuity.
 *
 * Returns the zone following @zone: the next entry of the same node's
 * node_zones[] array, or the first zone of the next node once the last
 * entry has been reached, or NULL when there is no next node.
 */
static inline struct zone *next_zone(struct zone *zone)
{
	pg_data_t *pgdat = zone->zone_pgdat;

	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
		zone++;
	else if (pgdat->pgdat_next) {
		pgdat = pgdat->pgdat_next;
		zone = pgdat->node_zones;
	} else
		zone = NULL;

	return zone;
}

/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone - pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in. This basically means for_each_zone() is an
 * easier to read version of this piece of code:
 *
 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
 * 	for (i = 0; i < MAX_NR_ZONES; ++i) {
 * 		struct zone * z = pgdat->node_zones + i;
 * 		...
 * 	}
 *
 * The argument is evaluated several times, so it must be a side-effect
 * free lvalue.
 */
#define for_each_zone(zone) \
	for ((zone) = pgdat_list->node_zones; (zone); (zone) = next_zone(zone))

/* Nonzero when @idx is the ZONE_HIGHMEM index. */
static inline int is_highmem_idx(int idx)
{
	return (idx == ZONE_HIGHMEM);
}

/* Nonzero when @idx is the ZONE_NORMAL index. */
static inline int is_normal_idx(int idx)
{
	return (idx == ZONE_NORMAL);
}
/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone - pointer to struct zone variable
 */
static inline int is_highmem(struct zone *zone)
{
	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
}

/* Nonzero when @zone is the ZONE_NORMAL entry of its own node. */
static inline int is_normal(struct zone *zone)
{
	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
}

/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);

#include <linux/topology.h>
/* Returns the number of the current Node. */
#define numa_node_id()		(cpu_to_node(_smp_processor_id()))

#ifndef CONFIG_DISCONTIGMEM

extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)		(&contig_page_data)
#define NODE_MEM_MAP(nid)	mem_map
#define MAX_NODES_SHIFT		1
#define pfn_to_nid(pfn)		(0)

#else /* CONFIG_DISCONTIGMEM */

#include <asm/mmzone.h>

#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
/*
 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
 */
#define MAX_NODES_SHIFT		6
#elif BITS_PER_LONG == 64
/*
 * with 64 bit flags field, there's plenty of room.
 */
#define MAX_NODES_SHIFT		10
#endif

#endif /* !CONFIG_DISCONTIGMEM */

#if NODES_SHIFT > MAX_NODES_SHIFT
#error NODES_SHIFT > MAX_NODES_SHIFT
#endif

/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
#define MAX_ZONES_SHIFT		2

#if ZONES_SHIFT > MAX_ZONES_SHIFT
#error ZONES_SHIFT > MAX_ZONES_SHIFT
#endif

#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _LINUX_MMZONE_H */

