一、Linux伙伴系統分配器
伙伴系統分配器大體上分為兩類。__get_free_pages()類函數返回分配的第一個頁面的線性地址;alloc_pages()類函數返回頁面描述符地址。不管以哪種函數進行分配,最終都會調用alloc_pages()分配頁面。
為清楚了解其分配制度,先給個伙伴系統數據的存儲框圖
也就是每個order對應一個free_area結構,free_area以不同的類型以鏈表的方式存儲這些內存塊。
二、主分配函數
下面我們來看這個函數(在UMA模式下)
- /* UMA entry point: allocate on the current node. */
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order)
-
- /* Allocate 2^order contiguous pages from node nid; returns the page
-    descriptor of the first page, or NULL on failure. */
- static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
- {
- /* Unknown node is current node */
- if (nid < 0)
- nid = numa_node_id();
-
- return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
- }
- /* Thin wrapper: same as __alloc_pages_nodemask() with no nodemask
-    (i.e. no restriction on which nodes may be used). */
- static inline struct page *
- __alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
- {
- return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
- }
上層分配函數__alloc_pages_nodemask()
- /*
- * This is the 'heart' of the zoned buddy allocator.
- */
- /* Top-level allocator: tries the fast path first, then falls back to the
-    slow path which may reclaim memory and sleep. */
- struct page *
- __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
- {
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
- struct page *page;
-
- /* Convert GFP flags to their corresponding migrate type */
- int migratetype = allocflags_to_migratetype(gfp_mask);
-
- gfp_mask &= gfp_allowed_mask;
- /* lockdep tracing, for debugging only */
- lockdep_trace_alloc(gfp_mask);
- /* if __GFP_WAIT is set, the caller may sleep and be rescheduled here */
- might_sleep_if(gfp_mask & __GFP_WAIT);
- /* no-op unless the fault-injection config option is enabled */
- if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
-
- /*
- * Check the zones suitable for the gfp_mask contain at least one
- * valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
- */
- if (unlikely(!zonelist->_zonerefs->zone))
- return NULL;
-
- /* The preferred zone is used for statistics later */
- /* pick the first zone in the zonelist at or below high_zoneidx */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
- return NULL;
-
- /* First allocation attempt */
- /* fast path: allocate from the pcp lists or the buddy system */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone, migratetype);
- if (unlikely(!page))/* fast path failed: slow path, which may wait and reclaim memory */
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
- /* tracepoint, for debugging only */
- trace_mm_page_alloc(page, order, gfp_mask, migratetype);
- return page;
- }
三、從pcp和伙伴系統中正常的分配內存空間
函數get_page_from_freelist()
- /*
- * get_page_from_freelist goes through the zonelist trying to allocate
- * a page.
- */
- /* Walk every allowed zone and try to allocate the requested pages from it */
- static struct page *
- get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
- {
- struct zoneref *z;
- struct page *page = NULL;
- int classzone_idx;
- struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- /* index of the preferred zone within the node's zone array */
- classzone_idx = zone_idx(preferred_zone);
- zonelist_scan:
- /*
- * Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- /* iterate over each zone and attempt the allocation */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- /* the NUMA checks below are compiled out in UMA builds */ high_zoneidx, nodemask) {
- if (NUMA_BUILD && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
-
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- /* watermarks must be honored for this allocation */
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
- int ret;
- /* select the watermark (min/low/high) encoded in alloc_flags */
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- /* watermark is healthy: allocate from this zone */
- if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto try_this_zone;
-
- if (zone_reclaim_mode == 0)/* watermark is below the mark and zone reclaim is disabled */
- goto this_zone_full;
- /* in UMA builds the function below simply returns 0 */
- ret = zone_reclaim(zone, gfp_mask, order);
- switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
- /* did not scan */
- goto try_next_zone;
- case ZONE_RECLAIM_FULL:
- /* scanned but unreclaimable */
- goto this_zone_full;
- default:
- /* did we reclaim enough */
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto this_zone_full;
- }
- }
-
- try_this_zone:/* this zone's watermark is healthy */
- /* try the per-CPU page cache first, then fall back to the buddy system */
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
- this_zone_full:
- if (NUMA_BUILD)/* NUMA_BUILD is 0 in UMA builds */
- zlc_mark_zone_full(zonelist, z);
- try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
- }
-
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
- return page;/* the allocated page, or NULL on failure */
- }
四、pcp與伙伴系統共用的核心分配函數 buffered_rmqueue()
- /*
- * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
- * we cheat by calling it from here, in the order > 0 path. Saves a branch
- * or two.
- */
- /* Allocate from the per-CPU page (pcp) cache when order == 0; for
-    order > 0 allocate directly from the buddy system. */
- static inline
- struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
- {
- unsigned long flags;
- struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);/* set when the caller asked for a cache-cold page */
- int cpu;
-
- again:
- cpu = get_cpu();
- if (likely(order == 0)) {/* single page: use the per-CPU page cache */
- struct per_cpu_pages *pcp;
- struct list_head *list;
- /* this CPU's pcp cache for the zone */
- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];/* pcp list of the requested migrate type */
-
- /* Interrupts must be disabled here: memory reclaim can send IPIs that
-    force each CPU to drain its pcp cache, and interrupt handlers also
-    allocate single pages themselves. */
- local_irq_save(flags);
- if (list_empty(list)) {/* pcp cache is empty, refill it */
- /* pull pcp->batch pages from the buddy system in one bulk
-    operation (batch is the per-refill page count) */
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- /* still empty: the buddy system is exhausted too, fail */
- if (unlikely(list_empty(list)))
- goto failed;
- }
- /* If the caller does not care about hardware cache warmth (note:
-    distinct from the per-CPU page cache), take the coldest page,
-    i.e. the tail of the list */
- if (cold)
- page = list_entry(list->prev, struct page, lru);
- else/* otherwise take the head of the list: the page most recently
-    freed to the pcp cache, hence the cache-hottest one */
- page = list_entry(list->next, struct page, lru);
-
- list_del(&page->lru);/* detach the page from the pcp list */
- pcp->count--;/* one fewer page in the pcp cache */
- }
- else {/* order > 0: bypass the pcp cache and allocate straight from the buddy system */
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
- /* disable interrupts and take the zone's lock */
- spin_lock_irqsave(&zone->lock, flags);
- /* take a block from the free list of the matching migrate type */
- page = __rmqueue(zone, order, migratetype);
- /* 1 << order pages were handed out: update the zone's free-page count */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
- spin_unlock(&zone->lock);/* drop only the spinlock; interrupts stay off until the stats below are updated */
- if (!page)
- goto failed;
- }
- /* event counters, for statistics/debugging */
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);/* re-enable interrupts */
- put_cpu();
-
- VM_BUG_ON(bad_range(zone, page));
-
- /* Sanity-check the page and do final preparation; if its flags show
-    the page is corrupt, retry and allocate a different page. */
- if (prep_new_page(page, order, gfp_flags))
- goto again;
- return page;
-
- failed:
- local_irq_restore(flags);
- put_cpu();
- return NULL;
- }