一、Linux伙伴系統分配器
伙伴系統分配器大體上分為兩類。__get_free_pages()類函數返回分配的第一個頁面的線性地址;alloc_pages()類函數返回頁面描述符地址。不管以哪種函數進行分配,最終都會調用alloc_pages()分配頁面。
為清楚了解其分配制度,先給個伙伴系統數據的存儲框圖
也就是每個order對應一個free_area結構,free_area以不同的類型以鏈表的方式存儲這些內存塊。
二、主分配函數
下面我們來看這個函數(在UMA模式下)
- /* UMA entry point: allocate on the current node. */
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order)
-
- /* Allocate 2^order contiguous pages from node nid; returns the page
-    descriptor of the first page, or NULL on failure. */
- static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
- {
- /* Unknown node is current node */
- if (nid < 0)
- nid = numa_node_id();
-
- return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
- }
- /* Thin wrapper: same as __alloc_pages_nodemask() with no nodemask
-    (i.e. no restriction on which nodes may be used). */
- static inline struct page *
- __alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
- {
- return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
- }
上層分配函數__alloc_pages_nodemask()
- /*
- * This is the 'heart' of the zoned buddy allocator.
- */
- /* Top-level allocator: tries the fast path first, then falls back to the
-    slow path which may reclaim memory and sleep. */
- struct page *
- __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
- {
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
- struct page *page;
-
- /* Convert GFP flags to their corresponding migrate type */
- int migratetype = allocflags_to_migratetype(gfp_mask);
-
- gfp_mask &= gfp_allowed_mask;
- /* lockdep tracing, for debugging only */
- lockdep_trace_alloc(gfp_mask);
- /* if __GFP_WAIT is set, the caller may sleep and be rescheduled here */
- might_sleep_if(gfp_mask & __GFP_WAIT);
- /* no-op unless the fault-injection config option is enabled */
- if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
-
- /*
- * Check the zones suitable for the gfp_mask contain at least one
- * valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
- */
- if (unlikely(!zonelist->_zonerefs->zone))
- return NULL;
-
- /* The preferred zone is used for statistics later */
- /* pick the first zone in the zonelist at or below high_zoneidx */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
- return NULL;
-
- /* First allocation attempt */
- /* fast path: allocate from the pcp lists or the buddy system */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone, migratetype);
- if (unlikely(!page))/* fast path failed: slow path, which may wait and reclaim memory */
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
- /* tracepoint, for debugging only */
- trace_mm_page_alloc(page, order, gfp_mask, migratetype);
- return page;
- }
三、從pcp和伙伴系統中正常的分配內存空間
函數get_page_from_freelist()
- /*
- * get_page_from_freelist goes through the zonelist trying to allocate
- * a page.
- */
- /* Walk every allowed zone and try to allocate the requested pages from it */
- static struct page *
- get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
- {
- struct zoneref *z;
- struct page *page = NULL;
- int classzone_idx;
- struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- /* index of the preferred zone within the node's zone array */
- classzone_idx = zone_idx(preferred_zone);
- zonelist_scan:
- /*
- * Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- /* iterate over each zone and attempt the allocation */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- /* the NUMA checks below are compiled out in UMA builds */ high_zoneidx, nodemask) {
- if (NUMA_BUILD && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
-
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- /* watermarks must be honored for this allocation */
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
- int ret;
- /* select the watermark (min/low/high) encoded in alloc_flags */
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- /* watermark is healthy: allocate from this zone */
- if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto try_this_zone;
-
- if (zone_reclaim_mode == 0)/* watermark is below the mark and zone reclaim is disabled */
- goto this_zone_full;
- /* in UMA builds the function below simply returns 0 */
- ret = zone_reclaim(zone, gfp_mask, order);
- switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
- /* did not scan */
- goto try_next_zone;
- case ZONE_RECLAIM_FULL:
- /* scanned but unreclaimable */
- goto this_zone_full;
- default:
- /* did we reclaim enough */
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto this_zone_full;
- }
- }
-
- try_this_zone:/* this zone's watermark is healthy */
- /* try the per-CPU page cache first, then fall back to the buddy system */
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
- this_zone_full:
- if (NUMA_BUILD)/* NUMA_BUILD is 0 in UMA builds */
- zlc_mark_zone_full(zonelist, z);
- try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
- }
-
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
- return page;/* the allocated page, or NULL on failure */
- }
四、pcp與伙伴系統共用的核心分配函數 buffered_rmqueue()
- /*
- * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
- * we cheat by calling it from here, in the order > 0 path. Saves a branch
- * or two.
- */
- /* Allocate from the per-CPU page (pcp) cache when order == 0; for
-    order > 0 allocate directly from the buddy system. */
- static inline
- struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
- {
- unsigned long flags;
- struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);/* set when the caller asked for a cache-cold page */
- int cpu;
-
- again:
- cpu = get_cpu();
- if (likely(order == 0)) {/* single page: use the per-CPU page cache */
- struct per_cpu_pages *pcp;
- struct list_head *list;
- /* this CPU's pcp cache for the zone */
- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];/* pcp list of the requested migrate type */
-
- /* Interrupts must be disabled here: memory reclaim can send IPIs that
-    force each CPU to drain its pcp cache, and interrupt handlers also
-    allocate single pages themselves. */
- local_irq_save(flags);
- if (list_empty(list)) {/* pcp cache is empty, refill it */
- /* pull pcp->batch pages from the buddy system in one bulk
-    operation (batch is the per-refill page count) */
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- /* still empty: the buddy system is exhausted too, fail */
- if (unlikely(list_empty(list)))
- goto failed;
- }
- /* If the caller does not care about hardware cache warmth (note:
-    distinct from the per-CPU page cache), take the coldest page,
-    i.e. the tail of the list */
- if (cold)
- page = list_entry(list->prev, struct page, lru);
- else/* otherwise take the head of the list: the page most recently
-    freed to the pcp cache, hence the cache-hottest one */
- page = list_entry(list->next, struct page, lru);
-
- list_del(&page->lru);/* detach the page from the pcp list */
- pcp->count--;/* one fewer page in the pcp cache */
- }
- else {/* order > 0: bypass the pcp cache and allocate straight from the buddy system */
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
- /* disable interrupts and take the zone's lock */
- spin_lock_irqsave(&zone->lock, flags);
- /* take a block from the free list of the matching migrate type */
- page = __rmqueue(zone, order, migratetype);
- /* 1 << order pages were handed out: update the zone's free-page count */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
- spin_unlock(&zone->lock);/* drop only the spinlock; interrupts stay off until the stats below are updated */
- if (!page)
- goto failed;
- }
- /* event counters, for statistics/debugging */
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);/* re-enable interrupts */
- put_cpu();
-
- VM_BUG_ON(bad_range(zone, page));
-
- /* Sanity-check the page and do final preparation; if its flags show
-    the page is corrupt, retry and allocate a different page. */
- if (prep_new_page(page, order, gfp_flags))
- goto again;
- return page;
-
- failed:
- local_irq_restore(flags);
- put_cpu();
- return NULL;
- }