您現在的位置： Linux教程網 >> UnixLinux >> Linux基礎 >> 關於Linux

linux內存管理--實際分配函數 buffered_rmqueue

不管是快速分配還是慢速分配，實際分配內存的都是 buffered_rmqueue()函數，其他的都是在選擇從哪個地方來分配比較合適；

還是先來說說各個參數：

struct zone *preferred_zone 表示分配所能接受的最大zone類型（首選zone）；在本函數中它只被傳給 zone_statistics() 用於統計

struct zone *zone 表示就在該zone上分配內存；

int order 表示分配頁的階數

gfp_t gfp_flags 分配的標識（下面調用處傳入的實參名為 gfp_mask）

int migratetype 表示要分配的頁的遷移類型

page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);



/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
/*
 * buffered_rmqueue - take a block of 2^order pages out of @zone.
 *
 * @preferred_zone: only forwarded to zone_statistics() in this function.
 * @zone:           zone the pages are taken from.
 * @order:          allocation order; 0 selects the per-cpu list path.
 * @gfp_flags:      allocation flags; __GFP_COLD and __GFP_NOFAIL are
 *                  examined here, the rest is passed on.
 * @migratetype:    migrate type whose lists are searched.
 *
 * Order-0 requests are served from this CPU's per-cpu page list for
 * @migratetype, which is refilled through rmqueue_bulk() when it is empty.
 * Larger requests go to __rmqueue() under zone->lock.
 *
 * Returns the first page of the block, or NULL when no page could be
 * obtained.  When prep_new_page() returns non-zero the whole attempt is
 * repeated from the top.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD); /* 1 when the caller asked for a cold page */

again:
    if (likely(order == 0)) { /* single page */
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags); /* save the interrupt state in flags, then disable local interrupts */
        pcp = &this_cpu_ptr(zone->pageset)->pcp; /* this CPU's page cache for the zone */
        list = &pcp->lists[migratetype]; /* cached free pages of the requested migrate type */
        if (list_empty(list)) { /* nothing cached for this migrate type */
            /*
             * Refill the list with up to pcp->batch order-0 pages;
             * rmqueue_bulk() returns how many it actually added.
             */
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        /* Cold requests take the tail of the list, all others the head. */
        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        /*
         * Only the spinlock is dropped here; the interrupt state saved in
         * flags is restored further down by local_irq_restore().
         */
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_freepage_state(zone, -(1 << order),
                      get_pageblock_migratetype(page));
    }

    /* Accounting happens before the saved interrupt state is restored. */
    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone, gfp_flags);
    local_irq_restore(flags);

    VM_BUG_ON(bad_range(zone, page));
    /* Non-zero from prep_new_page(): retry the allocation from scratch. */
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    /* Both paths arrive here with the interrupt state still saved in flags. */
    local_irq_restore(flags);
    return NULL;
}

struct zone結構體中有個struct per_cpu_pageset __percpu *pageset; 成員，該成員用於冷熱分配器，熱頁表示已經在cpu的高速緩存中了；

/*
 * Per-CPU data reached through a zone's pageset pointer; the allocator
 * fast path uses it as this_cpu_ptr(zone->pageset)->pcp.
 */
struct per_cpu_pageset {
    struct per_cpu_pages pcp; /* this CPU's cached free pages */
#ifdef CONFIG_NUMA
    s8 expire; /* only present when CONFIG_NUMA is set */
#endif
#ifdef CONFIG_SMP
    /* The two fields below are only present when CONFIG_SMP is set. */
    s8 stat_threshold;
    s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; /* one entry per zone stat item */
#endif
};

cpu緩存頁數組

struct per_cpu_pages {
    int count;      /* number of pages in the list */列表中頁數
    int high;       /* high watermark, emptying needed */列表頁數的上限
    int batch;      /* chunk size for buddy add/remove */添加和刪除頁時，一次操作多少頁。不是單頁刪除和填充的，而是以該單位頁來操作的

    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];//遷移類型的鏈表
};

從伙伴系統中得到頁，然後填充到cpu的高速緩存中

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 *
 * @zone:        zone to take the blocks from; zone->lock is taken here.
 * @order:       order of each block requested from __rmqueue().
 * @count:       maximum number of blocks to move.
 * @list:        list the blocks are added to.
 * @migratetype: migrate type requested from __rmqueue().
 * @cold:        non-zero to insert with list_add_tail() instead of list_add().
 *
 * The lock is taken with plain spin_lock(); the caller visible in this
 * file, buffered_rmqueue(), has already disabled local interrupts.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, int cold)
{
    int mt = migratetype, i;

    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) { /* one block per iteration */
        struct page *page = __rmqueue(zone, order, migratetype); /* block of the requested migrate type (or a fallback) */
        if (unlikely(page == NULL))
            break;

        /*   
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the callers and
         * list and the list head then moves forward. From the callers
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */
        if (likely(cold == 0))
            list_add(&page->lru, list); /* hot request: insert right after the current position */
        else 
            list_add_tail(&page->lru, list); /* cold request: insert right before the current position */
        if (IS_ENABLED(CONFIG_CMA)) { /* only when built with CONFIG_CMA */
            mt = get_pageblock_migratetype(page); /* migrate type of the page's pageblock */
            if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) /* neither CMA nor isolate: use the requested type */
                mt = migratetype;
        }    
        set_freepage_migratetype(page, mt); /* record the migrate type on the page */
        list = &page->lru; /* next insertion is relative to the page just added */
        if (is_migrate_cma(mt)) /* block came from a CMA pageblock */
            __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                          -(1 << order)); /* it no longer counts as a free CMA page */
    }
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); /* one update for all i blocks taken */
    spin_unlock(&zone->lock);
    return i; /* number of blocks added to the list */
}

修改對應類型的頁面計數

/*
 * Adjust the counter @item of @zone by @delta (may be negative).
 * Thin wrapper that forwards to zone_page_state_add().
 */
static inline void __mod_zone_page_state(struct zone *zone,
            enum zone_stat_item item, int delta)
{       
    zone_page_state_add(delta, zone, item);
}  

/*
 * Add @x to the counter @item, both in the per-zone array
 * zone->vm_stat[] and in the global vm_stat[] array.
 * Each addition is done with atomic_long_add().
 */
static inline void zone_page_state_add(long x, struct zone *zone,
                 enum zone_stat_item item)
{
    atomic_long_add(x, &zone->vm_stat[item]); /* per-zone counter */
    atomic_long_add(x, &vm_stat[item]); /* global counter */
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 *
 * The free lists of @migratetype are tried first.  If they yield nothing,
 * the fallback lists are tried, and as a last resort the search is repeated
 * with MIGRATE_RESERVE.  Returns the page found, or NULL.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *found;

    /*
     * A loop rather than a second call keeps __rmqueue_smallest(), which
     * is inline, at a single call site.
     */
    for (;;) {
        found = __rmqueue_smallest(zone, order, migratetype);

        if (unlikely(!found) && migratetype != MIGRATE_RESERVE) {
            /* Nothing of the wanted type: borrow from another type. */
            found = __rmqueue_fallback(zone, order, migratetype);

            /*
             * Use MIGRATE_RESERVE rather than fail an allocation:
             * go round once more with the reserve type.
             */
            if (!found) {
                migratetype = MIGRATE_RESERVE;
                continue;
            }
        }
        break;
    }

    trace_mm_page_alloc_zone_locked(found, order, migratetype);
    return found;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 *
 * Orders are scanned upwards from @order.  The first non-empty list
 * supplies the block, and anything beyond 2^order pages is handed back
 * to the lower-order lists by expand().  Returns NULL when every list
 * from @order up to MAX_ORDER - 1 is empty.
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int cur;

    /* Find a page of the appropriate size in the preferred list */
    for (cur = order; cur < MAX_ORDER; cur++) {
        struct free_area *fa = &zone->free_area[cur];
        struct page *blk;

        if (!list_empty(&fa->free_list[migratetype])) {
            /* Take the first block on this order's list. */
            blk = list_entry(fa->free_list[migratetype].next,
                             struct page, lru);
            list_del(&blk->lru);
            rmv_page_order(blk);
            /* nr_free counts blocks of this order, not pages. */
            fa->nr_free--;
            /* Give back whatever exceeds the requested order. */
            expand(zone, blk, order, cur, fa, migratetype);
            return blk;
        }
    }

    return NULL;
}

這是buddy的一個重要函數：當在高階上分配到內存塊、而我們需要的是低階的內存時，就要調用下面這個函數。比如在8階上分配到了內存塊，而我們需要的是6階：先把該內存塊的後一半掛到7階的空閒鏈表上，再把剩下的前一半對半拆開，把其中的後一半掛到6階的空閒鏈表上；這時剩下的部分正好是6階，就是我們需要分配的大小，直接返回；

說下參數：

struct zone *zone：所有的操作都在該zone上完成

struct page *page：高階上分配得到的頁塊

int low：我們需要的內存階

int high：在該階上分配到的內存

struct free_area *area：這是zone上的高階空閒頁數組項

int migratetype：遷移類型

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
/*
 * expand - split a block of order @high down to order @low.
 *
 * @zone:        zone all the work is done in.
 * @page:        first page of the block that was taken at order @high.
 * @low:         order the caller actually needs.
 * @high:        order of the block that was found.
 * @area:        free_area entry for order @high; stepped down one entry
 *               per split.
 * @migratetype: free list the split-off halves are put on.
 *
 * On every step the upper half of the remaining block is put on the free
 * list one order lower; the lower half, starting at @page, is what is left
 * for the caller.
 */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) { /* nothing to split when the block already has the requested order */
        area--; /* free_area entry of the next lower order */
        high--; /* next lower order */
        size >>= 1; /* half of the current block */
        VM_BUG_ON(bad_range(zone, &page[size]));

#ifdef CONFIG_DEBUG_PAGEALLOC
        if (high < debug_guardpage_minorder()) {
            /*
             * Mark as guard pages (or page), that will allow to
             * merge back to allocator when buddy will be freed.
             * Corresponding page table entries will not be touched,
             * pages will stay not present in virtual address space
             */
            INIT_LIST_HEAD(&page[size].lru);
            set_page_guard_flag(&page[size]);
            set_page_private(&page[size], high);
            /* Guard pages are not available for any usage */
            __mod_zone_freepage_state(zone, -(1 << high),
                          migratetype);
            continue;
        }
#endif
        list_add(&page[size].lru, &area->free_list[migratetype]); /* upper half goes on this order's free list */
        area->nr_free++; /* one more free block at this order */
        /*
         * Record the order on the half that was just freed.
         * NOTE(review): the original annotation said this clears the buddy
         * flag, which is at odds with the block having just been put on a
         * free list; set_page_order() is defined elsewhere - confirm.
         */
        set_page_order(&page[size], high);
    }
}

跑到這個函數時，表明上面指定遷移類型從伙伴系統中分配內存失敗，所以要用備用遷移列表；

/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 *
 * Each row is indexed by the migrate type that ran dry and lists the
 * types to try instead, in order.  __rmqueue_fallback() stops scanning
 * a row at its first MIGRATE_RESERVE entry, so that entry acts as the
 * terminator.  The row width of 4 matches the longest row
 * (MIGRATE_MOVABLE when CONFIG_CMA is set).
 */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
#ifdef CONFIG_CMA
    [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
#else
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
#endif
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
    [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
#endif
};

根據上面的備用遷移類型來遍歷

/* Remove an element from the buddy allocator from the fallback list */
/*
 * @zone:              zone to search.
 * @order:             order the caller needs.
 * @start_migratetype: migrate type that could not be satisfied; it selects
 *                     the row of fallbacks[] to walk.
 *
 * Orders are scanned from MAX_ORDER-1 down to @order, and for each order
 * the fallback types are tried in table order.  Returns the first page of
 * a block split down to @order, or NULL when no fallback list has a block.
 */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area * area;
    int current_order;
    struct page *page;
    int migratetype, i;

    /* Find the largest possible block of pages in the other list */
    for (current_order = MAX_ORDER-1; current_order >= order;
                        --current_order) {
        /*
         * Unlike the search within the requested type, this scan starts
         * at the largest order; per the original annotation the intent is
         * to limit fragmentation.
         */
        for (i = 0;; i++) {
            migratetype = fallbacks[start_migratetype][i];

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE) /* end of this row: go to the next lower order */
                break;

            area = &(zone->free_area[current_order]); /* free_area entry of the order being scanned */
            if (list_empty(&area->free_list[migratetype])) /* nothing of this fallback type here: try the next type */
                continue;

            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru);
            /* A block was found: this order now has one free block less. */
            area->nr_free--;

            /*
             * If breaking a large block of pages, move all free
             * pages to the preferred allocation list. If falling
             * back for a reclaimable kernel allocation, be more
             * aggressive about taking ownership of free pages
             *
             * On the other hand, never change migration
             * type of MIGRATE_CMA pageblocks nor move CMA
             * pages on different free lists. We don't
             * want unmovable pages to be allocated from
             * MIGRATE_CMA areas.
             */
            /*
             * NOTE(review): pageblock_order and pageblock_nr_pages are
             * defined elsewhere.  Per the original annotation,
             * pageblock_order is the order the kernel treats as a "large"
             * allocation (configuration dependent, commonly MAX_ORDER - 1)
             * and pageblock_nr_pages is the matching page count - confirm.
             */
            if (!is_migrate_cma(migratetype) && /* never for CMA blocks */
                (unlikely(current_order >= pageblock_order / 2) || /* large block: hand all of it to start_migratetype */
                 start_migratetype == MIGRATE_RECLAIMABLE || /* reclaimable requests take over more aggressively */
                 page_group_by_mobility_disabled)) {
                int pages;
                pages = move_freepages_block(zone, page,
                                start_migratetype); /* move the free pages of this pageblock to start_migratetype's lists */

                /* Claim the whole block if over half of it is free */
                if (pages >= (1 << (pageblock_order-1)) ||
                        page_group_by_mobility_disabled)
                    set_pageblock_migratetype(page,
                                start_migratetype); /* retag the pageblock itself; the call above only moved its free pages */

                migratetype = start_migratetype;
            }

            /* Remove the page from the freelists */
            list_del(&page->lru);
            rmv_page_order(page); /* the block is leaving the free lists */

            /* Take ownership for orders >= pageblock_order */
            if (current_order >= pageblock_order &&
                !is_migrate_cma(migratetype))
                change_pageblock_range(page, current_order,
                            start_migratetype); /* retag every pageblock covered by this block */

            expand(zone, page, order, current_order, area,
                   is_migrate_cma(migratetype)
                 ? migratetype : start_migratetype); /* split the block down to the requested order */

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype);

            return page;
        }
    }

    return NULL;
}

/*
 * Move the free pages of the pageblock containing @page to the free lists
 * of @migratetype.
 *
 * The range handed to move_freepages() is the pageblock_nr_pages-aligned
 * block around @page.  If the aligned start lies outside @zone the range
 * starts at @page instead; if the end lies outside @zone nothing is moved
 * and 0 is returned.  Otherwise the result of move_freepages() is returned.
 */
int move_freepages_block(struct zone *zone, struct page *page,
                int migratetype)
{
    /* Round the page frame number down to the start of its pageblock. */
    unsigned long pfn_lo = page_to_pfn(page) & ~(pageblock_nr_pages-1);
    unsigned long pfn_hi = pfn_lo + pageblock_nr_pages - 1;
    struct page *first = pfn_to_page(pfn_lo);
    /* The last page is always derived from the aligned start. */
    struct page *last = first + pageblock_nr_pages - 1;

    /* Do not cross zone boundaries */
    if (!zone_spans_pfn(zone, pfn_lo))
        first = page;
    if (!zone_spans_pfn(zone, pfn_hi))
        return 0;

    return move_freepages(zone, first, last, migratetype);
}

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 *
 * In other words, this function does no alignment of its own and takes the
 * range exactly as given; move_freepages_block() is the wrapper that rounds
 * the start down to a pageblock boundary before calling in here.
 *
 * @zone:        zone whose free lists are modified.
 * @start_page:  first page of the range.
 * @end_page:    last page of the range (inclusive).
 * @migratetype: migrate type of the destination free lists.
 *
 * Returns the number of pages moved.
 */
int move_freepages(struct zone *zone,
              struct page *start_page, struct page *end_page,
              int migratetype)
{
    struct page *page;
    unsigned long order;
    int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
    /*
     * page_zone is not safe to call in this context when
     * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
     * anyway as we check zone boundaries in move_freepages_block().
     * Remove at a later date when no bug reports exist related to
     * grouping pages by mobility
     */
    BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

    for (page = start_page; page <= end_page;) {
        /* Make sure we are not inadvertently changing nodes */
        VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

        if (!pfn_valid_within(page_to_pfn(page))) {
            page++;
            continue;
        }

        if (!PageBuddy(page)) { /* not flagged PageBuddy: step over this page */
            page++;
            continue;
        }

        order = page_order(page); /* order of the free block starting here */
        list_move(&page->lru,
              &zone->free_area[order].free_list[migratetype]); /* re-home the block on the destination type's list */
        set_freepage_migratetype(page, migratetype); /* record the new migrate type on the page */
        page += 1 << order; /* skip the whole block of 2^order pages */
        pages_moved += 1 << order;
    }

    return pages_moved; /* total number of pages moved within the range */
}

/*
 * Set the migrate type of every pageblock covered by a block of order
 * @start_order that begins at @pageblock_page.
 *
 * The block spans 1 << (start_order - pageblock_order) pageblocks; the only
 * caller visible in this file invokes it with start_order >= pageblock_order.
 */
static void change_pageblock_range(struct page *pageblock_page,
                    int start_order, int migratetype)
{
    int left = 1 << (start_order - pageblock_order);
    struct page *blk;

    /* Step through the range one pageblock at a time. */
    for (blk = pageblock_page; left != 0; left--, blk += pageblock_nr_pages)
        set_pageblock_migratetype(blk, migratetype);
}

上一篇文章： linux中頁緩沖和塊緩沖之概念
下一篇文章： Linux啟動過程學習

關於Linux

Linux Kernel 3.0.8 內存管理函數

淺談linux性能調優之十一:內存分配管理