詳細的概念性解釋就不說了,如果對vmalloc沒有一點概念的話,可以稍微找些資料了解下,這裡主要就是分析下在內核中vmalloc的實現;
直接物理內存映射(內核邏輯地址)-- 8 MB -- vm -- 1 page -- vm -- 1page --vm ......
大概就是這樣:邏輯地址以high_memory為結束邊界;然後是 8MB 的空洞(主要是防止指針越界訪問);接著就是 VMALLOC_START為邊界 開始了vmalloc 區域,該區域有多個vm小區域組成,每個小區域之間有1頁(一個page大小)的空洞地址,作用還是防止越界訪問;結束是以VMALLOC_END,後面還有個空洞地址,接著最後就是固定映射和臨時映射的區域了;
結構體:
/*
 * Descriptor of one vmalloc-style kernel virtual area.
 * The fields below are annotated from how the functions in this
 * section use them; fields with no visible use are marked as such.
 */
struct vm_struct {
/* Link to the next descriptor (the original note says the list head is "vmlist"; the head is not visible in this section). */
struct vm_struct *next;
/* Start address of the area in kernel virtual address space; this is the value handed back to vmalloc() callers. */
void *addr;
/* Length of the area in bytes; includes the trailing guard page, which __vmalloc_area_node() subtracts before counting pages. */
unsigned long size;
/* VM_* flag bits, e.g. VM_ALLOC, VM_UNLIST, VM_VPAGES, VM_IOREMAP. */
unsigned long flags;
/* Array of page pointers, one entry per physical page backing the area. */
struct page **pages;
/* Number of valid entries in pages[]. */
unsigned int nr_pages;
/* Not referenced in this section; presumably the physical address for ioremap-style areas - TODO confirm. */
phys_addr_t phys_addr;
/* Return address of the code that requested the area, as passed down through the "caller" parameters. */
const void *caller;
};
這個結構體和進程虛擬地址空間的vma非常相似,值得注意;
下面這個結構體是用來管理內核虛擬地址(KVA,kernel virtual address)區間的
/*
 * One reserved range of kernel virtual addresses.
 * alloc_vmap_area() fills in va_start/va_end/flags and inserts the
 * object into the address-sorted rbtree and list.
 */
struct vmap_area {
/* First address of the range (checked to be aligned to the requested alignment). */
unsigned long va_start;
/* End of the range: va_start + size, i.e. one past the last byte. */
unsigned long va_end;
/* State bits; initialised to 0 by alloc_vmap_area(). */
unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct list_head purge_list; /* "lazy purge" list */
/* Presumably the vm_struct that owns this range; the assignment is not visible in this section - TODO confirm. */
struct vm_struct *vm;
/* Presumably used for RCU-deferred freeing; not referenced in this section - TODO confirm. */
struct rcu_head rcu_head;
};
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
{
return __vmalloc_node_flags(size, NUMA_NO_NODE,
GFP_KERNEL | __GFP_HIGHMEM);//從高內存分配
}
/*
 * Common helper for the vmalloc() entry points: allocate @size bytes on
 * @node using allocator flags @flags, with alignment 1 and PAGE_KERNEL
 * protection. The return address of the current function is captured
 * and passed down as the "caller" of the allocation.
 */
static inline void *__vmalloc_node_flags(unsigned long size,
					int node, gfp_t flags)
{
	const void *ret_addr = __builtin_return_address(0);

	return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, ret_addr);
}
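__builtin_return_address(0) 的含義是:得到當前函數的返回地址,也就是當前函數執行完畢後,回到調用者中繼續執行的那條指令的地址。__builtin_return_address(1) 的含義則是:得到當前函數的調用者的返回地址。注意兩者得到的都是返回地址,而不是函數的起始地址。另外,__vmalloc_node_flags() 是 static inline 函數,一般會被內聯展開到 vmalloc() 中,所以這裡取得的實際上是 vmalloc() 的返回地址,也就是調用 vmalloc() 的那段代碼的位置。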
/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size: allocation size
 * @align: desired alignment
 * @gfp_mask: flags for the page level allocator
 * @prot: protection mask for the allocated pages
 * @node: node to use for allocation or NUMA_NO_NODE
 * @caller: caller's return address
 *
 * Same as __vmalloc_node_range(), with the search window fixed to the
 * whole vmalloc region, VMALLOC_START to VMALLOC_END. Pages are taken
 * from the page level allocator using @gfp_mask and mapped into
 * contiguous kernel virtual space with a pagetable protection of @prot.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	void *mem;

	mem = __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				   gfp_mask, prot, node, caller);
	return mem;
}
這是個主要函數,說明下參數:
unsigned long size :表示要分配的內存大小;
unsigned long align:表示以什麼對齊,到這裡是 1;
unsigned long start:表示映射區域從什麼地方開始查找,這裡為:VMALLOC_START;
unsigned long end :表示映射區域從什麼地方結束查找,這裡為:VMALLOC_END;
gfp_t gfp_mask:表示分配的標識,這裡為:GFP_KERNEL | __GFP_HIGHMEM;
pgprot_t prot:表示區域的保護模式,這裡為:PAGE_KERNEL;
int node:表示分配節點,這裡為:-1;
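const void *caller:表示調用者的返回地址,這裡是在 __vmalloc_node_flags() 中通過 __builtin_return_address(0) 取得的;由於該函數是 static inline,一般會被內聯到 vmalloc() 裡,所以實際記錄的是調用 vmalloc() 的代碼位置,而不是 __vmalloc_node 的返回地址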
/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size: allocation size
 * @align: desired alignment
 * @start: vm area range start
 * @end: vm area range end
 * @gfp_mask: flags for the page level allocator
 * @prot: protection mask for the allocated pages
 * @node: node to use for allocation or NUMA_NO_NODE
 * @caller: caller's return address
 *
 * Rounds @size up to whole pages, reserves a virtual range between
 * @start and @end, then takes pages from the page level allocator with
 * @gfp_mask and maps them into that range using a pagetable protection
 * of @prot.
 *
 * Returns the start address of the mapping, or NULL on failure. A
 * warning is emitted here when the size is rejected or no virtual range
 * can be reserved; failures inside __vmalloc_area_node() are reported
 * by that function.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, int node, const void *caller)
{
	/* Keep the caller's byte count for diagnostics and kmemleak. */
	const unsigned long requested = size;
	struct vm_struct *vm;
	void *mapped;

	/* Mappings are built from whole pages. */
	size = PAGE_ALIGN(size);

	/* Reject empty requests and requests larger than all of RAM. */
	if (size == 0 || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;

	/* VM_UNLIST: the descriptor is not fully initialised yet. */
	vm = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
				start, end, node, gfp_mask, caller);
	if (vm == NULL)
		goto fail;

	mapped = __vmalloc_area_node(vm, gfp_mask, prot, node, caller);
	if (mapped == NULL)
		return NULL;

	/*
	 * The vm_struct was created with VM_UNLIST set because it was not
	 * fully initialised. It is complete now, so drop the flag.
	 */
	clear_vm_unlist(vm);

	/*
	 * A ref_count = 3 is needed because the vm_struct and vmap_area
	 * structures allocated in the __get_vm_area_node() function contain
	 * references to the virtual address of the vmalloc'ed block.
	 */
	kmemleak_alloc(mapped, requested, 3, gfp_mask);

	return mapped;

fail:
	warn_alloc_failed(gfp_mask, 0,
			  "vmalloc: allocation failure: %lu bytes\n",
			  requested);
	return NULL;
}
/*
 * Reserve a range of kernel virtual addresses and create the vm_struct
 * that describes it.
 *
 * @size:     bytes requested; rounded up to whole pages here
 * @align:    requested alignment (recomputed for VM_IOREMAP areas)
 * @flags:    VM_* flags for the new area
 * @start:    lower bound of the address window to search
 * @end:      upper bound of the address window to search
 * @node:     NUMA node used for the descriptor allocations
 * @gfp_mask: allocator flags; only the GFP_RECLAIM_MASK bits are used
 *            for the vm_struct itself
 * @caller:   return address of the requester, stored with the area
 *
 * Must not be called from interrupt context. Returns the new vm_struct,
 * or NULL if the size rounds to zero, the descriptor cannot be
 * allocated, or no virtual range is available.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vm_struct *vm;
	struct vmap_area *range;

	BUG_ON(in_interrupt());

	/*
	 * ioremap areas derive their alignment from the size, clamped to
	 * the [PAGE_SHIFT, IOREMAP_MAX_ORDER] range of bit positions.
	 */
	if (flags & VM_IOREMAP) {
		int order = fls(size);

		if (order > IOREMAP_MAX_ORDER)
			order = IOREMAP_MAX_ORDER;
		else if (order < PAGE_SHIFT)
			order = PAGE_SHIFT;

		align = 1ul << order;
	}

	size = PAGE_ALIGN(size);
	if (unlikely(size == 0))
		return NULL;

	vm = kzalloc_node(sizeof(*vm), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(vm == NULL))
		return NULL;

	/*
	 * We always allocate a guard page: one extra page of address space
	 * is reserved behind the area. __vmalloc_area_node() excludes it
	 * when counting the pages to back.
	 */
	range = alloc_vmap_area(size + PAGE_SIZE, align, start, end,
				node, gfp_mask);
	if (IS_ERR(range)) {
		kfree(vm);
		return NULL;
	}

	/*
	 * When called from __vmalloc_node_range, VM_UNLIST is set to avoid
	 * accessing members of vm_struct that are not initialised yet, such
	 * as pages and nr_pages. They are filled in later.
	 */
	if (flags & VM_UNLIST)
		setup_vmalloc_vm(vm, range, flags, caller);
	else
		insert_vmalloc_vm(vm, range, flags, caller);

	return vm;
}
下面是從指定的地址範圍(vstart 到 vend)中查找並分配一段空閒的內核虛擬地址區間(vmap_area)的函數:
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size must be non-zero and page aligned, @align must be a power of two
 * (all three are enforced with BUG_ON). @node and @gfp_mask are used for
 * allocating the vmap_area object itself.
 *
 * The search and the insertion run with vmap_area_lock held. If no hole
 * is found, lazily freed areas are purged once and the search is retried.
 *
 * Returns the new vmap_area, ERR_PTR(-ENOMEM) if the object cannot be
 * allocated, or ERR_PTR(-EBUSY) if no suitable hole exists even after
 * the purge.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
struct rb_node *n;
unsigned long addr;
int purged = 0;
struct vmap_area *first;
/* a zero-sized request is a caller bug */
BUG_ON(!size);
/* size must be a whole number of pages */
BUG_ON(size & ~PAGE_MASK);
/* the alignment (not the size) must be a power of two */
BUG_ON(!is_power_of_2(align));
/* allocate the descriptor before taking the lock */
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
retry:
spin_lock(&vmap_area_lock);
/*
* Invalidate cache if we have more permissive parameters.
* cached_hole_size notes the largest hole noticed _below_
* the vmap_area cached in free_vmap_cache: if size fits
* into that hole, we want to scan from vstart to reuse
* the hole instead of allocating above free_vmap_cache.
* Note that __free_vmap_area may update free_vmap_cache
* without updating cached_hole_size or cached_align.
*
* In short: the cached search position is discarded when it does not
* exist, or when this request is smaller, starts lower, or is less
* strictly aligned than what the cache was built for.
*/
if (!free_vmap_cache ||
size < cached_hole_size ||
vstart < cached_vstart ||
align < cached_align) {
/* also entered by goto from below when the cached area ends before vstart */
nocache:
cached_hole_size = 0;
free_vmap_cache = NULL;
}
/* record if we encounter less permissive parameters */
cached_vstart = vstart;
cached_align = align;
/* find starting point for our search */
if (free_vmap_cache) {
/* resume right behind the cached area */
first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
/* first candidate: aligned end of the cached area */
addr = ALIGN(first->va_end, align);
/* cached area lies below the requested window: drop the cache and search from vstart */
if (addr < vstart)
goto nocache;
/* addr + size wrapped around the address space */
if (addr + size < addr)
goto overflow;
} else {
/* no usable cache: start at the aligned bottom of the window */
addr = ALIGN(vstart, align);
if (addr + size < addr)
goto overflow;
n = vmap_area_root.rb_node;
first = NULL;
/*
* Walk the rbtree for the lowest area whose end is at or above addr.
* Such a node is remembered in "first"; the walk then continues to
* the left for a lower one, unless the node already contains addr.
*/
while (n) {
struct vmap_area *tmp;
tmp = rb_entry(n, struct vmap_area, rb_node);
/* this area ends at or above the candidate address */
if (tmp->va_end >= addr) {
first = tmp;
/* addr falls inside (or at the start of) this area: stop here */
if (tmp->va_start <= addr)
break;
/* look for a lower-addressed candidate */
n = n->rb_left;
} else
/* area lies entirely below addr: look higher */
n = n->rb_right;
}
/* every existing area ends below addr, so nothing can collide with [addr, addr + size) */
if (!first)
goto found;
}
/*
* from the starting point, walk areas until a suitable hole is found
*
* The loop continues while the candidate range would run into "first"
* and still ends at or below vend.
*/
while (addr + size > first->va_start && addr + size <= vend) {
/* remember the largest hole that was skipped as too small */
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
/* next candidate: aligned end of the area that was in the way */
addr = ALIGN(first->va_end, align);
if (addr + size < addr)
goto overflow;
/* no area above this one: only the vend check at "found" remains */
if (list_is_last(&first->list, &vmap_area_list))
goto found;
/* advance to the next area in address order */
first = list_entry(first->list.next,
struct vmap_area, list);
}
found:
/* the chosen range must end inside the requested window */
if (addr + size > vend)
goto overflow;
/* fill in the descriptor and publish it in the rbtree and list */
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
__insert_vmap_area(va);
/* the next search may resume behind this area */
free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
/* sanity checks on the result */
BUG_ON(va->va_start & (align-1));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
return va;
/* no hole found: purge lazily freed areas once and retry, then give up */
overflow:
spin_unlock(&vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
goto retry;
}
if (printk_ratelimit())
printk(KERN_WARNING
"vmap allocation for size %lu failed: "
"use vmalloc= to increase size.\n", size);
kfree(va);
return ERR_PTR(-EBUSY);
}
/*
 * Back an already reserved vm area with physical pages and map them.
 *
 * @area:     area returned by __get_vm_area_node(); its size includes
 *            the guard page
 * @gfp_mask: allocator flags for the data pages
 * @prot:     protection passed to map_vm_area()
 * @node:     NUMA node; a negative value means no node preference
 * @caller:   return address of the requester, stored in area->caller
 *
 * Returns area->addr on success and NULL on failure. On every failure
 * path @area has already been released (remove_vm_area() + kfree(), or
 * vfree()), so the caller must not touch it again.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, const void *caller)
{
const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
/* flags for the page-pointer array only: reclaim bits of gfp_mask plus zero-fill */
gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
/* pages to back: the area size minus the guard page */
nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
/* bytes needed for the array of page pointers */
array_size = (nr_pages * sizeof(struct page *));
/* provisional count; lowered below if a page allocation fails */
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
/* array larger than one page: allocate it through the vmalloc path (the bounded recursion) */
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, caller);
/* tells __vunmap() to release the array with vfree() instead of kfree() */
area->flags |= VM_VPAGES;
} else {
/* small array: kmalloc is sufficient */
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
/* record the requester's return address; what reads it is not visible in this section */
area->caller = caller;
if (!area->pages) {
/* no array, hence no pages yet: undo the address reservation and free the descriptor */
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
/* per-page failures are reported once at "fail", so silence the allocator */
gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
if (node < 0)
page = alloc_page(tmp_mask);
else
page = alloc_pages_node(node, tmp_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
/* remember the page so it can be mapped and later freed */
area->pages[i] = page;
}
/* map the collected pages into the area's virtual range with protection prot */
if (map_vm_area(area, prot, &pages))
goto fail;
return area->addr;
fail:
warn_alloc_failed(gfp_mask, order,
"vmalloc: allocation failure, allocated %ld of %ld bytes\n",
(area->nr_pages*PAGE_SIZE), area->size);
/* vfree() leads to __vunmap(), which frees the area->nr_pages pages, the array and the area */
vfree(area->addr);
return NULL;
}
-------------------------------釋放vmalloc分配的頁==vfree()-------------------------------------
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
*
* Free the virtually continuous memory area starting at @addr, as
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
* NULL, no operation is performed.
*
* Must not be called in NMI context (strictly speaking, only if we don't
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
* NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
*
*/
void vfree(const void *addr)
{
BUG_ON(in_nmi());
kmemleak_free(addr);//檢查內存洩漏函數
if (!addr)//簡單做下檢查
return;
if (unlikely(in_interrupt())) {
struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
llist_add((struct llist_node *)addr, &p->list);
schedule_work(&p->wq);
} else
__vunmap(addr, 1);
}
釋放的主要函數,vmalloc和其他虛擬映射的地址釋放也是調用該函數:參數是:addr和1
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
if (!addr)//NULL
return;
if ((PAGE_SIZE-1) & (unsigned long)addr) {//對齊檢查
WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
return;
}
area = remove_vm_area(addr);//釋放虛擬地址
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
debug_check_no_locks_freed(addr, area->size);
debug_check_no_obj_freed(addr, area->size);
if (deallocate_pages) {
int i;
for (i = 0; i < area->nr_pages; i++) {//釋放物理內存頁
struct page *page = area->pages[i];
BUG_ON(!page);
__free_page(page);
}
if (area->flags & VM_VPAGES)//如果pages是vmalloc分配的(數組大小大於一個page時)則用vfree釋放
vfree(area->pages);
else
kfree(area->pages);
}
kfree(area);
return;
}