Linux kernel md source code walkthrough, part 5: running a raid5 array

Date: 2017/3/3 16:17:34

If you understood the raid1 array's run function, then reading raid5's run is very easy, because the two do largely the same things.

The raid5 run function is long, but a large part of it has nothing to do with creating and starting the array; in particular there is a stretch dealing with reshape, a feature most systems never touch, so it can be skipped. After trimming, the run function looks like this:

5307 static int run(struct mddev *mddev)  
5308 {  
5309         struct r5conf *conf;  
5310         int working_disks = 0;  
5311         int dirty_parity_disks = 0;  
5312         struct md_rdev *rdev;  
5313         sector_t reshape_offset = 0;  
5314         int i;  
5315         long long min_offset_diff = 0;  
5316         int first = 1;  
...  
5426         if (mddev->private == NULL)  
5427                 conf = setup_conf(mddev);  
5428         else
5429                 conf = mddev->private;  
5430  
5431         if (IS_ERR(conf))  
5432                 return PTR_ERR(conf);  
5433  
5434         conf->min_offset_diff = min_offset_diff;  
5435         mddev->thread = conf->thread;  
5436         conf->thread = NULL;  
5437         mddev->private = conf;  
...  
5491         /* 
5492          * 0 for a fully functional array, 1 or 2 for a degraded array. 
5493          */
5494         mddev->degraded = calc_degraded(conf);  
...  
5503         /* device size must be a multiple of chunk size */
5504         mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);  
5505         mddev->resync_max_sectors = mddev->dev_sectors;  
...  
5556         md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));  
5557  
5558         if (mddev->queue) {  
...  
5628         }  
5629  
5630         return 0;

Doesn't this look really simple? Some things seem complicated on the surface, but once you analyse them carefully you find they follow a clear pattern. This run function does the same job as raid1's run: it builds the context in which reads and writes will be handled.

Line 5427: create the struct r5conf. Step into the function:

5131 static struct r5conf *setup_conf(struct mddev *mddev)  
5132 {  
5133         struct r5conf *conf;  
5134         int raid_disk, memory, max_disks;  
5135         struct md_rdev *rdev;  
5136         struct disk_info *disk;  
5137         char pers_name[6];  
5138  
5139         if (mddev->new_level != 5  
5140             && mddev->new_level != 4  
5141             && mddev->new_level != 6) {  
5142                 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",  
5143                        mdname(mddev), mddev->new_level);  
5144                 return ERR_PTR(-EIO);  
5145         }  
5146         if ((mddev->new_level == 5  
5147              && !algorithm_valid_raid5(mddev->new_layout)) ||  
5148             (mddev->new_level == 6  
5149              && !algorithm_valid_raid6(mddev->new_layout))) {  
5150                 printk(KERN_ERR "md/raid:%s: layout %d not supported\n",  
5151                        mdname(mddev), mddev->new_layout);  
5152                 return ERR_PTR(-EIO);  
5153         }  
5154         if (mddev->new_level == 6 && mddev->raid_disks < 4) {  
5155                 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",  
5156                        mdname(mddev), mddev->raid_disks);  
5157                 return ERR_PTR(-EINVAL);  
5158         }  
5159  
5160         if (!mddev->new_chunk_sectors ||  
5161             (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||  
5162             !is_power_of_2(mddev->new_chunk_sectors)) {  
5163                 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",  
5164                        mdname(mddev), mddev->new_chunk_sectors << 9);  
5165                 return ERR_PTR(-EINVAL);  
5166         }  
5167  
5168         conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);  
5169         if (conf == NULL)  
5170                 goto abort;  
5171         spin_lock_init(&conf->device_lock);  
5172         init_waitqueue_head(&conf->wait_for_stripe);  
5173         init_waitqueue_head(&conf->wait_for_overlap);  
5174         INIT_LIST_HEAD(&conf->handle_list);  
5175         INIT_LIST_HEAD(&conf->hold_list);  
5176         INIT_LIST_HEAD(&conf->delayed_list);  
5177         INIT_LIST_HEAD(&conf->bitmap_list);  
5178         INIT_LIST_HEAD(&conf->inactive_list);  
5179         atomic_set(&conf->active_stripes, 0);  
5180         atomic_set(&conf->preread_active_stripes, 0);  
5181         atomic_set(&conf->active_aligned_reads, 0);  
5182         conf->bypass_threshold = BYPASS_THRESHOLD;  
5183         conf->recovery_disabled = mddev->recovery_disabled - 1;  
5184  
5185         conf->raid_disks = mddev->raid_disks;  
5186         if (mddev->reshape_position == MaxSector)  
5187                 conf->previous_raid_disks = mddev->raid_disks;  
5188         else
5189                 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;  
5190         max_disks = max(conf->raid_disks, conf->previous_raid_disks);  
5191         conf->scribble_len = scribble_len(max_disks);  
5192  
5193         conf->disks = kzalloc(max_disks * sizeof(struct disk_info),  
5194                               GFP_KERNEL);  
5195         if (!conf->disks)  
5196                 goto abort;  
5197  
5198         conf->mddev = mddev;  
5199  
5200         if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)  
5201                 goto abort;  
5202  
5203         conf->level = mddev->new_level;  
5204         if (raid5_alloc_percpu(conf) != 0)  
5205                 goto abort;  
5206  
5207         pr_debug("raid456: run(%s) called.\n", mdname(mddev));  
5208  
5209         rdev_for_each(rdev, mddev) {  
5210                 raid_disk = rdev->raid_disk;  
5211                 if (raid_disk >= max_disks  
5212                     || raid_disk < 0)  
5213                         continue;  
5214                 disk = conf->disks + raid_disk;  
5215  
5216                 if (test_bit(Replacement, &rdev->flags)) {  
5217                         if (disk->replacement)  
5218                                 goto abort;  
5219                         disk->replacement = rdev;  
5220                 } else {  
5221                         if (disk->rdev)  
5222                                 goto abort;  
5223                         disk->rdev = rdev;  
5224                 }  
5225  
5226                 if (test_bit(In_sync, &rdev->flags)) {  
5227                         char b[BDEVNAME_SIZE];  
5228                         printk(KERN_INFO "md/raid:%s: device %s operational as raid"
5229                                " disk %d\n",  
5230                                mdname(mddev), bdevname(rdev->bdev, b), raid_disk);  
5231                 } else if (rdev->saved_raid_disk != raid_disk)  
5232                         /* Cannot rely on bitmap to complete recovery */
5233                         conf->fullsync = 1;  
5234         }  
5235  
5236         conf->chunk_sectors = mddev->new_chunk_sectors;  
5237         conf->level = mddev->new_level;  
5238         if (conf->level == 6)  
5239                 conf->max_degraded = 2;  
5240         else
5241                 conf->max_degraded = 1;  
5242         conf->algorithm = mddev->new_layout;  
5243         conf->max_nr_stripes = NR_STRIPES;  
5244         conf->reshape_progress = mddev->reshape_position;  
5245         if (conf->reshape_progress != MaxSector) {  
5246                 conf->prev_chunk_sectors = mddev->chunk_sectors;  
5247                 conf->prev_algo = mddev->layout;  
5248         }  
5249  
5250         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +  
5251                  max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;  
5252         if (grow_stripes(conf, conf->max_nr_stripes)) {  
5253                 printk(KERN_ERR
5254                        "md/raid:%s: couldn't allocate %dkB for buffers\n",
5255                        mdname(mddev), memory);  
5256                 goto abort;  
5257         } else
5258                 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",  
5259                        mdname(mddev), memory);  
5260  
5261         sprintf(pers_name, "raid%d", mddev->new_level);  
5262         conf->thread = md_register_thread(raid5d, mddev, pers_name);  
5263         if (!conf->thread) {  
5264                 printk(KERN_ERR  
5265                        "md/raid:%s: couldn't allocate thread.\n",  
5266                        mdname(mddev));  
5267                 goto abort;  
5268         }  
5269  
5270         return conf;

As before, this function is very similar to raid1's setup_conf.

Line 5139: check the array level; raid4, raid5 and raid6 are supported.

Line 5147: check that the raid5 layout is valid (and likewise for raid6 at line 5149).
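
For reference, the two validity helpers are simple range checks over the known layout numbers; this is roughly how they look in drivers/md/raid5.c of this kernel generation (the exact bounds may differ between versions):

static int algorithm_valid_raid5(int layout)
{
        /* left/right symmetric/asymmetric plus the parity-0/parity-n layouts */
        return (layout >= 0) && (layout <= 5);
}

static int algorithm_valid_raid6(int layout)
{
        /* the raid5-style layouts plus the raid6-specific variants */
        return (layout >= 0 && layout <= 5)
                ||
                (layout >= 8 && layout <= 10)
                ||
                (layout >= 16 && layout <= 20);
}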

Line 5160: check the array chunk size; it must be a whole multiple of the page size and a power of two.
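
As a standalone illustration (not kernel code, and assuming a 4KiB page size), the rule amounts to the following check:

#include <stdbool.h>

#define EXAMPLE_PAGE_SIZE 4096UL        /* assumed page size for this example */

/* new_chunk_sectors is counted in 512-byte sectors, so shifting left by 9
 * converts it to bytes, exactly as the kernel does at line 5161 */
static bool chunk_sectors_valid(unsigned long chunk_sectors)
{
        unsigned long bytes = chunk_sectors << 9;

        if (chunk_sectors == 0)
                return false;
        if (bytes % EXAMPLE_PAGE_SIZE)                  /* whole pages only */
                return false;
        if (chunk_sectors & (chunk_sectors - 1))        /* power of two */
                return false;
        return true;
}

/* chunk_sectors_valid(1024) -> true  (512KiB, the usual mdadm default)
 * chunk_sectors_valid(96)   -> false (48KiB is not a power of two)     */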

Line 5168: allocate the struct r5conf and initialize it.

Line 5185: set the number of member disks.

Line 5193: allocate the array of struct disk_info, which records the association with the member disks.

Line 5200: allocate the hash table holding struct stripe_heads, used to quickly find the stripe_head covering a given sector.
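
The table is a single page of hlist_heads. The lookup helper below is a sketch of how drivers/md/raid5.c indexes it in this kernel generation (details may differ between versions):

#define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK               (NR_HASH - 1)

/* the stripe's starting sector, shifted down to stripe granularity,
 * selects one of the NR_HASH buckets in the page allocated at 5200 */
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
        int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
        return &conf->stripe_hashtbl[hash];
}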

Lines 5209-5234: the key line is 5223, which links the struct disk_info to its struct md_rdev.
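
struct disk_info itself is minimal; in drivers/md/raid5.h of this era it is just the two slots this loop fills in (a working member and, optionally, its replacement):

struct disk_info {
        struct md_rdev  *rdev, *replacement;
};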

Line 5236: set the chunk size.

Line 5237: set the raid level.

Line 5241: set the maximum number of failed members the array can tolerate (2 for raid6, 1 for raid4/5).

Line 5252: allocate the struct stripe_head slab. Step into grow_stripes:

1501 static int grow_stripes(struct r5conf *conf, int num)  
1502 {  
1503         struct kmem_cache *sc;  
1504         int devs = max(conf->raid_disks, conf->previous_raid_disks);  
1505   
1506         if (conf->mddev->gendisk)  
1507                 sprintf(conf->cache_name[0],  
1508                         "raid%d-%s", conf->level, mdname(conf->mddev));  
1509         else
1510                 sprintf(conf->cache_name[0],  
1511                         "raid%d-%p", conf->level, conf->mddev);  
1512         sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);  
1513   
1514         conf->active_name = 0;  
1515         sc = kmem_cache_create(conf->cache_name[conf->active_name],  
1516                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),  
1517                                0, 0, NULL);  
1518         if (!sc)  
1519                 return 1;  
1520         conf->slab_cache = sc;  
1521         conf->pool_size = devs;  
1522         while (num--)  
1523                 if (!grow_one_stripe(conf))  
1524                         return 1;  
1525         return 0;  
1526 }

Line 1504: work out the number of member devices per stripe (the larger of the current and previous raid_disks).

Line 1506: build the slab cache name.

Line 1515: create the slab cache. The object size is sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev) because a struct stripe_head carries devs struct r5dev entries at its tail.
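
To see why the object is sized that way, here is an abridged sketch of struct stripe_head from drivers/md/raid5.h (most fields omitted): dev[] is declared with a single element, and the extra slab space makes dev[0..devs-1] all usable for this array's geometry.

struct stripe_head {
        struct hlist_node       hash;           /* bucket in conf->stripe_hashtbl */
        struct list_head        lru;            /* inactive_list or handle_list */
        struct r5conf           *raid_conf;
        sector_t                sector;         /* starting sector of this stripe */
        atomic_t                count;          /* active references */
        /* ... many fields omitted ... */
        struct r5dev {
                struct bio      req;
                struct bio_vec  vec;
                struct page     *page;          /* filled in by grow_buffers() */
                struct bio      *toread, *read, *towrite, *written;
                sector_t        sector;         /* sector of this page on the member disk */
                unsigned long   flags;
                /* ... replacement-device fields omitted ... */
        } dev[1];       /* allocated with extra room depending on the geometry */
};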

Line 1523: create the free struct stripe_heads. If this were just a plain allocation there would be no need to look inside, but hidden in this function is release_stripe, one of the most frequently called functions, so it is worth stepping into:

1477 static int grow_one_stripe(struct r5conf *conf)  
1478 {  
1479         struct stripe_head *sh;  
1480         sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);  
1481         if (!sh)  
1482                 return 0;  
1483  
1484         sh->raid_conf = conf;  
1485  
1486         spin_lock_init(&sh->stripe_lock);  
1487  
1488         if (grow_buffers(sh)) {  
1489                 shrink_buffers(sh);  
1490                 kmem_cache_free(conf->slab_cache, sh);  
1491                 return 0;  
1492         }  
1493         /* we just created an active stripe so... */
1494         atomic_set(&sh->count, 1);  
1495         atomic_inc(&conf->active_stripes);  
1496         INIT_LIST_HEAD(&sh->lru);  
1497         release_stripe(sh);  
1498         return 1;  
1499 }

 

Line 1480: allocate a new struct stripe_head.

Line 1484: link it to the struct r5conf.

Line 1488: grow_buffers allocates one page for each struct r5dev, used by the stripe_head for copying data pages and computing parity. The page pointers are stored in sh->dev[].page.
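
grow_buffers is short enough to quote in lightly annotated form (as it appears around this kernel version; the comments are added here):

static int grow_buffers(struct stripe_head *sh)
{
        int i;
        int num = sh->raid_conf->pool_size;     /* devs, set in grow_stripes() */

        for (i = 0; i < num; i++) {
                struct page *page;

                /* one page per member-device slot: it will hold copies of
                 * on-disk data and computed parity for this stripe */
                if (!(page = alloc_page(GFP_KERNEL)))
                        return 1;               /* caller shrinks and frees sh */
                sh->dev[i].page = page;
        }
        return 0;
}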

Line 1494: set the struct stripe_head reference count; it is decremented by release_stripe at line 1497.

Line 1495: increment the array's count of active stripes.

Line 1496: initialize the lru list head.

Line 1497: release the struct stripe_head, adding it to the free stripe list. release_stripe eventually reaches do_release_stripe, which executes the following lines:

228                         list_add_tail(&sh->lru, &conf->inactive_list);  
229                         wake_up(&conf->wait_for_stripe);  
230                         if (conf->retry_read_aligned)  
231                                 md_wakeup_thread(conf->mddev->thread);

Line 228: add the struct stripe_head to inactive_list, the free stripe list.

Line 229: wake up requests waiting for a free stripe. Each array's pool of struct stripe_heads is limited, so when none can be obtained the requester sleeps on this wait queue (a sketch of the function that later reclaims a stripe from this list follows at the end of this section).

Line 231: wake up the array's md thread so that any deferred chunk-aligned read can be retried.

Back in setup_conf, grow_stripes has by now allocated NR_STRIPES struct stripe_heads for the array.

Line 5262: create the raid5 main thread. With that, setup_conf is finished; return to the run function.

Lines 5434-5437: link conf and mddev together and assign the related fields.

Lines 5494-5556: assign the mddev-related fields.

Line 5558: initialization of the mddev request queue, the struct queue_limits settings, and so on.

To sum up, raid5's run function plays the same basic role as raid1's: it presents a virtual block device upwards, wraps the member disks downwards, and establishes the path along which read and write requests travel. The difference is that raid5 I/O is built on struct stripe_head, and the whole raid5 read/write path revolves around it.

The next section covers how a raid10 array is run.
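
As the closing reference promised above for line 228: the counterpart that later pulls a stripe back off inactive_list is get_free_stripe, sketched below from drivers/md/raid5.c of this generation (called with conf->device_lock held, from get_active_stripe; treat it as a sketch, details vary between versions):

static struct stripe_head *get_free_stripe(struct r5conf *conf)
{
        struct stripe_head *sh = NULL;
        struct list_head *first;

        if (list_empty(&conf->inactive_list))
                goto out;                       /* caller waits on wait_for_stripe */
        first = conf->inactive_list.next;
        sh = list_entry(first, struct stripe_head, lru);
        list_del_init(first);                   /* no longer free */
        remove_hash(sh);                        /* forget any previous sector */
        atomic_inc(&conf->active_stripes);
out:
        return sh;
}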

Source: http://blog.csdn.net/liumangxiong
