Linux教程網 >> Linux基礎 >> Linux教程 >> Project 1-3: 鏈接分析之鏈接統計

Project 1-3: 鏈接分析之鏈接統計

日期：2017/2/28 16:27:18 编辑：Linux教程

現在我們組分析源碼和統計分析鏈接的工作正在同步進行，稍後還會有分析源碼和統計分析鏈接的進度報告發布。

本文說的是如何解析鏈接關係，供統計分析之用。

一句話——人生苦短，我用Python。

基本工作原理是遍歷 mirror 下面的網頁，用正則表達式解析出鏈接地址，然後輸出鏈接關係。
最後得到的文件可以作為下一個程序的輸入，以統計網頁出度入度和計算 PR 值。

以下是源碼：

1 # coding: utf-8
2 #
3
4 import os, re
5
# Directory of the mirrored www.ccer.pku.edu.cn pages that will be scanned.
rootdir = '/home/xxx/workspace/heritrix/jobs/ccer-20100930010817713/mirror/www.ccer.pku.edu.cn'

# Edge-list output: one "<source index> <target index>" pair per line.
# The third argument requests a write buffer of roughly 4 MB.
dotfile = open('links.data', 'w', 4096000)

# Number of edges written to ``dotfile`` so far.
count = 0
# URL table: the position of a URL in this list is the index used in the
# edge list.
urllist = []
12
# Maps every URL stored in ``urllist`` to its position there, so lookups do
# not have to scan the list.  It stays in sync as long as ``urllist`` is
# only ever extended through ``append2list``.
_url_positions = {}


def append2list(url):
    """Return the index of *url* in ``urllist``, registering it if new.

    A URL seen for the first time is appended to the module-level
    ``urllist``; a URL seen before keeps the index it was first given.

    url -- the URL string to look up or register.
    """
    index = _url_positions.get(url)
    if index is None:
        index = len(urllist)
        urllist.append(url)
        _url_positions[url] = index
    return index
17
# Captures an href value as (opening quote, target, closing quote); either
# quote may be absent, in which case its group is the empty string.
_HREF_RE = re.compile(r'''href=('|")?([^\s'"><()]+)(\1?)''')

# Host name used to cut the mirror directory path down to a URL path.
_SITE_HOST = 'www.ccer.pku.edu.cn'

# Link targets that are not pages: other schemes and static/binary files.
_SKIPPED_PREFIXES = ('mailto:', 'javascript')
_SKIPPED_SUFFIXES = (
    '.css', '.jpg', '.bmp', '.jpeg', '.ico', '.gif', '.pdf',
    '.ppt', '.doc', '.xls', '.pptx', '.docx', '.xlsx', '.zip', '.png',
)


def extract(dirr, name):
    """Write one edge per usable href found in the page ``dirr/name``.

    The page's own URL is rebuilt from the part of *dirr* that starts at
    the site host name.  Every href that survives the filters below is
    registered through ``append2list`` and written to ``dotfile`` as a
    "<source index> <target index>" line; the module-level ``count`` is
    incremented per edge and printed every 10000 edges as progress.

    dirr -- directory that contains the page.
    name -- file name of the page inside *dirr*.
    """
    global count

    base = 'http://' + dirr[dirr.find(_SITE_HOST):] + '/'

    # ``with`` guarantees the handle is released; the original never
    # closed it.
    with open(dirr + '/' + name, 'r') as f:
        curindex = append2list(base + name)
        html = f.read()

    for opening, target, closing in _HREF_RE.findall(html):
        if (opening != closing                      # unbalanced quotes
                or target in ('#', './')
                or target.startswith(_SKIPPED_PREFIXES)
                or target.endswith(_SKIPPED_SUFFIXES)):
            continue

        realref = target
        if not realref.startswith('http'):  # relative link
            if '.asp?' in realref:
                # Rewrite "page.asp?query" to "pagequery.asp", the file
                # name used on disk.
                realref = realref.replace('.asp?', '', 1) + '.asp'
            realref = base + realref

        refindex = append2list(realref)
        dotfile.write('%d %d\n' % (curindex, refindex))
        count += 1
        if count % 10000 == 0:
            print(count)
61
def filter(dummy, dirr, filess):
    """Directory-visit callback: run ``extract`` on every page in *dirr*.

    The name shadows the ``filter`` builtin; it is kept because the
    traversal code in this file refers to the callback by this name.

    dummy  -- unused extra argument supplied by the directory walker.
    dirr   -- directory currently being visited.
    filess -- names of the entries inside *dirr*.
    """
    for name in filess:
        extension = os.path.splitext(name)[1]
        # Only real files with a page extension are parsed; directories
        # that happen to carry such an extension are skipped by isfile().
        if extension in ('.asp', '.htm', '.html') and os.path.isfile(dirr + '/' + name):
            extract(dirr, name)
66
# Visit every directory below ``rootdir`` and hand its entries to ``filter``.
# os.walk replaces os.path.walk, which is deprecated and no longer exists in
# Python 3.  ``filter`` re-checks each name with os.path.isfile, so passing
# only the file names is sufficient.
try:
    for dirpath, _dirnames, filenames in os.walk(rootdir):
        filter(None, dirpath, filenames)
finally:
    # Flush and close the edge list even if the traversal fails part-way.
    dotfile.close()

# Dump the URL table: line i (0-based) holds the URL whose index is i.
urlfile = open('linkindex.txt', 'w', 4096000)
try:
    for url in urllist:
        urlfile.write(url + '\n')
finally:
    urlfile.close()