歡迎來到Linux教程網
Linux教程網
Linux教程網
Linux教程網
Linux教程網 >> Linux編程 >> Linux編程 >> Python dataformat.py通用數據格式轉化腳本

Python dataformat.py通用數據格式轉化腳本

日期:2017/3/1 10:49:02   编辑:Linux編程

需求:在進行Hadoop測試時,需要造大量數據。例如某個表存在56列,但實際程序邏輯只使用到其中某幾列,因此我們造的數據也只需要提供這幾列。

構造幾列數據,轉化為對應數據表格式

涉及模塊:os,getopt,sys

輸入:源格式,文本文件

輸出:目標格式,文本文件

  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. #dataformat.py
  4. #This script converts data from your source layout to the destination data format.
  5. #2011-08-05 created version 0.1
  6. #2011-10-29 added column-to-column mapping and default column values; rebuilt all functions. version 0.2
  7. #next: add automatic data generation from regular expressions
  8. import os,getopt,sys
  9. #讀入文件,返回所有行
  10. def read_file(path):
  11. f = open(path, "r")
  12. lines = f.readlines()
  13. f.close()
  14. return lines
  15. #處理一行,轉為目標格式,返回目標行
  16. def one_line_proc(parts, total, ft_map, outsp, empty_fill):
  17. toindex = 0
  18. outline = ""
  19. keys = ft_map.keys()
  20. for i in range(1, total+1):
  21. if i in keys:
  22. fill_index = ft_map[i]
  23. if fill_index.startswith("d"):
  24. outline += fill_index[1:]
  25. else:
  26. outline += parts[int(fill_index)-1]
  27. else:
  28. outline += empty_fill
  29. if i !=total:
  30. outline += outsp
  31. #TODO:加入使用默認值列 若是以d開頭,後面是默認,否則取文件對應列 done
  32. #TODO:這裡根據這個判斷長度也需要換掉 done
  33. return outline
  34. #處理入口,讀文件,循環處理每一行,寫出
  35. #輸入數據分隔符默認\t,輸出數據默認分隔符\t
  36. def process(inpath, total, to, outpath, insp="\t", outsp="\t", empty_fill=""):
  37. #TODO:這裡將to轉為映射格式 done
  38. ft_map = {}
  39. in_count = 0
  40. used_row = []
  41. for to_row in to:
  42. if r"\:" not in to_row and len(to_row.split(":"))==2:
  43. used_row.append(int(to_row.split(":")[1]))
  44. if r"\=" not in str(to_row) and len(str(to_row).split("="))==2:
  45. pass
  46. else:
  47. in_count += 1
  48. for to_row in to:
  49. if r"\=" not in str(to_row) and len(str(to_row).split("="))==2:
  50. ft_map.update({int(to_row.split("=")[0]):"d"+to_row.split("=")[1]})
  51. continue
  52. elif r"\:" not in to_row and len(to_row.split(":"))==2:
  53. ft_map.update({int(to_row.split(":")[0]):to_row.split(":")[1]})
  54. continue
  55. else:
  56. to_index = 0
  57. for i in range(1, 100):
  58. if i not in used_row:
  59. to_index = i
  60. break
  61. ft_map.update({int(to_row):str(to_index)})
  62. used_row.append(to_index)
  63. lines = read_file(inpath)
  64. f = open(outpath,"w")
  65. result=[]
  66. for line in lines:
  67. parts = line.strip("\n").split(insp)
  68. #TODO:這裡判斷長度必須換掉 done
  69. if len(parts) >= in_count:
  70. outline = one_line_proc(parts, total, ft_map, outsp, empty_fill)
  71. result.append(outline+"\n")
  72. f.writelines(result)
  73. f.close()
  74. #打印幫助信息
  75. def help_msg():
  76. print("功能:原數據文件轉為目標數據格式")
  77. print("選項:")
  78. print("\t -i inputfilepath [必輸,原文件路徑]")
  79. print("\t -t n [必輸,n為數字,目標數據總的域個數]")
  80. print("\t -a '1,3,4' [必輸,域編號字符串,逗號分隔。指定域用原數據字段填充,未指定用'0'填充]")
  81. print("\t -o outputfilepath [可選,默認為 inputfilepath.dist ]")
  82. print("\t -F 'FS' [可選,原文件域分隔符,默認為\\t ]")
  83. print("\t -P 'OFS' [可選,輸出文件的域分隔符,默認為\\t ]")
  84. sys.exit(0)
  85. #程序入口,讀入參數,執行
  86. def main():
  87. try:
  88. opts,args = getopt.getopt(sys.argv[1:],"F:P:t:a:i:o:f:h")
  89. for op,value in opts:
  90. if op in ("-h","-H","--help"):
  91. help_msg()
  92. if op == "-i":
  93. inpath = value
  94. elif op == "-o":
  95. outpath = value
  96. elif op == "-t":
  97. total = int(value)
  98. elif op == "-a":
  99. to = value.split(",")
  100. elif op == "-F":
  101. insp = value.decode("string_escape")
  102. elif op == "-P":
  103. outsp = value.decode("string_escape")
  104. elif op == "-f":
  105. empty_fill = value
  106. #考慮下這邊放在神馬地方合適
  107. if len(opts) < 3:
  108. print(sys.argv[0]+" : the amount of params must great equal than 3")
  109. sys.exit(1)
  110. except getopt.GetoptError:
  111. print(sys.argv[0]+" : params are not defined well!")
  112. if 'inpath' not in dir():
  113. print(sys.argv[0]+" : -i param is needed,input file path must define!")
  114. sys.exit(1)
  115. if 'total' not in dir():
  116. print(sys.argv[0]+" : -t param is needed,the fields of result file must define!")
  117. sys.exit(1)
  118. if 'to' not in dir():
  119. print(sys.argv[0]+" : -a param is needed,must assign the field to put !")
  120. sys.exit(1)
  121. if not os.path.exists(inpath):
  122. print(sys.argv[0]+" file : %s is not exists"%inpath)
  123. sys.exit(1)
  124. if 'empty_fill' not in dir():
  125. empty_fill = ''
  126. tmp=[]
  127. for st in to:
  128. tmp.append(str(st))
  129. to=tmp
  130. if 'outpath' not in dir():
  131. outpath = inpath+".dist"
  132. if 'insp' in dir() and 'outsp' in dir():
  133. process(inpath,total,to,outpath,insp,outsp,empty_fill=empty_fill)
  134. elif 'insp' in dir():
  135. process(inpath,total,to,outpath,insp,empty_fill=empty_fill)
  136. elif 'outsp' in dir():
  137. process(inpath,total,to,outpath,outsp=outsp,empty_fill=empty_fill)
  138. else:
  139. process(inpath,total,to,outpath,empty_fill=empty_fill)
  140. if __name__ =="__main__":
  141. main()

Copyright © Linux教程網 All Rights Reserved