- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- #dataformat.py
- # This script converts data from the source layout to the destination data format.
- # 2011-08-05 created, version 0.1
- # 2011-10-29 added row-to-row mapping and default row values; rebuilt all functions. version 0.2
- # next: add automatic data generation from regular expressions
-
- import os,getopt,sys
-
def read_file(path):
    """Read a text file and return all of its lines.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line, exactly as produced by
        ``readlines()`` (each line keeps its trailing newline).

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when reading fails.
    with open(path, "r") as src:
        return src.readlines()
-
def one_line_proc(parts, total, ft_map, outsp, empty_fill):
    """Convert the fields of one input line into one output line.

    Args:
        parts: Fields of the input line, already split.
        total: Number of fields in the output line; output fields are
            numbered 1..total.
        ft_map: Maps a 1-based output field number to a fill spec (a string).
            A spec starting with "d" is a literal default: the text after the
            "d" is written. Any other spec is the 1-based number of the input
            field to copy.
        outsp: Separator placed between output fields.
        empty_fill: Text written for output fields that have no entry in
            ``ft_map``.

    Returns:
        The output line, without a trailing newline.

    Raises:
        IndexError: If a spec refers to an input field that does not exist
            (below 1 or beyond ``len(parts)``).
        ValueError: If a non-default spec is not an integer.
    """
    fields = []
    for position in range(1, total + 1):
        if position not in ft_map:
            fields.append(empty_fill)
            continue
        spec = ft_map[position]
        if spec.startswith("d"):
            # Literal default value: everything after the "d" marker.
            fields.append(spec[1:])
            continue
        column = int(spec)
        if column < 1:
            # Without this guard a 0/negative column would wrap around via
            # negative indexing and silently copy the wrong field.
            raise IndexError("input field number must be >= 1, got %d" % column)
        fields.append(parts[column - 1])
    return outsp.join(fields)
-
def _parse_field_spec(spec):
    """Split one field spec into ``(target, kind, value)``.

    Recognized forms (``N`` is the 1-based output field number):
        "N=text" -> (N, "default", "text")  literal default value
        "N:M"    -> (N, "column", "M")      copy input column M
        "N"      -> (N, "auto", None)       copy the next unused input column

    "=" is tested before ":" so that a default value containing a colon
    (for example "3=12:30") is not mistaken for a column mapping.

    Raises:
        ValueError: If a number in the spec is not an integer, or if an
            explicit input column is below 1.
    """
    text = str(spec)
    for sep, kind in (("=", "default"), (":", "column")):
        pieces = text.split(sep)
        if len(pieces) == 2:
            target, value = pieces
            if kind == "column":
                column = int(value)
                if column < 1:
                    raise ValueError("source column must be >= 1 in %r" % text)
                value = str(column)
            return int(target), kind, value
    return int(text), "auto", None


def process(inpath, total, to, outpath, insp=" ", outsp=" ", empty_fill=""):
    """Convert every line of ``inpath`` to the target layout and write ``outpath``.

    Args:
        inpath: Path of the input file.
        total: Number of fields in each output line.
        to: List of field specs. Each item is "N" (fill output field N from
            the next unused input column), "N:M" (fill output field N from
            input column M) or "N=text" (fill output field N with the literal
            ``text``). All numbers are 1-based.
        outpath: Path of the output file (overwritten).
        insp: Field separator of the input file.
        outsp: Field separator of the output file.
        empty_fill: Text written to output fields that no spec covers.

    Input lines with too few fields for the requested mapping are skipped.

    Raises:
        ValueError: If a spec in ``to`` is malformed.
        IOError / OSError: If a file cannot be read or written.
    """
    # NOTE(review): the usage text printed by this script advertises a tab as
    # the default for both separators, while the defaults here are a single
    # space -- confirm which one is intended.
    specs = [_parse_field_spec(item) for item in to]

    # Input columns claimed explicitly ("N:M") are reserved first so that the
    # automatic assignment below never hands them out again.
    used_columns = set(int(value) for _, kind, value in specs if kind == "column")
    # Every spec that is not a literal default consumes an input field.
    in_count = sum(1 for _, kind, _ in specs if kind != "default")

    ft_map = {}
    next_free = 1
    highest_column = 0
    for target, kind, value in specs:
        if kind == "default":
            # one_line_proc recognizes literal defaults by the "d" prefix.
            ft_map[target] = "d" + value
            continue
        if kind == "column":
            column = int(value)
        else:
            # Lowest input column not used yet (no fixed upper bound).
            while next_free in used_columns:
                next_free += 1
            column = next_free
            used_columns.add(column)
        ft_map[target] = str(column)
        highest_column = max(highest_column, column)

    # A line is usable only if it has every column the mapping refers to;
    # in_count is kept as a lower bound so that no line is processed that the
    # plain spec-count check would have skipped.
    required = max(in_count, highest_column)

    result = []
    for line in read_file(inpath):
        parts = line.strip("\n").split(insp)
        if len(parts) >= required:
            result.append(
                one_line_proc(parts, total, ft_map, outsp, empty_fill) + "\n"
            )

    with open(outpath, "w") as dst:
        dst.writelines(result)
-
def help_msg():
    """Print the command-line usage text and exit with status 0."""
    # Single-quoted literals are used where the text itself contains double
    # quotes; "\\t" prints the two characters \t, which is what the user has
    # to type on the command line for a tab separator.
    usage = (
        "功能:原数据文件转为目标数据格式",
        "选项:",
        " -i inputfilepath [必输,原文件路径]",
        " -t n [必输,n为数字,目标数据总的域个数]",
        ' -a "1,3,4" [必输,域编号字符串,逗号分隔。指定域用原数据字段填充,未指定用 -f 的值填充]',
        " -f fill [可选,未指定域的填充值,默认为空]",
        " -o outputfilepath [可选,默认为 inputfilepath.dist ]",
        ' -F "FS" [可选,原文件域分隔符,默认为\\t ]',
        ' -P "OFS" [可选,输出文件的域分隔符,默认为\\t ]',
    )
    for text in usage:
        print(text)
    sys.exit(0)
-
def _unescape_separator(text):
    """Interpret backslash escapes (such as ``\\t``) typed on the command line."""
    if isinstance(text, bytes):
        # Python 2: command-line arguments are byte strings.
        return text.decode("string_escape")
    # Python 3: str has no decode(); go through bytes so unicode_escape can
    # expand the escapes while non-ASCII characters survive unchanged.
    return text.encode("latin-1", "backslashreplace").decode("unicode_escape")


def main():
    """Program entry point: parse the command line, validate it and convert.

    Options:
        -i PATH   input file (required)
        -t N      number of fields in the output (required, integer)
        -a SPECS  comma-separated field specs (required)
        -o PATH   output file (default: input path + ".dist")
        -F SEP    input separator, backslash escapes allowed
        -P SEP    output separator, backslash escapes allowed
        -f TEXT   fill text for unassigned output fields (default: empty)
        -h, -H, --help  print usage and exit

    Prints a message and exits with status 1 when the options cannot be
    parsed, a required option is missing, ``-t`` is not an integer or the
    input file does not exist.
    """
    prog = sys.argv[0]

    try:
        # "H" and "help" are registered so that the -H / --help spellings
        # handled below are actually accepted by getopt.
        opts, _ = getopt.getopt(sys.argv[1:], "F:P:t:a:i:o:f:hH", ["help"])
    except getopt.GetoptError:
        print(prog + " : params are not defined well!")
        sys.exit(1)

    inpath = outpath = total = to = None
    empty_fill = ""
    # Only separators given explicitly are passed on, so that process()
    # keeps its own defaults for the others.
    separators = {}

    for op, value in opts:
        if op in ("-h", "-H", "--help"):
            help_msg()  # does not return
        elif op == "-i":
            inpath = value
        elif op == "-o":
            outpath = value
        elif op == "-t":
            try:
                total = int(value)
            except ValueError:
                print(prog + " : -t param must be an integer!")
                sys.exit(1)
        elif op == "-a":
            to = value.split(",")
        elif op == "-F":
            separators["insp"] = _unescape_separator(value)
        elif op == "-P":
            separators["outsp"] = _unescape_separator(value)
        elif op == "-f":
            empty_fill = value

    if len(opts) < 3:
        print(sys.argv[0]+" : the amount of params must great equal than 3")
        sys.exit(1)

    if inpath is None:
        print(sys.argv[0]+" : -i param is needed,input file path must define!")
        sys.exit(1)

    if total is None:
        print(sys.argv[0]+" : -t param is needed,the fields of result file must define!")
        sys.exit(1)

    if to is None:
        print(sys.argv[0]+" : -a param is needed,must assign the field to put !")
        sys.exit(1)

    if not os.path.exists(inpath):
        print(sys.argv[0]+" file : %s is not exists"%inpath)
        sys.exit(1)

    if outpath is None:
        outpath = inpath + ".dist"

    process(inpath, total, to, outpath, empty_fill=empty_fill, **separators)
-
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|