#!/usr/bin/env python
# coding: utf-8
import os
import re
def parserln(ln, patt):
    """Parse a single line with the given compiled regular expression.

    Args:
        ln: The text line to parse.
        patt: A compiled pattern (any object exposing ``match``). Matching
            is anchored at the start of ``ln``.

    Returns:
        A dict mapping the pattern's named groups to the matched text when
        the line matches, otherwise ``None``.
    """
    matched = patt.match(ln)
    if matched is None:
        # Non-matching lines yield None so callers can filter them out.
        return None
    return matched.groupdict()
def getdata(filename, parser, callback=None):
    """Parse a text file line by line and return the processed records.

    Every line of the file is passed to ``parser``. Lines for which it
    returns a falsy value (e.g. ``None`` for a non-matching line) are
    dropped; each remaining record is then passed through ``callback``.

    Args:
        filename: Path of the text file to read.
        parser: Callable taking one line and returning the parsed record,
            or a falsy value to skip that line.
        callback: Optional callable applied to every parsed record. When
            ``None`` the parsed records are returned unchanged.

    Returns:
        A list with the processed records, in file order.
    """
    with open(filename, 'rt') as handle:
        # Materialize the records while the file is still open: on
        # Python 3 map/filter are lazy and would only read the handle
        # after the ``with`` block had already closed it.
        records = [
            record
            for record in (parser(line) for line in handle)
            if record
        ]
    if callback is None:
        # map(None, ...) was the identity on Python 2 but raises TypeError
        # on Python 3, so handle the default explicitly.
        return records
    return [callback(record) for record in records]
def storage(filename, dataserial, spliter=','):
    """Write a series of rows to a text file, one row per line.

    Args:
        filename: Path of the file to write; it is opened in ``'wt'`` mode,
            so any existing content is replaced.
        dataserial: Iterable of rows, each row itself an iterable of
            values. Every value is converted with ``str``.
        spliter: Separator placed between the values of one row.
    """
    with open(filename, 'wt') as handle:
        # Stream the formatted lines instead of building a temporary list.
        handle.writelines(
            spliter.join(map(str, item)) + "\n"
            for item in dataserial
        )
if __name__ == "__main__":
    # One record per line: "<month>,<amount>,<usage>", optionally followed
    # by trailing whitespace. re.X lets the pattern span several lines.
    patt = re.compile(
        r"""^
        (?P<month>\d+),
        (?P<amount>\d+),
        (?P<usage>\d+)
        \s*$""",
        re.I | re.U | re.X)
    datapath = 'datasource'
    # Every sub-directory of datapath that contains a "usage.csv" file.
    # The listing is sorted because os.listdir returns entries in arbitrary
    # order, which would make the column order of the output unstable.
    candidates = [
        os.path.join(datapath, name)
        for name in sorted(os.listdir(datapath))
    ]
    subpaths = [
        path
        for path in candidates
        if os.path.isdir(path)
        and os.path.exists(os.path.join(path, "usage.csv"))
    ]
    # One list of integer amounts per sub-directory.
    columns = [
        getdata(
            os.path.join(path, "usage.csv"),
            # Parse each line with patt.
            parser=lambda ln: parserln(ln, patt),
            # Keep only the "amount" field, converted to an integer.
            callback=lambda x: int(x["amount"]),
        )
        for path in subpaths
    ]
    # zip(*columns) transposes the per-directory columns into rows, so each
    # output line holds the amounts found at the same line position in
    # every directory (truncated to the shortest column).
    storage('store.csv', zip(*columns))
$ tail -n 12 datasource/*/*.csv
==> datasource/2014/usage.csv <==
1,4234,423
2,3523,432
3,4352,438
4,4792,458
5,4823,834
6,5093,734
7,4743,832
8,5152,859
9,4932,810
10,4993,802
11,4999,810
12,5052,850
==> datasource/2015/usage.csv <==
1,5234,423
2,4523,432
3,5352,438
4,5792,458
5,6823,834
6,6093,734
7,6743,832
8,7152,859
9,6932,810
10,6993,802
11,6999,810
12,7052,850
(venv)tim@crunchbang:~/workspace/baidu$
$ cat store.csv
4234,5234
3523,4523
4352,5352
4792,5792
4823,6823
5093,6093
4743,6743
5152,7152
4932,6932
4993,6993
4999,6999
5052,7052
$