关于python处理文本的一个问题
比如我现在有一个文本
1|2|3|4|5
11|22|33|44|55
1|3|4|5|6|3
这样的行
我要找出某一列中取值相同的行,并对这些行去重。
这个该怎么处理呢?
[解决办法]
#-*- coding: utf-8 -*-
import re

a = '1|2|3|4|5'
b = '11|22|33|44|55'
c = '1|3|4|5|6|3'
ls = [a, b, c]

# Group 1: the leading run of digits (first column); group 2: rest of the line.
NUM_PATTERN = r'^(\d+)(.*)'


def getLines(dict):
    """Index the sample lines in ``ls`` by their leading number.

    For every line in the module-level list ``ls`` that starts with digits,
    store ``leading_number -> remainder_of_line`` in the mapping passed in.
    When several lines share the same leading number, the later line
    overwrites the earlier one, so each number ends up with one entry.

    Args:
        dict: a mutable mapping that is filled in place. The parameter name
            shadows the builtin; it is kept so existing callers keep working.

    Returns:
        None. The result is delivered through the mutated argument.
    """
    p = re.compile(NUM_PATTERN)
    for _l in ls:
        m = p.match(_l)
        if m is None:
            # A line without a leading number has no key; skip it instead of
            # failing with an IndexError.
            continue
        dict[m.group(1)] = m.group(2)


if __name__ == '__main__':
    # Use a local name here so the builtin ``dict`` is not rebound at
    # module level.
    lines_by_key = {}
    getLines(lines_by_key)
    print(lines_by_key)
[解决办法]
#!/usr/bin/env python
import re
import sys

# Leading run of digits, i.e. the value of the first column.
_LEADING_NUMBER = re.compile(r'^(\d+)')


def getLines(filename):
    """Print the first line seen for each distinct leading number.

    Reads ``filename`` line by line. A line that does not start with a digit
    is ignored. Among lines that share the same leading number only the
    first one is written to standard output, unchanged.

    Args:
        filename: path of the text file to read.

    Returns:
        None. Matching lines are written to ``sys.stdout``.
    """
    seen = set()
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(filename) as fd:
        for line in fd:
            mat = _LEADING_NUMBER.match(line)
            if mat is None:
                continue
            n = mat.group(1)
            if n in seen:
                continue
            seen.add(n)
            # The line already carries its newline, so write it as is.
            sys.stdout.write(line)


if __name__ == '__main__':
    getLines('a.txt')
[解决办法]
[code=Python][/code]
#! /usr/bin/env python
# -*- coding: cp936 -*-
'''这种结果是否正确'''
def noreplines(src):
    """Print the lines of ``src`` that repeat no earlier value in any column.

    ``src`` is split into lines on newlines and each line into columns on
    '|'. A line is dropped when, in at least one column, an earlier line
    already has the same value in that column. Earlier lines count even if
    they were dropped themselves. The surviving lines are printed in their
    original order.

    Rows may have different numbers of columns: a row that has no value in
    some column is simply not compared in that column.

    Args:
        src: the whole table as one string, rows separated by newlines.

    Returns:
        None. The surviving lines are printed to standard output.
    """
    lines = src.split('\n')
    rows = [line.split('|') for line in lines]
    # str.split always returns at least one item, so ``rows`` is never empty.
    width = max(len(row) for row in rows)
    kept = set(range(len(lines)))
    for col in range(width):
        seen = set()
        for index, row in enumerate(rows):
            if col >= len(row):
                # Short row: nothing to compare in this column.
                continue
            value = row[col]
            if value in seen:
                kept.discard(index)
            else:
                seen.add(value)
    # Sort so the output order does not depend on set iteration order.
    for index in sorted(kept):
        print(lines[index])
def main():
    """Run noreplines on a small built-in sample table."""
    # The continuation lines of the literal stay at column 0 on purpose:
    # any leading whitespace would become part of the first column.
    src = '''1|2|3|4
1|2|4|5
2|4|5|5
3|4|6|6
4|5|7|7
5|6|7|8'''
    noreplines(src)


if __name__ == '__main__':
    main()
[解决办法]
#! /usr/bin/env python
# -*- coding: cp936 -*-
'''Is this result correct?'''


def noreplines(src):
    """Print the lines of ``src`` that repeat no earlier value in any column.

    ``src`` is split into lines on newlines and each line into columns on
    '|'. A line is dropped when, in at least one column, an earlier line
    already has the same value in that column. Earlier lines count even if
    they were dropped themselves. The surviving lines are printed in their
    original order.

    Rows may have different numbers of columns: a row that has no value in
    some column is simply not compared in that column.

    Args:
        src: the whole table as one string, rows separated by newlines.

    Returns:
        None. The surviving lines are printed to standard output.
    """
    lines = src.split('\n')
    rows = [line.split('|') for line in lines]
    # str.split always returns at least one item, so ``rows`` is never empty.
    width = max(len(row) for row in rows)
    kept = set(range(len(lines)))
    for col in range(width):
        seen = set()
        for index, row in enumerate(rows):
            if col >= len(row):
                # Short row: nothing to compare in this column.
                continue
            value = row[col]
            if value in seen:
                kept.discard(index)
            else:
                seen.add(value)
    # Sort so the output order does not depend on set iteration order.
    for index in sorted(kept):
        print(lines[index])


def main():
    """Run noreplines on a small built-in sample table."""
    # One table row per line; the continuation lines stay at column 0 so no
    # whitespace leaks into the first column.
    src = '''1|2|3|4
1|2|4|5
2|4|5|5
3|4|6|6
4|5|7|7
5|6|7|8'''
    noreplines(src)


if __name__ == '__main__':
    main()
[解决办法]
$cat test.txt
1|2|3|4
1|2|4|5
2|4|5|5
3|4|6|6
4|5|7|7
5|6|7|8
1|2|3|4|5
11|22|33|44|55
1|3|4|5|6|3
#!/usr/bin/python
# encoding: utf-8
import sys


class Parser:
    """Filter a stream of lines, keeping the first line for each key.

    The key of a line is the text before the first occurrence of the
    separator. Keys are remembered on the instance, so calling the same
    parser on several streams also removes duplicates across them.
    """

    def __init__(self, spliter):
        """Create a parser.

        Args:
            spliter: the column separator string, e.g. '|'.
        """
        self.spliter = spliter
        # Keys that have already been yielded by this parser.
        self.cached = set()

    def __call__(self, stream):
        """Yield each line of ``stream`` whose key has not been seen yet.

        Args:
            stream: any iterable of text lines (an open file, a list, ...).

        Yields:
            The unchanged lines, in input order, skipping repeated keys.
        """
        for ln in stream:
            key, found, _ = ln.partition(self.spliter)
            if not found:
                # No separator: the whole line is the key. Drop the line
                # ending so 'x' and 'x\n' count as the same key.
                key = key.rstrip('\r\n')
            if key in self.cached:
                continue
            self.cached.add(key)
            yield ln


parser = Parser('|')

if __name__ == '__main__':
    # Only touch the file when run as a script, and close it when done.
    with open('test.txt') as stream:
        for ln in parser(stream):
            # Lines keep their own newline; print would add a second one.
            sys.stdout.write(ln)
[解决办法]
import sys


def unique_first_column(lines, sep='|'):
    """Yield the first line seen for each distinct first-column value.

    Args:
        lines: any iterable of text lines (an open file, a list, ...).
        sep: the column separator; the key of a line is the text before
            its first occurrence.

    Yields:
        The unchanged lines, in input order, skipping every line whose
        first-column value already appeared on an earlier line.
    """
    seen = set()
    for line in lines:
        key, found, _ = line.partition(sep)
        if not found:
            # No separator: the whole line is the key, minus its line ending.
            key = key.rstrip('\r\n')
        if key in seen:
            continue
        seen.add(key)
        yield line


if __name__ == '__main__':
    # A single forward pass keeps file order without any sort key, which
    # avoids the tied order indexes of the reversed-dict approach.
    with open('test.txt', 'r') as f:
        for line in unique_first_column(f):
            # Lines keep their own newline; print would add a second one.
            sys.stdout.write(line)