-
Notifications
You must be signed in to change notification settings - Fork 54
/
split.py
60 lines (42 loc) · 1.5 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
'''
Randomly split a file into two output files, line by line.
'''
import argparse
import sys
import random
def build_parser():
    '''
    Build the command-line parser for the splitter.

    Defines three positional paths (the input file and the two output
    files) and the -p/--probability, -r/--random_seed, -s/--skip_headers
    and -c/--copy_headers options.
    '''
    parser = argparse.ArgumentParser(
        description="Split a file into two randomly, line by line.")
    parser.add_argument("input_file", help="path to an input file")
    parser.add_argument("output_file1", help="path to the first output file")
    parser.add_argument("output_file2", help="path to the second output file")
    parser.add_argument(
        "-p", "--probability",
        help="probability of writing to the first file (default 0.9)",
        default=0.9, type=float)
    parser.add_argument(
        "-r", "--random_seed", help="random seed", default=False)
    parser.add_argument(
        "-s", "--skip_headers", help="skip the header line",
        default=False, action='store_true')
    parser.add_argument(
        "-c", "--copy_headers",
        help="copy the header line to both output files",
        default=False, action='store_true')
    return parser


def split_lines(lines, first_out, second_out, probability,
                report_every=100000):
    '''
    Distribute each item of `lines` to one of two writable objects.

    One number is drawn from random.random() per line; the line goes to
    `second_out` when the draw is greater than `probability`, otherwise
    to `first_out`.

    lines        -- iterable of lines (an open file works)
    first_out    -- object with write(), receives lines with draw <= probability
    second_out   -- object with write(), receives the remaining lines
    probability  -- threshold the random draw is compared against
    report_every -- print the running line count each time it reaches a
                    multiple of this value; 0 or None disables the output

    Returns the number of lines processed.
    '''
    processed = 0
    for line in lines:
        # Exactly one draw per line keeps a seeded run reproducible.
        if random.random() > probability:
            second_out.write(line)
        else:
            first_out.write(line)

        processed += 1
        if report_every and processed % report_every == 0:
            # Single-argument call form prints identically on Python 2 and 3.
            print(processed)
    return processed


def main(argv=None):
    '''
    Command-line entry point.

    argv -- list of argument strings; None makes argparse read sys.argv[1:].

    Exits through parser.error() (usage message on stderr, status 2) when
    both --skip_headers and --copy_headers are given.
    '''
    parser = build_parser()
    args = parser.parse_args(argv)

    # Validate before opening anything: opening the outputs with 'wb'
    # would create or truncate them even for an invalid invocation.
    if args.skip_headers and args.copy_headers:
        parser.error("You can either skip or copy headers, not both.")

    if args.random_seed:
        random.seed(args.random_seed)

    # Binary mode on both sides: lines are copied through byte-for-byte and
    # the bytes read can be written to the 'wb' outputs on Python 3 as well.
    # The with-statement closes all three files even if an error occurs.
    with open(args.input_file, 'rb') as source, \
            open(args.output_file1, 'wb') as first_out, \
            open(args.output_file2, 'wb') as second_out:
        if args.skip_headers:
            source.readline()
        elif args.copy_headers:
            headers = source.readline()
            first_out.write(headers)
            second_out.write(headers)

        split_lines(source, first_out, second_out, args.probability)


if __name__ == "__main__":
    main()