diff --git a/README.md b/README.md
index 299bf69..e5011b0 100644
--- a/README.md
+++ b/README.md
@@ -18,32 +18,32 @@ Usage
 Run `ttv --help` to get help, or infer what you can from one of these examples:
 
 # Split CSV file into two sets of a fixed number of rows
- $ ttv split data.csv --rows=train=9000 --rows=test=1000 --uncompressed-input
+ $ ttv split data.csv --rows=train=9000 --rows=test=1000
 
 # Accepts gzipped data (no flag required). Shorthand argument version. As many splits as you like!
- $ ttv split data.csv.gz --rows=train=65000,validation=15000,test=15000
+ $ ttv split data.csv.gz --rows=train=65000,validation=15000,test=15000 -d
 
- # Alternatively, specify proportion-based splits. -u is shorthand for --uncompressed-input
- $ ttv split data.csv --props=train=0.8,test=0.2 -u
+ # Alternatively, specify proportion-based splits.
+ $ ttv split data.csv --prop=train=0.8,test=0.2
 
 # When using proportions, include the total rows to get a progress bar
- $ ttv split data.csv --props=train=0.8,test=0.2 --total-rows=1234 -u
+ $ ttv split data.csv --prop=train=0.8,test=0.2 --total-rows=1234
 
 # Accepts data from stdin, compressed or not (must give a filename)
- $ cat data.csv | ttv split --rows=test=10000,train=90000 --output-prefix data -u
- $ cat data.csv.gz | ttv split --rows=test=10000,train=90000 --output-prefix data
+ $ cat data.csv | ttv split --rows=test=10000,train=90000 --output-prefix data
+ $ cat data.csv.gz | ttv split --rows=test=10000,train=90000 --output-prefix data -d
 
 # Using pigz for faster decompression
- $ pigz -dc data.csv.gz | ttv split --prop=test=0.1,train=0.9 --chunk-size 5000 --output-prefix data -u
+ $ pigz -dc data.csv.gz | ttv split --prop=test=0.1,train=0.9 --chunk-size 5000 --output-prefix data
 
 # Split outputs into chunks for faster writing/reading later
- $ ttv split data.csv.gz --rows=test=100000,train=900000 --chunk-size 5000
+ $ ttv split data.csv.gz --rows=test=100000,train=900000 --chunk-size 5000 -d
 
 # Write outputs uncompressed
- $ ttv split data.csv.gz --prop=test=0.5,train=0.5 --uncompressed-output
+ $ ttv split data.csv.gz --prop=test=0.5,train=0.5 -d
 
 # Reproducible splits using seed
- $ ttv split data.csv.gz --prop=test=0.5,train=0.5 --chunk-size 1000 --seed 5330
+ $ ttv split data.csv.gz --prop=test=0.5,train=0.5 --chunk-size 1000 --seed 5330 -d
 
 Development
 -----------