# Path to your oh-my-zsh installation.
export ZSH=~/.oh-my-zsh
# Set name of the theme to load.
# Look in ~/.oh-my-zsh/themes/
# Optionally, if you set this to "random", it'll load a random theme each
# time that oh-my-zsh is loaded.
ZSH_THEME="robbyrussell"
# Uncomment the following line to use case-sensitive completion.
# CASE_SENSITIVE="true"
# Uncomment the following line to use hyphen-insensitive completion. Case
# sensitive completion must be off. _ and - will be interchangeable.
# HYPHEN_INSENSITIVE="true"
# Uncomment the following line to disable bi-weekly auto-update checks.
# DISABLE_AUTO_UPDATE="true"
# Uncomment the following line to change how often to auto-update (in days).
# export UPDATE_ZSH_DAYS=13
# Uncomment the following line to disable colors in ls.
# DISABLE_LS_COLORS="true"
# Uncomment the following line to disable auto-setting terminal title.
# DISABLE_AUTO_TITLE="true"
# Uncomment the following line to enable command auto-correction.
# ENABLE_CORRECTION="true"
# Uncomment the following line to display red dots whilst waiting for completion.
# COMPLETION_WAITING_DOTS="true"
# Uncomment the following line if you want to disable marking untracked files
# under VCS as dirty. This makes repository status check for large repositories
# much, much faster.
# DISABLE_UNTRACKED_FILES_DIRTY="true"
# Uncomment the following line if you want to change the command execution time
# stamp shown in the history command output.
# The optional three formats: "mm/dd/yyyy"|"dd.mm.yyyy"|"yyyy-mm-dd"
# HIST_STAMPS="mm/dd/yyyy"
# Would you like to use another custom folder than $ZSH/custom?
# ZSH_CUSTOM=/path/to/new-custom-folder
# Which plugins would you like to load? (plugins can be found in ~/.oh-my-zsh/plugins/*)
# Custom plugins may be added to ~/.oh-my-zsh/custom/plugins/
# Example format: plugins=(rails git textmate ruby lighthouse)
# Add wisely, as too many plugins slow down shell startup.
plugins=(git)
# User configuration
export PATH="/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/usr/sbin:$HOME/bin"
# export MANPATH="/usr/local/man:$MANPATH"
source $ZSH/oh-my-zsh.sh
# You may need to manually set your language environment
# export LANG=en_US.UTF-8
# Preferred editor for local and remote sessions
# if [[ -n $SSH_CONNECTION ]]; then
# export EDITOR='vim'
# else
# export EDITOR='mvim'
# fi
# Compilation flags
# export ARCHFLAGS="-arch x86_64"
# ssh
# export SSH_KEY_PATH="~/.ssh/dsa_id"
# Set personal aliases, overriding those provided by oh-my-zsh libs,
# plugins, and themes. Aliases can be placed here, though oh-my-zsh
# users are encouraged to define aliases within the ZSH_CUSTOM folder.
# For a full list of active aliases, run `alias`.
#
# Example aliases
# alias zshconfig="mate ~/.zshrc"
# alias ohmyzsh="mate ~/.oh-my-zsh"
#########
# set vim keybindings
bindkey -v
# get reverse history search
bindkey '^R' history-incremental-search-backward
HISTSIZE=100000
SAVEHIST=100000
# share history, eg across tmux windows
setopt share_history
# set vim as default editor
export EDITOR=vim
# print name of most recently changed (ctime, per ls -c) file in dir, wrapped in double quotes
function latest { ls -c $1 | sed -n '1p' | sed 's:^:\":g;s:$:\":g'; }
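# example use (hypothetical dir): latest ~/Downloads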
# print the most recent _n_ executable files under $PATH directories
# if no _n_ is given, it uses an _n_ of 1
# example "latest-exec 3"
function latest-exec { n=$1; if [[ -z $n ]]; then n=1; fi; find $(echo $PATH | tr ':' '\n') -type f -executable -exec ls -tc {} + 2>/dev/null | head -n $n; }
# list oldest files over *n* MB under current dir
# note that MB count must be integer
function listold { find . -size +$1M -printf "%p\t%k\t%TY-%Tm-%Td\n" | sort -t"$(printf '\t')" -k3,3 -n | awk -F'\t' '{print $1}'; }
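# example use (lists files over 100 MB, oldest first): listold 100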
# list _probable_ duplicates of the _n_ largest files under current dir, sort by size descending
# _probable_ duplicates have the same name and are the same size
# great for finding directories that are near identical and hidden in TB of data
# NB: maybedups does not run any checksums and cannot guarantee that files are actually duplicates!! it is especially bad with small files
# example: maybedups 1000 | csvlook -t | vim -
# TODO: add IDs by filename/kb size combo
# use mawk with tab delimiter for input and output
# NB: defined before the functions that use it, since zsh expands aliases when a function is parsed
alias tawk='mawk -F "\t" -v OFS="\t"'
function maybedups {
# choose how many of the biggest files to consider
n=$1
basenameSizePath_tsv=$(
# find the size of all files under current dir, in kb
find . -type f -exec du -ak {} + |\
sort -rn |\
# keep only the _n_ biggest
head -n $n |\
# make a TSV of file basename,size in kb,path relative to execution dir
tawk '{path=$2;gsub(/^.*[\/]/,"",$2);print $2,$1,path}'
)
basenameSize_dups=$(
echo "$basenameSizePath_tsv" |\
# consider just the file basenames and sizes in kb
tawk '{print $1,$2}' |\
sort |\
# count the unique file basenames and sizes in kb
uniq -c |\
# remove leading whitespace and make sure counts are first column in a new TSV
sed 's:^\s\+\([0-9]\+\s\+\):\1\t:g' |\
# if the basename/size pair occurs more than once among the considered files, print a TSV of file basename, size in kb
tawk '{if($1 > 1)print $2,$3}'
)
# if no possible duplicates were found, print error message and quit
if [[ -z "$basenameSize_dups" ]]; then
echo "ERROR: no near-duplicates were found based on basename and file size" 1>&2
else
# if possible duplicates were found, pull out paths relative to the execution dir
grep -Ff <(echo "$basenameSize_dups" | sed 's:^\|$:\t:g') <(echo "$basenameSizePath_tsv" | sed 's:^:\t:g') |\
sed 's:^\t::g'|\
# sort both by size and filename
# NB: '\t' as tab delimiter makes sort unhappy so we use echo - ridiculous!
sort -t"$(echo -e "\t")" -k 2,2rn -k 1,1d|\
# add header to output TSV
sed '1i\name\tsize_in_KB\tpath'
fi
}
# copy stdin to clipboard
# like Mac's pbcopy
alias clipboard="xclip -selection clipboard -i"
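# example use (hypothetical file): cat notes.txt | clipboard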
# show processes using ports
# like nethogs but small and simple
# credit goes to http://www.commandlinefu.com/commands/by/edo
function netpiglets {
lsof -P -i -n | cut -f 1 -d " "| uniq | tail -n +2
}
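# example use (sort -u catches non-adjacent repeats that plain uniq misses): netpiglets | sort -u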
# subset pdf by page number
# user args: 1) input pdf, 2) hyphen separated page range, 3) output pdf
# example use pdf_subset in.pdf 1-10 out.pdf
function pdf_subset { pdftk A=$1 cat A$2 output $3; }
# cat TSV to this and output github flavored markdown table
function tsv2githubmd { in=$(cat); col_count=$(echo "$in" | awk -F'\t' '{print NF}' | sed -n '1p' ); second_line=$( yes -- --- | head -n $col_count | tr '\n' '|' | sed 's:|$::g' ); echo "$in" | sed "1 a\\$second_line" | sed 's:\t:|:g'; };
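# example use (hypothetical TSV): cat foo.tsv | tsv2githubmd > table.md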
# GNU parallel why you no use GNU by default?
alias parallel='parallel --gnu'
# use mawk with pipe delimiter for input and output
alias pawk='mawk -F "|" -v OFS="|"'
# use mawk with comma delimiter for input and output
alias cawk='mawk -F "," -v OFS=","'
# print numbered header of a TSV
alias theader="head -n 1 | tr '\t' '\n' | nl -ba | sed 's:^[ \t]\+::g;s:[ \t]\+$::g'"
# print numbered header of a pipe-delimited file
alias pheader="head -n 1 | tr '|' '\n' | nl -ba | sed 's:^[ \t]\+::g;s:[ \t]\+$::g'"
# print numbered header of a CSV
alias cheader="head -n 1 | tr ',' '\n' | nl -ba | sed 's:^[ \t]\+::g;s:[ \t]\+$::g'"
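# example use (hypothetical TSV): cat foo.tsv | theader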
# convert Gnumeric-recognized tables to TSV
# more generic than csvformat from csvkit but slower, eg works on xlsx
# requires gnumeric's ssconvert
# NB: some character encoding issues
function table2tsv {
# NB: the separator option value below is a literal tab character
ssconvert --export-type Gnumeric_stf:stf_assistant -O 'separator="	"' fd://0 fd://1 2>/dev/null
}
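# example use (hypothetical spreadsheet): table2tsv < foo.xlsx > foo.tsv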
# convert TSV to CSV - uses csvkit, assumes input is TSV
# also assumes max field size of no more than 1m characters
# ssconvert does not have these limit flags but it is much slower
function table2csv { csvformat -t -z 1000000 $1;}
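# example use (hypothetical TSV): table2csv foo.tsv > foo.csv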
# print frequency of unique entries descending
# keeps header in place
function sortfreq {
in=$(cat)
header=$(echo "$in" | head -n 1)
echo "$in" |\
sed '1d' |\
sort |\
uniq -c |\
sort -k1 -rn |\
sed 's:^[ \t]\+::g;s:[ \t]\+$::g;s:^\([0-9]\+\) :\1\t:g' |\
sed "1 icount\t${header}"
}
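# example use (frequency of values in field 2 of a hypothetical TSV): cat foo.tsv | tawk '{print $2}' | sortfreq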
# return the count of non alpha / non digit characters sorted descending
# this only works well with English - should replace with UTF8 friendly funky_chars
function funky_chars { sed 's:\(.\):\1\n:g' | sort | uniq -c | sort -k1 -rn | tr -d '[:alpha:]' | awk '{if($2 !~ /^$/ && $2 !~ /[0-9]/)print $0}' ;}
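# example use (hypothetical file): cat foo.txt | funky_chars | head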
# trim leading and trailing whitespace
alias trim="sed 's:^[ \t]\+::g;s:[ \t]\+$::g'"
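# example use: echo "  padded  " | trim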
# round to nearest user arg decimal
# eg 'cat foo.csv | round 0' to get whole numbers, and 'cat foo.csv | round 1' to round to first decimal place
function round { awk "{printf \"%3.$1f\n\", \$1}"; }
# sum a column in awk. don't use sci notation
# uses STDIN
function sumawk { awk '{ sum += $1 } END { printf "%.4f\n", sum }' ; }
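# example use (sum field 3 of a hypothetical TSV, skipping header): cat foo.tsv | tawk '{print $3}' | sed '1d' | sumawk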
# get records from a txt that have text in columns beyond what they should have
# input txt comes from STDIN; user arg: 1) number of columns the file should have
function col_extra {
intsv=$(cat)
echo "$intsv" | tawk "{if(NF > $1)print \$0}"
}
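# example use (flag records with more than 12 columns in a hypothetical TSV): cat foo.tsv | col_extra 12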
# swap position of two columns in a TSV
# user args: 1) first col, 2) second col
# input TSV comes from STDIN
function col_swap { mawk -F'\t' " { OFS=\"\t\"; t = \$${1}; \$${1} = \$${2}; \$${2} = t; print; } "; }
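# example use (swap fields 1 and 3 of a hypothetical TSV): cat foo.tsv | col_swap 1 3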
# sort a TSV according to column names, using an arbitrary sort flag, eg -d or -n
# example use: col_sort -n in.tsv
# NB: relies on table2tsv, csvjoin, mawk
# NB: would be nice to use STDIN as input
function col_sort {
intsv=$2
cols_order=$(
cat $intsv |\
head -n 1 |\
tr "\t" "\n" |\
nl -ba |\
sed 's:^[ \t]\+::g;s:[ \t]\+$::g'
)
cols_sorted=$(
echo "$cols_order" |\
mawk -F'\t' '{print $2}'|\
sort $1 |\
nl -ba |\
sed 's:^[ \t]\+::g;s:[ \t]\+$::g'
)
cols_order=$(
echo -e "col_num\tcol_name\n$cols_order"
)
cols_sorted=$(
echo -e "col_num_new\tcol_name\n$cols_sorted"
)
awk_print_order=$(
csvjoin -t -c2,2 <( echo "$cols_order" ) <( echo "$cols_sorted" )|\
cut -d, --complement -f2,4|\
table2tsv |\
sort -k2,2 -n|\
cut -f1|\
sed '1d'|\
sed 's:^:$:g'|\
tr '\n' ','|\
sed 's:,$::g'
)
awk -F"\t" "{OFS=\"\t\";print $awk_print_order}" $intsv
}
# make a string for awk-style column printing ( eg "$1,$2" ) from a list of numbers
alias awkcols="sed 's:^:$:g'|tr '\n' ','| sed 's:,$::g'"
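# example use: printf '1\n3\n5\n' | awkcols   # -> $1,$3,$5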
# list all tables and fields in a psql db, tables in left col and their fields in right col, tab separated
function psql_listcols { echo "\d" | psql -d $1 | mawk -F'|' '{print $2}' | sed '1,3d' | grep -vE "^$" | parallel 'echo copy \(select \* from {} limit 0\) to stdout\ with csv header\; | psql -d '$1' | tr "," "\n" | sed "s:^:{}:g" | sed "s:^[ \t]\+::g;s:[ \t]\+$::g;s:[ \t]\+:\t:g"'; }
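# example use (hypothetical db): psql_listcols foo_db | less -S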
# find all files in working dir with a given extension
# essentially recursive "ls *.foo"
# example use: find_ext shp
function find_ext { find . -type f -iregex ".*[.]$1$"; }
# URL encode
function url_encode { perl -MURI::Escape -e 'print uri_escape(<STDIN>); print "\n";';}
# URL decode
function url_decode { perl -MURI::Escape -e 'print uri_unescape(<STDIN>); print "\n";';}
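# example use: printf "foo bar" | url_encode   # -> foo%20bar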
# HTML encode
# still getting 'wide character in print' message
function html_encode { perl -Mutf8 -MHTML::Entities -ne 'print encode_entities($_)'; }
# HTML decode
# still getting 'wide character in print' message
function html_decode { perl -Mutf8 -MHTML::Entities -ne 'print decode_entities($_)'; }
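# example use: echo "fish & chips" | html_encode   # -> fish &amp; chips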
# open TSV with libreoffice calc
# libre is stupid
function libretsv { if [[ $( echo "$1" | grep -oE "[^.]*$" ) = "tsv" ]]; then libreoffice --calc $1; fi ;}
# use GNUplot to plot one or two columns of values, inspired by jeroenjanssens
# https://github.com/jeroenjanssens/data-science-at-the-command-line/blob/master/tools/dumbplot
# assumes tab separated, removes header if found, assumes plot type points unless user supplies argument
# example: cat foo.tsv | cut -f3,4 | dumbplot
# example: cat foo.tsv | cut -f3 | dumbplot lines
function dumbplot {
# hard coded plot size in characters - please change according to taste!
size="110 30"
# if user does not supply plot type, assumes points
if [[ -z $1 ]]; then
plotType=points
else
plotType=$1
fi
in=$(cat)
if [[ -n $( echo "$in" | head -n 1 | grep -vE "[0-9.-]+" ) ]]; then
echo "$in" |\
# first line appears to be a header because it has characters other than numbers, hyphens, and periods. remove it.
sed '1d' |\
gnuplot -e "set term dumb size $size; set datafile separator \"\t\"; plot \"-\" with $plotType"
else
# keep first line - does not appear to be a header
echo "$in" |\
gnuplot -e "set term dumb size $size; set datafile separator \"\t\"; plot \"-\" with $plotType"
fi
}
# write SQL to join an arbitrary number of tables to each other, given that they have a field of the same name to join on
# use double quoted list for table names
# example use: joinmany_psql "1 2 3 4" example_id_field "full outer join" | psql -d foo_db
function joinmany_psql {
# ksh emulation gives 0-based arrays, which the loop below assumes
emulate -L ksh
field=$2
tables=($(echo "$1"))
num_elements=$( expr ${#tables[@]} - 1)
for n in $(seq 0 $num_elements)
do
if [[ $n -eq $num_elements ]]
then
false
elif [[ $n -eq 0 ]]
then
echo "\"${tables[$n]}\" $3 \"${tables[$( expr $n + 1 )]}\" USING (\"$2\")"
unset tables[$n]
else
echo "$3 \"${tables[$( expr $n + 1 )]}\" USING (\"$2\")"
unset tables[$n]
fi
done | tr '\n' ' ' | sed "s:^:SELECT * FROM :g" | sed "s:^:COPY (:g;s:$:) TO STDOUT WITH DELIMITER E'\t' CSV HEADER:g"
}
# join arbitrary number of TSVs or CSVs
# first imports to psql with txt2pgsql.pl, then uses function joinmany_psql
# NB: user must have postgre permissions to createdb and dropdb
# TODO: double quotes get quoted - just fix
# NB: must be executed in same directory as input files - ouch
# user args: 1) double quoted list of TSVs / CSVs, 2) name of field to join on, 3) join type, 4) "csv" or "tsv" (default tsv)
# example use: joinmany_csv "foo.csv bar.csv 1.csv" "example field" "full outer join" csv
function joinmany_csv {
# define list of TSVs (${=1} forces word splitting in zsh)
tsvs=( ${=1} )
# create a tmp pgsql db
tmpdb=$(mktemp)
createdb $tmpdb
# load up your list of TSVs into the tmpdb
# if there is a 4th user arg, check for "csv" or "tsv"
# if not, assume TSV
if [[ $# -ge 4 && "$4" == "csv" ]]; then
d=","
else
# if not csv, assume tsv
d="\t"
fi
for i in "${tsvs[@]}"
do
txt2pgsql.pl -i $i -d "$d" -t "TEXT" -p $tmpdb | sh &>/dev/null
done
# now run the SQL that joinmany_psql writes against the tmp db
joinmany_psql "$1" "$2" "$3" | psql -d $tmpdb
dropdb $tmpdb
}
# write SQLite to join an arbitrary number of CSVs, given that they have a field of the same name to join on
# NB: because SQLite lacks FULL OUTER JOIN, this is not nearly as useful as joinmany_psql, though this uses CSVs directly which is convenient
# user args: 1) double quoted list of CSVs to join, with full path, 2) id field all will join on, 3) join type (just LEFT or INNER for SQLite), 4) export type - csv or tabs
# example use: joinmany_sqlite "/tmp/1 /tmp/2 /tmp/3 /tmp/4" example_id_field left csv
function joinmany_sqlite {
# ksh emulation gives 0-based arrays and word splitting, which the code below assumes
emulate -L ksh
field=$2
tables=( $1 )
# import each with rgrp's csv2sqlite.py
# use a sqlite db under tmp
# prefix "a_" to table names to allow for numeric table names
tmpdb=$(mktemp --suffix .sqlite)
# NB: csv2sqlite.py needs full path to input CSV
for n in "${tables[@]}"
do
csv2sqlite.py $n $tmpdb a_$(basename $n )
done
# change the tables array to use the new SQLite table names from the csv2sqlite.py import
tables=(
$(
echo "${tables[@]}" |\
tr ' ' '\n' |\
xargs -I '{}' basename {} |\
sed 's:^:a_:g'
)
)
# get the number of tables to join
num_elements=$( expr ${#tables[@]} - 1 )
# now print SQL to get all tables to join according to user specified join type, on user specified field
sqlite_string=$(
for n in $(seq 0 $num_elements)
do
# if the number of tables is reached, do nothing
if [[ $n -eq $num_elements ]]; then
false
# if the number of tables is not reached, join the current table to the next table using the user specified join type
elif [[ $n -eq 0 ]]; then
echo "\"${tables[$n]}\" $3 JOIN \"${tables[$( expr $n + 1 )]}\" USING ($2)"
unset tables[$n]
else
echo "$3 JOIN \"${tables[$( expr $n + 1 )]}\" USING ($2)"
unset tables[$n]
fi
done |\
tr '\n' ' ' |\
# prepend SELECT and terminate the statement with a semicolon
sed "s:^:SELECT * FROM :g;s:$:\;:g"
)
# make tmpsql file for commands to live
tmpsql=$(mktemp)
# use SQLite dialect for exporting as csv or tab, print header
echo -e ".mode $4\n.header on\n" > $tmpsql
echo "$sqlite_string" >> $tmpsql
# NB: not sure why, but saving to file and then pipe works but piping directly does not!
cat $tmpsql | sqlite3 $tmpdb
# cleanup tmp files
rm $tmpdb $tmpsql
}
# get a count of unique entries in every field in a TSV
function uniqvals {
intsv=$(cat)
header=$(echo "$intsv" | head -n 1)
nfields=$( echo "$header" | tr '\t' '\n' | wc -l )
for field in $(seq 1 $nfields)
do
echo "$intsv" | sed '1d' | mawk -F'\t' "{print \$$field}" | sort | uniq -c | sort -k1 -rn > /tmp/${field}
done
echo "$header"
paste -d'\t' $(seq 1 $nfields | sed 's:^:/tmp/:g')
}
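# example use (hypothetical TSV): cat foo.tsv | uniqvals | less -S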
# get unique values for a single field - fast, does not sort
# NB: keeps header in place
# example: cat foo.tsv | c 1 | unique
function unique {
mawk '!x[$0]++'
}
# the following two functions create ngrams - length is chosen by the user
# the first - rawgrams - provides a simplified word list, with no punctuation or standalone runs of numbers, all lowercase, breaks on whitespace
# the second - ngrams - uses rawgrams to make a set of ngrams for the input doc
# ngrams respects newlines as doc breaks - meaning it is meant for columns of text where each record is considered a doc
# it also requires that user specified gram length be respected, meaning that partial ngrams will be ignored
# some quirky stuff - it considers punctuation to be the start of a new term, except for apostrophes which it ignores - this is because of english possessive
# example use to find trigrams by frequency from the second column of a TSV, ignoring header: cat foo.tsv | tawk '{ print $2 }' | sed '1d' | ngrams 3 | sortfreq
# TODO: ensure header is not part of ngram
function rawgrams {
# remove apostrophes because of english possessive
tr -d "'" |\
# convert punctuation to whitespace - this breaks new terms on punctuation
tr '[:punct:]' ' ' |\
# convert newline to colon - remember, there are no colons now that we removed punct
tr '\n' ':' |\
# convert colons to ' : ' - we provide the whitespace so they will be considered terms
# this helps us split "documents" on newline
sed 's:\:: \: :g' |\
# convert whitespace to newline
# this determines what is counted as a term
tr ' ' '\n' |\
# remove totally blank records
grep -vE "^$" |\
# remove records that are nothing but runs of numbers
# notice that we keep records that may contain numbers in their terms, for example '1st'
grep -vE "^[0-9]+$" |\
# convert all terms to lowercase
awk '{ print tolower($0) }'
}
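# example use: echo "Don't panic, World 42!" | rawgrams   # -> dont / panic / world / : (one term per line; standalone numbers dropped, newlines become ":")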
function ngrams {
# pipe STDIN through the rawgrams function to get terms
raw=$( cat | rawgrams )
# for every integer "n" between 1 and the user specified number of terms for the ngram,
# remove the first "n-1" terms and write to file ( note that writing to static filenames prevents using this in parallel ).
# then these files are pasted together.
# note that no terms are ignored for the first pass because the first term must be included!
paste $(
for n in $( seq 1 $1 )
do
if [[ $n -gt 1 ]]; then
d=$(($n-1))
echo "$raw" |\
sed "1,${d}d" > /tmp/$n
else
echo "$raw" > /tmp/$n
fi
echo /tmp/$n
done
) |\
# remove standalone colons - these were our standins for newline so we could consider newline a separator for new docs
grep -oE "[^:]+" |\
# remove tabs with no term next to them - this is to clean up after ngrams that were partly composed of ' : '
sed 's:^\t::g;s:\t$::g;s:\t\+:\t:g' |\
# enforce the user input gram term count - before this the ngrams could be shorter than user specification if the document was too short
awk -v gramlength=$1 '{ if ( NF == gramlength ) print $0 }'
}
# use shuf to randomly take _n_ records
# note that shuf is much faster than sort -R, and that specifying the number of records saves us from running wc each time
# example: cat foo.txt | samplekh 100
function samplekh {
# save whole input
in=$( cat )
# grab header
header=$( echo "$in" | head -n 1 )
echo "$in" |\
# remove header before sampling
sed '1d' |\
# take a random sample of _n_ records with shuf, using the user arg for count
shuf -n $1 |\
# slap header back on
sed "1 i$header"
}
# function to make ID field - 1-based unique integers, one per record
# hope no field is named "id"!
function mkid {
in=$(cat)
# saves header
header=$(echo "$in" | head -n 1)
echo "$in" |\
# removes header
sed '1d' |\
# NB: -ba numbers even blank lines
nl -ba |\
# trims leading whitespace, inserts tab delimiter
sed 's:^\s\+::g;s:^\([0-9]\+\) :\1\t:g' |\
# adds back header
sed "1 iid\t${header}"
}
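# example use (hypothetical TSV): cat foo.tsv | mkid > foo_with_id.tsv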
# convert every record of a TSV into a redis hash using the first field as the hash name field
# NB: may want to clean up quotes, definitely want an ID field
# example use: cat foo.tsv | tsv2redis && echo "hgetall 1" | redis-cli --pipe
function tsv2redis {
in=$(cat)
# get just the header
header=$(echo "$in" | head -n 1 )
# prepping for awk, get the name of every field followed by the field number
for_awk=$(
echo "$header" |\
sed 's:\t:\n:g' |\
sed 's:^\|$:":g' |\
nl |\
trim |\
sed 's:^:\$:g' |\
tawk '{print $2,$1}' |\
tr '\t' ',' |\
tr '\n' ',' |\
sed 's:,$::g' |\
sed 's:"\+:":g'
)
# prints the field names, then their contents, for each record, eg "FirstName Janez LastName Novak"
toCountBytes=$(
echo "$in" |\
sed '1d' |\
tawk "{print $for_awk}"
)
# write Redis RESP bulk string
echo "$toCountBytes" |\
# for each element of the hash, count the number of bytes and prepend a $.
# except at the beginning we must specify number of elements and that we are defining a hash with HMSET
# NB: LANG=C is needed for awk to use bytes for length() instead of characters
# RESP bulk strings take the form "*2\r\n$3\r\nfoo\r\n$3\r\nbar\r\n"
LANG=C tawk '{OFS="\n"; print "*"NF,"$5","HMSET"; for(i=2;i<=NF;i++)print "$"length($i),$i}' |\
# append carriage return to end line
# NB: sed using forward slash delimiters allow for special whitespace characters!
sed 's/$/\r/' |\
# actually pipe to redis server - assumes default port
redis-cli --pipe
}
# sort TSV fields using UNIX sort options in double quotes - keeps header!
# example sorting by field 2: cat foo.tsv | sortkh "-k2 -rn"
# the double quoted sort flags are word-split with ${=1} before being passed to sort
function sortkh {
in=$(cat)
header=$(echo "$in" | head -n 1)
echo "$in" |\
sed '1d' |\
# sort with user options in double quotes, word-split; -t gets a literal tab
sort ${=1} -t"$(printf '\t')" |\
sed "1 i$header"
}
# use [Rio](https://github.com/jeroenjanssens/data-science-at-the-command-line) to make a bar chart of TSV from STDIN - for example output of sortfreq
# NB: prints PNG to STDOUT so may want to pipe to "feh -" or redirect to file
# example use: cat foo.tsv | tawk '{print $2}' | sortfreq | plotbars year count "title" 20 | feh -ZF -
# note that sortkh is used to get the lowest value on the top of the graph, which will flip when ggplot's coord_flip() is used
# note that the only needed user args are x and y - you can ignore title, title size, and dimensions for faster graphing, ideal for piping to "feh -"
# high res example use: cat foo.tsv | tawk '{print $2}' | sortfreq | sortkh "-k1 -n" | plotbars year count "title" 20 6 9 > foo.png
function plotbars {
x=$1
y=$2
# get vars for axis titles - keeps original text
x_axis_title=$1
y_axis_title=$2
# variable names lose spaces, any quotes must be escaped
x=$(echo "$x" | sed 's: :.:g;s:"::g')
y=$(echo "$y" | sed 's: :.:g;s:"::g')
titleText=$3
# default title size so the R call still parses when only x and y are given
titleSize=${4:-12}
height=$5
width=$6
if [[ "$#" -lt 5 ]]; then
# if only first user args are used, then
# sets no res/dimensions - smaller res but much faster
Rio -d'\t' -ge "df\$$x <- factor(df\$$x, levels=unique(df\$$x));ggplot(df,aes(y=$y,x=$x)) + geom_bar(stat=\"identity\") + coord_flip() + labs(title=\"$titleText\",x=\"$x_axis_title\",y=\"$y_axis_title\") + theme(plot.title=element_text(size=$titleSize))"
else
# sets user specified height and width, dpi = 600
# get tmp png name
tmppng=$(mktemp -u --suffix .png)
Rio -d'\t' -ge "png(\"$tmppng\",units=\"in\",height=$5,width=$6,res=600); df\$$x <- factor(df\$$x, levels=unique(df\$$x));ggplot(df,aes(y=$y,x=$x)) + geom_bar(stat=\"identity\") + coord_flip() + labs(title=\"$titleText\",x=\"$x_axis_title\",y=\"$y_axis_title\") + theme(plot.title=element_text(size=$titleSize)); dev.off()" 1>/dev/null
cat $tmppng
fi
}
# quick cut
# example to print 8th and 9th fields of a TSV: cat foo.tsv | c 8,9
function c {
cut -f$1
}
# quick access to keepass2 via cli
# NB: assumes folder ~/sync - used by syncthing
# NB: you should change this to the path to your keepass2 db
function k {
kpcli --kdb ~/sync/Database.kdbx
}
# quick screen locking
function s {
# turn screen off
xset dpms force off
# lock screen
slock
}
# quick generic git add, commit (send to editor), and push
function p {
git add . && git add -u && git commit && git push origin master
}
# quick duckduckgo search on surfraw
# unless an elvis is specified
# NB: this function shadows surfraw's /usr/bin/sr wrapper, since zsh functions take precedence over commands
function sr {
elvi=$(surfraw -elvi | grep -v "GLOBAL ELVI"|grep -oE "^[^--]*")
if [[ -z $(echo "$elvi" | grep -E "\b$1\b") ]]; then
surfraw duckduckgo "$@"
else
surfraw "$@"
fi
}
# quick access to graphical web browser vimb
# will visit ^.*:// directly but all others to duckduckgo
function v {
if [[ -n $(echo "$@" | grep -E "^.*://" ) ]]; then
/usr/local/bin/vimb "$@"
else
surfraw -browser=/usr/local/bin/vimb duckduckgo "$@"
fi
}
# show permissions in octal format
function lsoctal {
stat -c '%A %a %n' "$1"
}
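# example use: lsoctal ~/.zshrc   # eg -> -rw-r--r-- 644 /home/user/.zshrc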
# given a docker image and container name,
# makes docker stop container -> rm container -> rm image -> build image ->\
# run container -> start container -> exec container
# for example: dockertrial "popanthro/2016-05-16" popanthro MYSQL_ROOT_PASSWORD=root /opt/popanthro:/var/www
function dockertrial {
image="$1"
container="$2"
# TODO: do not require $run_env and $volume options
run_env="$3"
volume="$4"
sudo docker stop ${container}; sudo docker rm ${container}; sudo docker rmi "${image}"
sudo docker build -t "${image}" .
# NB: --restart always is used so we can take advantage of services running on system restart
# NB: untested
sudo docker run --restart always --name ${container} -d -e ${run_env} -v "${volume}" "${image}"
sudo docker start ${container} ; sudo docker exec -ti ${container} bash
}
# for easy ssh with key with passphrase
# NB: adds slight overhead to each multiplexer window
# NB: you should change the key name to be relevant for you
eval $(keychain --quiet --eval battuta-id_rsa)
# list info about mysql tables and columns
# example: mysql_listcols myuser mydb |csvlook -t | less -S
# NB: expects user pass in ~/.my.cnf
# NB: table divider of a ton of hyphens is extremely sloppy
function mysql_listcols {
echo "show tables;" | mysql -N -u $1 $2 | parallel --gnu 'cols=$(echo "select * from information_schema.columns where table_name=\"{}\""|mysql -u '$1' '$2'); echo "$cols\n----------------------------------------------"'
}
# connect to VMs easily with functions!
# expects a server running sshd and Windows and Linux VirtualBox VMs
# what these do _not_ do:
# 1. handle x2goclient
# 2. handle Windows RDP client
# 3. automatically detect protocol and port of VM given VM name (can do this later with a table that could be in a shared git repo)
# NB: user 'virtual' is hard coded as host user belonging to group vboxusers and owner of '~/VirtualBox\ VMs'
### the functions...
###|-----------------+----------------------------------------------------+----------------------------|
###| function | purpose | example use |
###|-----------------+----------------------------------------------------+----------------------------|
###| vm-list | list all VMs | vm-list |
###| vm-listrunning | list only running VMs | vm-listrunning |
###| vm-start | start the named VM | vm-start w10-production01 |
###| vm-stop | stop the named VM | vm-stop w10-production01 |
###| vm-ssh | SSH to the named VM given username and port number | vm-ssh myUser 2203 |
###| vm-rdp | RDP to the named VM given username and port number | vm-rdp myUser 3389 |
###|-----------------+----------------------------------------------------+----------------------------|
function vm-list {
ssh virtual@EXAMPLE_HOST.org VBoxManage list vms
}
function vm-listrunning {
ssh virtual@EXAMPLE_HOST.org VBoxManage list runningvms
}
# example use: vm-start w10-production01
function vm-start {
ssh virtual@EXAMPLE_HOST.org VBoxManage startvm "$1" --type headless &
}
# example use: vm-stop w10-production01
function vm-stop {
ssh virtual@EXAMPLE_HOST.org VBoxManage controlvm "$1" acpipowerbutton
}
# example use: vm-ssh myUser 2203
function vm-ssh {
ssh -X -p "$2" "$1"@EXAMPLE_HOST.org
}
# example use: vm-rdp myUser 3389
# if no port is given assume 3389
# Note: you should fill in the password to your own Windows VM
function vm-rdp {
if [[ -n "$2" ]]; then
2=3389
fi
ssh -N -L "$2":localhost:"$2" virtual@EXAMPLE_HOST.org &
xfreerdp -u "$1" -p 'EXAMPLE_PASSWORD' -x l -z --plugin rdpsnd --data alsa -- localhost -v
}
# pretty print TSVs and preview them with the pager less
# NB: alias replaces csvlook altogether so I hope you did not want to look at actual CSVs
alias csvlook="csvlook -t | less -S"
# makes new leading field by concatenating arbitrary existing fields according to cut syntax
# NB: assumes TSV input and output, like "cut" orders fields numerically regardless of user input, eg "1-3,8" is same as "8,1-3"
# example use: cat foo.tsv | mkid_concat "1-3,8"
function mkid_concat {
in=$(cat)
concat_field=$(
echo "$in" |\
cut -f "$1" |\
sed 's:\t:_:g'
)
paste <(echo "$concat_field") <(echo "$in")
}
# NB: do not use!
# TODO replace shuf as number generator
# no clue if shuf is at all secure
# example use: diceware 9 /path/to/diceware.wordlist.asc
function diceware {
diceware=$(cat "$2" | nl | trim )
for i in $(seq 1 "$1")
do
grep -Ef <(shuf -i 1-7776 -n 1 | sed 's:^:^:g;s:$:\t:g' ) <(echo "$diceware") |\
c 3 |\
tr '\n' ' '
done | sed 's:\s$::g'
}