// plink_help.c (forked from chrchang/plink-ng)
#include "plink_common.h"
uint32_t edit1_match(uint32_t len1, char* s1, uint32_t len2, char* s2) {
// permit one difference of the following forms:
// - inserted/deleted character
// - replaced character
// - adjacent pair of swapped characters
uint32_t diff_found = 0;
uint32_t pos = 0;
if (len1 == len2) {
while (pos < len1) {
if (s1[pos] != s2[pos]) {
if (diff_found) {
if ((diff_found == 2) || (s1[pos] != s2[pos - 1]) || (s1[pos - 1] != s2[pos])) {
return 0;
}
}
diff_found++;
}
pos++;
}
} else if (len1 == len2 - 1) {
do {
if (s1[pos - diff_found] != s2[pos]) {
if (diff_found) {
return 0;
}
diff_found++;
}
pos++;
} while (pos < len2);
} else if (len1 == len2 + 1) {
do {
if (s1[pos] != s2[pos - diff_found]) {
if (diff_found) {
return 0;
}
diff_found++;
}
pos++;
} while (pos < len1);
} else {
return 0;
}
return 1;
}
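/*
 * Illustrative sketch: edit1_match() is the fuzzy matcher used below to
 * suggest flags for near-miss help queries.  Concrete cases, traced against
 * the logic above:
 *   edit1_match(4, "freq", 4, "freg") -> 1  (one substitution)
 *   edit1_match(4, "freq", 3, "frq")  -> 1  (one deletion)
 *   edit1_match(4, "freq", 4, "ferq") -> 1  (adjacent transposition)
 *   edit1_match(4, "freq", 4, "gerq") -> 0  (two differences)
 * The EDIT1_MATCH_DEMO guard and demo function below are hypothetical (not a
 * standard build flag); define the macro and call the function from a test
 * harness to check these cases.
 */
#ifdef EDIT1_MATCH_DEMO
#include <assert.h>
static void edit1_match_demo(void) {
  // Each call mirrors one of the cases listed in the comment above.
  assert(edit1_match(4, (char*)"freq", 4, (char*)"freg") == 1);
  assert(edit1_match(4, (char*)"freq", 3, (char*)"frq") == 1);
  assert(edit1_match(4, (char*)"freq", 4, (char*)"ferq") == 1);
  assert(edit1_match(4, (char*)"freq", 4, (char*)"gerq") == 0);
}
#endif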
#define MAX_EQUAL_HELP_PARAMS 22
typedef struct {
uint32_t iters_left;
uint32_t param_ct;
char** argv;
uintptr_t unmatched_ct;
uintptr_t* all_match_arr;
uintptr_t* prefix_match_arr;
uintptr_t* perfect_match_arr;
uint32_t* param_lens;
uint32_t preprint_newline;
} Help_ctrl;
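/*
 * Layout note (field roles inferred from the code below): all_match_arr is a
 * single allocation holding three consecutive bitfields of param_ct bits
 * each; disp_help() points prefix_match_arr and perfect_match_arr at offsets
 * param_ctl and 2 * param_ctl words within it.  A set bit in all_match_arr
 * means the corresponding help-search term has matched at least one flag
 * name, and unmatched_ct counts the terms still unmatched.
 */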
void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postprint_newline, const char* payload) {
// unmatched_ct fixed during call, *unmatched_ct_ptr may decrease
uint32_t unmatched_ct = help_ctrl_ptr->unmatched_ct;
uint32_t print_this = 0;
uint32_t cur_param_lens[MAX_EQUAL_HELP_PARAMS];
char* cur_param_start[MAX_EQUAL_HELP_PARAMS];
uint32_t arg_uidx;
uint32_t cur_param_ct;
uint32_t cur_param_idx;
uint32_t arg_idx;
uint32_t uii;
uint32_t payload_len;
char* payload_ptr;
char* line_end;
char* payload_end;
if (help_ctrl_ptr->param_ct) {
strcpy(tbuf, cur_params);
cur_param_ct = 1;
cur_param_start[0] = tbuf;
payload_ptr = strchr(tbuf, '\t');
while (payload_ptr) {
*payload_ptr++ = '\0';
cur_param_start[cur_param_ct++] = payload_ptr;
payload_ptr = strchr(payload_ptr, '\t');
}
if (help_ctrl_ptr->iters_left) {
if (help_ctrl_ptr->unmatched_ct) {
arg_uidx = 0;
if (help_ctrl_ptr->iters_left == 2) {
for (arg_idx = 0; arg_idx < unmatched_ct; arg_idx++) {
arg_uidx = next_unset_unsafe(help_ctrl_ptr->all_match_arr, arg_uidx);
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
if (!strcmp(cur_param_start[cur_param_idx], help_ctrl_ptr->argv[arg_uidx])) {
SET_BIT(help_ctrl_ptr->perfect_match_arr, arg_uidx);
SET_BIT(help_ctrl_ptr->prefix_match_arr, arg_uidx);
SET_BIT(help_ctrl_ptr->all_match_arr, arg_uidx);
help_ctrl_ptr->unmatched_ct -= 1;
break;
}
}
arg_uidx++;
}
} else {
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
cur_param_lens[cur_param_idx] = strlen(cur_param_start[cur_param_idx]);
}
for (arg_idx = 0; arg_idx < unmatched_ct; arg_idx++) {
arg_uidx = next_unset_unsafe(help_ctrl_ptr->all_match_arr, arg_uidx);
uii = help_ctrl_ptr->param_lens[arg_uidx];
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
if (cur_param_lens[cur_param_idx] > uii) {
if (!memcmp(help_ctrl_ptr->argv[arg_uidx], cur_param_start[cur_param_idx], uii)) {
SET_BIT(help_ctrl_ptr->prefix_match_arr, arg_uidx);
SET_BIT(help_ctrl_ptr->all_match_arr, arg_uidx);
help_ctrl_ptr->unmatched_ct -= 1;
break;
}
}
}
arg_uidx++;
}
}
}
} else {
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
cur_param_lens[cur_param_idx] = strlen(cur_param_start[cur_param_idx]);
}
for (arg_uidx = 0; arg_uidx < help_ctrl_ptr->param_ct; arg_uidx++) {
if (IS_SET(help_ctrl_ptr->prefix_match_arr, arg_uidx)) {
if (!print_this) {
if (IS_SET(help_ctrl_ptr->perfect_match_arr, arg_uidx)) {
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
if (!strcmp(cur_param_start[cur_param_idx], help_ctrl_ptr->argv[arg_uidx])) {
print_this = 1;
break;
}
}
} else {
uii = help_ctrl_ptr->param_lens[arg_uidx];
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
if (cur_param_lens[cur_param_idx] > uii) {
if (!memcmp(help_ctrl_ptr->argv[arg_uidx], cur_param_start[cur_param_idx], uii)) {
print_this = 1;
break;
}
}
}
}
}
} else {
for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
if (edit1_match(cur_param_lens[cur_param_idx], cur_param_start[cur_param_idx], help_ctrl_ptr->param_lens[arg_uidx], help_ctrl_ptr->argv[arg_uidx])) {
print_this = 1;
if (!IS_SET(help_ctrl_ptr->all_match_arr, arg_uidx)) {
SET_BIT(help_ctrl_ptr->all_match_arr, arg_uidx);
help_ctrl_ptr->unmatched_ct -= 1;
}
break;
}
}
}
}
if (print_this) {
payload_len = strlen(payload);
if (payload[payload_len - 2] == '\n') {
payload_end = (char*)(&(payload[payload_len - 1]));
} else {
payload_end = (char*)(&(payload[payload_len]));
}
if (help_ctrl_ptr->preprint_newline) {
putchar('\n');
}
help_ctrl_ptr->preprint_newline = postprint_newline;
payload_ptr = (char*)payload;
do {
line_end = strchr(payload_ptr, '\n') + 1;
uii = (uint32_t)(line_end - payload_ptr);
if (uii > 2) {
payload_ptr = &(payload_ptr[2]);
uii -= 2;
}
memcpyx(tbuf, payload_ptr, uii, 0);
fputs(tbuf, stdout);
payload_ptr = line_end;
} while (payload_ptr < payload_end);
}
}
} else {
fputs(payload, stdout);
}
}
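/*
 * Control-flow sketch, inferred from the branches above: when help is
 * requested with search terms, disp_help() drives help_print() over the full
 * flag list more than once.  With iters_left == 2, only exact keyword matches
 * are recorded (perfect_match_arr); with iters_left == 1, prefix matches are
 * recorded as well; once iters_left reaches 0, an entry is printed if one of
 * its keywords matches a search term exactly, by prefix, or (for terms that
 * matched nothing in the earlier passes) within an edit distance of one via
 * edit1_match().  With no search terms, payloads are printed unconditionally.
 */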
int32_t disp_help(uint32_t param_ct, char** argv) {
// yes, this is overkill. But it should be a good template for other
// command-line programs to use.
uint32_t param_ctl = (param_ct + (BITCT - 1)) / BITCT;
int32_t retval = 0;
Help_ctrl help_ctrl;
uint32_t arg_uidx;
uint32_t arg_idx;
uint32_t net_unmatched_ct;
int32_t col_num;
int32_t leading_dashes;
help_ctrl.iters_left = param_ct? 2 : 0;
help_ctrl.param_ct = param_ct;
help_ctrl.argv = argv;
help_ctrl.unmatched_ct = param_ct;
help_ctrl.param_lens = NULL;
help_ctrl.all_match_arr = NULL;
help_ctrl.argv = NULL;
if (param_ct) {
help_ctrl.param_lens = (uint32_t*)malloc(param_ct * sizeof(int32_t));
if (!help_ctrl.param_lens) {
goto disp_help_ret_NOMEM;
}
help_ctrl.all_match_arr = (uintptr_t*)malloc(param_ctl * 3 * sizeof(intptr_t));
if (!help_ctrl.all_match_arr) {
goto disp_help_ret_NOMEM;
}
leading_dashes = 0;
for (arg_uidx = 0; arg_uidx < param_ct; arg_uidx++) {
if (argv[arg_uidx][0] == '-') {
leading_dashes = 1;
break;
}
}
if (leading_dashes) {
help_ctrl.argv = (char**)malloc(param_ct * sizeof(char*));
if (!help_ctrl.argv) {
goto disp_help_ret_NOMEM;
}
for (arg_uidx = 0; arg_uidx < param_ct; arg_uidx++) {
if (argv[arg_uidx][0] == '-') {
if (argv[arg_uidx][1] == '-') {
help_ctrl.argv[arg_uidx] = &(argv[arg_uidx][2]);
} else {
help_ctrl.argv[arg_uidx] = &(argv[arg_uidx][1]);
}
} else {
help_ctrl.argv[arg_uidx] = argv[arg_uidx];
}
}
} else {
help_ctrl.argv = argv;
}
for (arg_idx = 0; arg_idx < param_ct; arg_idx++) {
help_ctrl.param_lens[arg_idx] = strlen(help_ctrl.argv[arg_idx]);
}
fill_ulong_zero(help_ctrl.all_match_arr, param_ctl * 3);
help_ctrl.prefix_match_arr = &(help_ctrl.all_match_arr[param_ctl]);
help_ctrl.perfect_match_arr = &(help_ctrl.all_match_arr[param_ctl * 2]);
help_ctrl.preprint_newline = 1;
} else {
help_ctrl.argv = NULL;
fputs(
"\nIn the command line flag definitions that follow,\n"
" * [square brackets] denote a required parameter, where the text between the\n"
" brackets describes its nature.\n"
" * <angle brackets> denote an optional modifier (or if '|' is present, a set\n"
" of mutually exclusive optional modifiers). Use the EXACT text in the\n"
" definition, e.g. '--dummy acgt'.\n"
" * There's one exception to the angle brackets/exact text rule: when an angle\n"
" bracket term ends with '=[value]', '[value]' designates a variable\n"
" parameter.\n"
" * {curly braces} denote an optional parameter, where the text between the\n"
" braces describes its nature.\n"
" * An ellipsis (...) indicates that you may enter multiple parameters of the\n"
" specified type.\n"
, stdout);
fputs(cmdline_format_str, stdout);
fputs(
"Most " PROG_NAME_CAPS " runs require exactly one main input fileset. The following flags\n"
"are available for defining its form and location:\n\n"
, stdout);
}
do {
help_print("bfile\tbed\tbim\tfam", &help_ctrl, 1,
" --bfile {prefix} : Specify .bed + .bim + .fam prefix (default '" PROG_NAME_STR "').\n"
" --bed [filename] : Specify full name of .bed file.\n"
" --bim [filename] : Specify full name of .bim file.\n"
" --fam [filename] : Specify full name of .fam file.\n\n"
);
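/*
 * For example (hypothetical filenames), "plink --bfile mydata" reads
 * mydata.bed, mydata.bim, and mydata.fam, while "plink --bed a.bed --bim
 * b.bim --fam c.fam" names the three files individually.
 */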
help_print("file\ttfile\tlfile\tvcf\tbcf\tdata\t23file\tkeep-autoconv", &help_ctrl, 1,
" --keep-autoconv : With --file/--tfile/--lfile/--vcf/--bcf/--data/--23file,\n"
" don't delete autogenerated binary fileset at end of run.\n\n"
);
help_print("file\tped\tmap", &help_ctrl, 1,
" --file {prefix} : Specify .ped + .map filename prefix (default '" PROG_NAME_STR "').\n"
" --ped [filename] : Specify full name of .ped file.\n"
" --map [filename] : Specify full name of .map file.\n\n"
);
help_print("bfile\tfam\tfile\tped\tno-fid\tno-parents\tno-sex\tno-pheno", &help_ctrl, 1,
" --no-fid : .fam/.ped file does not contain column 1 (family ID).\n"
" --no-parents : .fam/.ped file does not contain columns 3-4 (parents).\n"
" --no-sex : .fam/.ped file does not contain column 5 (sex).\n"
" --no-pheno : .fam/.ped file does not contain column 6 (phenotype).\n\n"
);
help_print("tfile\ttped\ttfam", &help_ctrl, 1,
" --tfile {prefix} : Specify .tped + .tfam filename prefix (default '" PROG_NAME_STR "').\n"
" --tped [fname] : Specify full name of .tped file.\n"
" --tfam [fname] : Specify full name of .tfam file.\n\n"
);
help_print("lfile\treference\tallele-count", &help_ctrl, 1,
" --lfile {prefix} : Specify .lgen + .map + .fam (long-format fileset) prefix.\n"
" --reference [fn] : Specify default allele file accompanying --lfile input.\n"
" --allele-count : When used with --lfile + --reference, specifies that the\n"
" .lgen file contains reference allele counts.\n\n"
);
help_print("vcf\tbcf", &help_ctrl, 1,
" --vcf [filename] : Specify full name of .vcf or .vcf.gz file.\n"
" --bcf [filename] : Specify full name of BCF2 file.\n\n"
);
help_print("data\tgen\tbgen\tsample", &help_ctrl, 1,
" --data {prefix} : Specify Oxford .gen + .sample prefix (default '" PROG_NAME_STR "').\n"
" --gen [filename] : Specify full name of .gen or .gen.gz file.\n"
" --bgen [f] <snpid-chr> : Specify full name of .bgen file.\n"
" --sample [fname] : Specify full name of .sample file.\n\n"
);
help_print("23file", &help_ctrl, 1,
" --23file [fname] {FID} {IID} {sex} {pheno} {pat. ID} {mat. ID} :\n"
" Specify 23andMe input file.\n\n"
);
#ifndef STABLE_BUILD
help_print("cfile\tcnv-list\tgfile", &help_ctrl, 1,
" --cfile [prefix] : Specify .cnv + .fam + .cnv.map (segmental CNV) prefix.\n"
" --cnv-list [fn] : Specify full name of .cnv file.\n"
" --gfile [prefix] : Specify .gvar + .fam + .map (genetic variant) prefix.\n\n"
);
#endif
help_print("grm\tgrm-gz\tgrm-bin\trel-cutoff\tgrm-cutoff", &help_ctrl, 1,
" --grm-gz {prfx} : Specify .grm.gz + .grm.id (GCTA rel. matrix) prefix.\n"
" --grm-bin {prfx} : Specify .grm.bin + .grm.N.bin + .grm.id (GCTA triangular\n"
" binary relationship matrix) filename prefix.\n\n"
);
help_print("dummy", &help_ctrl, 1,
" --dummy [sample ct] [SNP ct] {missing geno freq} {missing pheno freq}\n"
" <acgt | 1234 | 12> <scalar-pheno>\n"
" This generates a fake input dataset with the specified number of samples\n"
" and SNPs. By default, the missing genotype and phenotype frequencies are\n"
" zero, and genotypes are As and Bs (change the latter with\n"
" 'acgt'/'1234'/'12'). The 'scalar-pheno' modifier causes a normally\n"
" distributed scalar phenotype to be generated instead of a binary one.\n\n"
);
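/*
 * A possible invocation matching the syntax above: "plink --dummy 100 1000
 * 0.02 0.05 acgt --out toy" would generate 100 samples x 1000 SNPs with 2%
 * missing genotypes, 5% missing phenotypes, and ACGT allele codes.
 */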
help_print("simulate\tsimulate-qt", &help_ctrl, 1,
" --simulate [simulation parameter file] <tags | haps> <acgt | 1234 | 12>\n"
" --simulate-qt [simulation parameter file] <tags | haps> <acgt | 1234 | 12>\n"
" --simulate generates a fake input dataset with disease-associated SNPs,\n"
" while --simulate-qt generates a dataset with quantitative trait loci.\n\n"
);
if (!param_ct) {
fputs(
"Output files have names of the form '" PROG_NAME_STR ".{extension}' by default. You can\n"
"change the '" PROG_NAME_STR "' prefix with\n\n"
, stdout);
}
help_print("out", &help_ctrl, 1,
" --out [prefix] : Specify prefix for output files.\n\n"
);
if (!param_ct) {
fputs(
"Most runs also require at least one of the following commands:\n\n"
, stdout);
}
help_print("make-bed", &help_ctrl, 1,
" --make-bed\n"
" Create a new binary fileset. Unlike the automatic text-to-binary\n"
" converters (which only heed chromosome filters), this supports all of\n"
" " PROG_NAME_CAPS "'s filtering flags.\n"
);
help_print("make-just-bim\tmake-just-fam", &help_ctrl, 1,
" --make-just-bim\n"
" --make-just-fam\n"
" Variants of --make-bed which only write a new .bim or .fam file. Can be\n"
" used with only .bim/.fam input.\n"
" USE THESE CAUTIOUSLY. It is very easy to desynchronize your binary\n"
" genotype data and your .bim/.fam indexes if you use these commands\n"
" improperly. If you have any doubt, stick with --make-bed.\n"
);
help_print("recode\trecode12\ttab\ttranspose\trecode-lgen\trecodeAD\trecodead\trecodeA\trecodea\trecode-rlist\trecode-allele\tlist\twith-reference\trecode-vcf\tfid\tiid\trecode-beagle\trecode-bimbam\trecode-fastphase\trecodeHV\trecodehv\trecode-structure", &help_ctrl, 1,
" --recode <01 | 12> <23 | A{-transpose} | AD | beagle{-nomap} | bimbam{-1chr}\n"
" | compound-genotypes | fastphase{-1chr} | HV{-1chr} | lgen{-ref} |\n"
" list | oxford | rlist | structure | transpose | vcf | vcf-fid |\n"
" vcf-iid> <tab | tabx | spacex | bgz> <include-alt>\n"
" Create a new text fileset with all filters applied. By default, the\n"
" fileset consists of a .ped and a .map file, readable with --file.\n"
" * The '12' modifier causes A1 (usually minor) alleles to be coded as '1'\n"
" and A2 alleles to be coded as '2', while '01' maps A1 -> 0 and A2 -> 1.\n"
" * The '23' modifier causes a 23andMe-formatted file to be generated. This\n"
" can only be used on a single sample's data (--keep may be handy).\n"
" * The 'AD' modifier causes an sample-major additive (0/1/2) + dominant\n"
" (het = 1, otherwise 0) component file, suitable for loading from R, to be\n"
" generated. If you don't want the dominant component, use 'A' instead.\n"
" If you need uncounted alleles to be named in the header line, add the\n"
" 'include-alt' modifier.\n"
" * The 'A-transpose' modifier causes a variant-major additive component file\n"
" to be generated.\n"
" * The 'beagle' modifier causes unphased per-autosome .dat and .map files,\n"
" readable by early BEAGLE versions, to be generated, while 'beagle-nomap'\n"
" generates a single .beagle.dat file.\n"
" * The 'bimbam' modifier causes a BIMBAM-formatted fileset to be generated.\n"
" If your input data only contains one chromosome, you can use\n"
" 'bimbam-1chr' instead to write a two-column .pos.txt file.\n"
" * The 'compound-genotypes' modifier removes the space between pairs of\n"
" allele codes for the same variant when generating a .ped + .map fileset.\n"
" * The 'fastphase' modifier causes per-chromosome fastPHASE files to be\n"
" generated. If your input data only contains one chromosome, you can use\n"
" 'fastphase-1chr' instead to exclude the chromosome number from the file\n"
" extension.\n"
" * The 'HV' modifier causes a Haploview-format .ped + .info fileset to be\n"
" generated per chromosome. 'HV-1chr' is analogous to 'fastphase-1chr'.\n"
" * The 'lgen' modifier causes a long-format fileset (loadable with --lfile)\n"
" to be generated, while 'lgen-ref' generates a (usually) smaller\n"
" long-format fileset loadable with --lfile + --reference.\n"
" * The 'list' modifier creates a genotype-based list, while 'rlist' creates\n"
" a rare-genotype fileset.\n"
" * 'oxford' causes an Oxford-format .gen + .sample fileset to be generated.\n"
" * The 'structure' modifier causes a Structure-format file to be generated.\n"
" * 'transpose' creates a transposed text fileset (loadable with --tfile).\n"
" * 'vcf', 'vcf-fid', and 'vcf-iid' result in production of a VCFv4.2 file.\n"
" 'vcf-fid' and 'vcf-iid' cause family IDs or within-family IDs\n"
" respectively to be used for the sample IDs in the last header row, while\n"
" 'vcf' merges both IDs and puts an underscore between them.\n"
" If the 'bgz' modifier is added, the VCF file is block-gzipped.\n"
" The A2 allele is saved as the reference and normally flagged as not based\n"
" on a real reference genome ('PR' INFO field value). When it is important\n"
" for reference alleles to be correct, you'll also want to include\n"
" --a2-allele and --real-ref-alleles in your command.\n"
" * The 'tab' modifier makes the output mostly tab-delimited instead of\n"
" mostly space-delimited. 'tabx' and 'spacex' force all tabs and all\n"
" spaces, respectively.\n\n"
);
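/*
 * For instance, per the modifiers listed above, "plink --bfile mydata
 * --recode vcf-iid bgz --out mydata" would export a block-gzipped VCF using
 * within-family IDs as sample IDs (hypothetical filenames).
 */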
help_print("flip-scan\tflip-scan-verbose\tflipscan", &help_ctrl, 1,
" --flip-scan <verbose>\n"
" (alias: --flipscan)\n"
" LD-based scan for case/control strand inconsistency.\n\n"
);
help_print("write-covar", &help_ctrl, 1,
" --write-covar\n"
" If a --covar file is loaded, --make-bed/--make-just-fam and --recode\n"
" automatically generate an updated version (with all filters applied).\n"
" However, if you do not wish to simultaneously generate a new genotype file,\n"
" you can use --write-covar to just produce a pruned covariate file.\n\n"
);
help_print("write-cluster", &help_ctrl, 1,
" --write-cluster <omit-unassigned>\n"
" If clusters are specified with --within/--family, this generates a new\n"
" cluster file (with all filters applied). The 'omit-unassigned' modifier\n"
" causes unclustered samples to be omitted from the file; otherwise their\n"
" cluster is 'NA'.\n\n"
);
help_print("write-set\tset-table", &help_ctrl, 1,
" --write-set\n"
" --set-table\n"
" If sets have been defined, --write-set dumps 'END'-terminated set\n"
" membership lists to {output prefix}.set, while --set-table writes a\n"
" variant-by-set membership table to {output prefix}.set.table.\n\n"
);
help_print("merge\tbmerge\tmerge-list\tmerge-mode", &help_ctrl, 1,
" --merge [.ped filename] [.map filename]\n"
" --merge [text fileset prefix]\n"
" --bmerge [.bed filename] [.bim filename] [.fam filename]\n"
" --bmerge [binary fileset prefix]\n"
" Merge the given fileset with the initially loaded fileset, writing the\n"
" result to {output prefix}.bed + .bim + .fam. (It is no longer necessary to\n"
" simultaneously specify --make-bed.)\n"
" --merge-list [filename]\n"
" Merge all filesets named in the text file with the reference fileset, if\n"
" one was specified. (However, this can also be used *without* a reference;\n"
" in that case, the newly created fileset is then treated as the reference by\n"
" most other PLINK operations.) The text file is interpreted as follows:\n"
" * If a line contains only one name, it is assumed to be the prefix for a\n"
" binary fileset.\n"
" * If a line contains exactly two names, they are assumed to be the full\n"
" filenames for a text fileset (.ped first, then .map).\n"
" * If a line contains exactly three names, they are assumed to be the full\n"
" filenames for a binary fileset (.bed, then .bim, then .fam).\n\n"
);
help_print("write-snplist\tlist-23-indels", &help_ctrl, 1,
" --write-snplist\n"
" --list-23-indels\n"
" --write-snplist writes a .snplist file listing the names of all variants\n"
" which pass the filters and inclusion thresholds you've specified, while\n"
" --list-23-indels writes the subset with 23andMe-style indel calls (D/I\n"
" allele codes).\n\n"
);
help_print("list-duplicate-vars", &help_ctrl, 1,
" --list-duplicate-vars <require-same-ref> <ids-only> <suppress-first>\n"
" --list-duplicate-vars writes a .dupvar file describing all groups of\n"
" variants with matching positions and allele codes.\n"
" * By default, A1/A2 allele assignments are ignored; use 'require-same-ref'\n"
" to override this.\n"
" * Normally, the report contains position and allele codes. To remove them\n"
" (and produce a file directly usable with e.g. --extract/--exclude), use\n"
" 'ids-only'. Note that this command will fail in 'ids-only' mode if any\n"
" of the reported IDs are not unique.\n"
" * 'suppress-first' causes the first variant ID in each group to be omitted\n"
" from the report.\n\n"
);
help_print("freq\tfreqx\tfrqx\tcounts", &help_ctrl, 1,
" --freq <counts> <gz>\n"
" --freqx <gz>\n"
" --freq generates a basic allele frequency (or count, if the 'counts'\n"
" modifier is present) report. This can be combined with --within/--family\n"
" to produce a cluster-stratified allele frequency/count report instead.\n"
" --freqx generates a more detailed genotype count report, designed for use\n"
" with --read-freq.\n\n"
);
help_print("missing", &help_ctrl, 1,
" --missing <gz>\n"
" Generate sample- and variant-based missing data reports. If clusters are\n"
" defined, the variant-based report is cluster-stratified. 'gz' causes the\n"
" output files to be gzipped.\n\n"
);
help_print("test-mishap", &help_ctrl, 1,
" --test-mishap\n"
" Check for association between missing calls and flanking haplotypes.\n\n"
);
help_print("hardy\thardy2", &help_ctrl, 1,
" --hardy <midp> <gz>\n"
" Generate a Hardy-Weinberg exact test p-value report. (This does NOT\n"
" simultaneously filter on the p-value any more; use --hwe for that.) With\n"
" the 'midp' modifier, the test applies the mid-p adjustment described in\n"
" Graffelman J, Moreno V (2013) The mid p-value in exact tests for\n"
" Hardy-Weinberg Equilibrium.\n\n"
);
help_print("mendel", &help_ctrl, 1,
" --mendel\n"
" Generate a Mendel error report.\n\n"
);
help_print("het\tibc", &help_ctrl, 1,
" --het <small-sample> <gz>\n"
" --ibc\n"
" Estimate inbreeding coefficients. --het reports method-of-moments\n"
" estimates, while --ibc calculates all three values described in Yang J, Lee\n"
" SH, Goddard ME and Visscher PM (2011) GCTA: A Tool for Genome-wide Complex\n"
" Trait Analysis. (That paper also describes the relationship matrix\n"
" computation we reimplement.)\n"
" * These functions require decent MAF estimates. If there are very few\n"
" samples in your immediate fileset, --read-freq is practically mandatory\n"
" since imputed MAFs are wildly inaccurate in that case.\n"
" * They also assume the marker set is in approximate linkage equilibrium.\n"
" * By default, --het omits the n/(n-1) multiplier in Nei's expected\n"
" homozygosity formula. The 'small-sample' modifier causes it to be\n"
" included, while forcing --het to use MAFs imputed from founders in the\n"
" immediate dataset.\n\n"
);
help_print("check-sex\timpute-sex\tupdate-sex\tsex-check", &help_ctrl, 1,
" --check-sex {female max F} {male min F}\n"
" --check-sex ycount {female max F} {male min F} {female max Y obs}\n"
" {male min Y obs}\n"
" --check-sex y-only {female max Y obs} {male min Y obs}\n"
" --impute-sex {female max F} {male min F}\n"
" --impute-sex ycount {female max F} {male min F} {female max Y obs}\n"
" {male min Y obs}\n"
" --impute-sex y-only {female max Y obs} {male min Y obs}\n"
" --check-sex normally compares sex assignments in the input dataset with\n"
" those imputed from X chromosome inbreeding coefficients.\n"
" * Make sure that the X chromosome pseudo-autosomal region has been split\n"
" off (with e.g. --split-x) before using this.\n"
" * You also need decent MAF estimates (so, with very few samples in your\n"
" immediate fileset, use --read-freq), and your marker set should be in\n"
" approximate linkage equilibrium.\n"
" * By default, F estimates smaller than 0.2 yield female calls, and values\n"
" larger than 0.8 yield male calls. If you pass numeric parameter(s) to\n"
" --check-sex, the first two control these thresholds.\n"
" There are now two modes which consider Y chromosome data.\n"
" * In 'ycount' mode, gender is still imputed from the X chromosome, but\n"
" female calls are downgraded to ambiguous whenever more than 0 nonmissing\n"
" Y genotypes are present, and male calls are downgraded when fewer than 0\n"
" are present. (Note that these are counts, not rates.) These thresholds\n"
" are controllable with --check-sex ycount's optional 3rd and 4th numeric\n"
" parameters.\n"
" * In 'y-only' mode, gender is imputed from nonmissing Y genotype counts.\n"
" The male minimum threshold defaults to 1 instead of zero in this case.\n"
" --impute-sex changes sex assignments to the imputed values, and is\n"
" otherwise identical to --check-sex. It must be used with\n"
" --make-bed/--recode/--write-covar.\n\n"
);
help_print("fst\tFst", &help_ctrl, 1,
" --fst <case-control>\n"
" (alias: --Fst)\n"
" Estimate Wright's Fst for each autosomal diploid variant using the method\n"
" introduced in Weir BS, Cockerham CC (1984) Estimating F-statistics for the\n"
" analysis of population structure, given a set of subpopulations defined via\n"
" --within. Raw and weighted global means are also reported.\n"
" * If you're interested in the global means, it is usually best to perform\n"
" this calculation on a marker set in approximate linkage equilibrium.\n"
" * If you have only two subpopulations, you can represent them with\n"
" case/control status and use the 'case-control' modifier.\n\n"
);
help_print("indep\tindep-pairwise\tindep-pairphase", &help_ctrl, 1,
" --indep [window size]<kb> [step size (variant ct)] [VIF threshold]\n"
" --indep-pairwise [window size]<kb> [step size (variant ct)] [r^2 threshold]\n"
" --indep-pairphase [window size]<kb> [step size (variant ct)] [r^2 threshold]\n"
" Generate a list of markers in approximate linkage equilibrium. With the\n"
" 'kb' modifier, the window size is in kilobase instead of variant count\n"
" units. (Pre-'kb' space is optional, i.e. '--indep-pairwise 500 kb 5 0.5'\n"
" and '--indep-pairwise 500kb 5 0.5' have the same effect.)\n"
" Note that you need to rerun " PROG_NAME_CAPS " using --extract or --exclude on the\n"
" .prune.in/.prune.out file to apply the list to another computation.\n\n"
);
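/*
 * The two-step pattern described above might look like (hypothetical
 * filenames):
 *   plink --bfile data --indep-pairwise 50 5 0.5 --out pruned
 *   plink --bfile data --extract pruned.prune.in --make-bed --out data_ld
 * i.e. first generate pruned.prune.in/.prune.out, then rerun with --extract
 * to apply the pruned variant set.
 */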
help_print("r\tr2\tmatrix\tinter-chr\tD\tdprime\twith-freqs\tld", &help_ctrl, 1,
" --r <square | square0 | triangle | inter-chr> <gz | bin | bin4> <spaces>\n"
" <in-phase> <dprime> <with-freqs> <yes-really>\n"
" --r2 <square | square0 | triangle | inter-chr> <gz | bin | bin4> <spaces>\n"
" <in-phase> <dprime> <with-freqs> <yes-really>\n"
" LD statistic reports. --r yields raw inter-variant correlations, while\n"
" --r2 reports their squares. You can request results for all pairs in\n"
" matrix format (if you specify 'bin' or one of the shape modifiers), all\n"
" pairs in table format ('inter-chr'), or a limited window in table format\n"
" (default).\n"
" * The 'gz' modifier causes the output text file to be gzipped.\n"
" * 'bin' causes the output matrix to be written in double-precision binary\n"
" format, while 'bin4' specifics single-precision binary. The matrix is\n"
" square if no shape is explicitly specified.\n"
" * By default, text matrices are tab-delimited; 'spaces' switches this.\n"
" * 'in-phase' adds a column with in-phase allele pairs to table-formatted\n"
" reports. (This cannot be used with very long allele codes.)\n"
" * 'dprime' adds Lewontin's D-prime statistic to table-formatted reports,\n"
" and forces both r/r^2 and D-prime to be based on the maximum likelihood\n"
" solution to the cubic equation discussed in Gaunt T, Rodriguez S, Day I\n"
" (2007) Cubic exact solutions for the estimation of pairwise haplotype\n"
" frequencies.\n"
" * 'with-freqs' adds MAF columns to table-formatted reports.\n"
" * Since the resulting file can easily be huge, you're required to add the\n"
" 'yes-really' modifier when requesting an unfiltered, non-distributed all\n"
" pairs computation on more than 400k variants.\n"
" * These computations can be subdivided with --parallel (even when the\n"
" 'square' modifier is active).\n"
" --ld [variant ID] [variant ID] <hwe-midp>\n"
" This displays haplotype frequencies, r^2, and D' for a single pair of\n"
" variants. When there are multiple biologically possible solutions to the\n"
" haplotype frequency cubic equation, all are displayed (instead of just the\n"
" maximum likelihood solution identified by --r/--r2), along with HWE exact\n"
" test statistics.\n\n"
);
help_print("show-tags", &help_ctrl, 1,
" --show-tags [filename]\n"
" --show-tags all\n"
" * If a file is specified, list all variants which tag at least one variant\n"
" named in the file. (This will normally be a superset of the original\n"
" list, since a variant is considered to tag itself here.)\n"
" * If 'all' mode is specified, for each variant, each *other* variant which\n"
" tags it is reported.\n\n"
);
help_print("blocks\thap\thap-all\thap-assoc\thap-freq\thap-impute\thap-impute-verbose\thap-linear\thap-logistic\thap-max-phase\thap-min-phase-prob\thap-miss\thap-omnibus\thap-only\thap-phase\thap-phase-wide\thap-pp\thap-snps\thap-tdt\thap-window\tchap\twhap", &help_ctrl, 1,
" --blocks <no-pheno-req> <no-small-max-span>\n"
" Estimate haplotype blocks, via Haploview's interpretation of the block\n"
" definition suggested by Gabriel S et al. (2002) The Structure of Haplotype\n"
" Blocks in the Human Genome.\n"
" * Normally, samples with missing phenotypes are not considered by this\n"
" computation; the 'no-pheno-req' modifier lifts this restriction.\n"
" * Normally, size-2 blocks may not span more than 20kb, and size-3 blocks\n"
" are limited to 30kb. The 'no-small-max-span' modifier removes these\n"
" limits.\n"
" The .blocks file is valid input for PLINK 1.07's --hap command. However,\n"
" the --hap... family of flags has not been reimplemented in PLINK 1.9 due to\n"
" poor phasing accuracy relative to other software; for now, we recommend\n"
" using BEAGLE instead of PLINK for case/control haplotype association\n"
" analysis. (You can use '--recode beagle' to export data to BEAGLE 3.3.)\n"
" We apologize for the inconvenience, and plan to develop variants of the\n"
" --hap... flags which handle pre-phased data effectively.\n\n"
);
help_print("distance", &help_ctrl, 1,
" --distance <square | square0 | triangle> <gz | bin | bin4> <ibs> <1-ibs>\n"
" <allele-ct> <flat-missing>\n"
" Write a lower-triangular tab-delimited table of (weighted) genomic\n"
" distances in allele count units to {output prefix}.dist, and a list of the\n"
" corresponding sample IDs to {output prefix}.dist.id. The first row of the\n"
" .dist file contains a single {genome 1-genome 2} distance, the second row\n"
" has the {genome 1-genome 3} and {genome 2-genome 3} distances in that\n"
" order, etc.\n"
" * It is usually best to perform this calculation on a marker set in\n"
" approximate linkage equilibrium.\n"
" * If the 'square' or 'square0' modifier is present, a square matrix is\n"
" written instead; 'square0' fills the upper right triangle with zeroes.\n"
" * If the 'gz' modifier is present, a compressed .dist.gz file is written\n"
" instead of a plain text file.\n"
" * If the 'bin' modifier is present, a binary (square) matrix of\n"
" double-precision floating point values, suitable for loading from R, is\n"
" instead written to {output prefix}.dist.bin. ('bin4' specifies\n"
" single-precision numbers instead.) This can be combined with 'square0'\n"
" if you still want the upper right zeroed out, or 'triangle' if you don't\n"
" want to pad the upper right at all.\n"
" * If the 'ibs' modifier is present, an identity-by-state matrix is written\n"
" to {output prefix}.mibs. '1-ibs' causes distances expressed as genomic\n"
" proportions (i.e. 1 - IBS) to be written to {output prefix}.mdist.\n"
" Combine with 'allele-ct' if you want to generate the usual .dist file as\n"
" well.\n"
" * By default, distance rescaling in the presence of missing genotype calls\n"
" is sensitive to allele count distributions: if variant A contributes, on\n"
" average, twice as much to other pairwise distances as variant B, a\n"
" missing call at variant A will result in twice as large of a missingness\n"
" correction. To turn this off (because e.g. your missing calls are highly\n"
" nonrandom), use the 'flat-missing' modifier.\n"
" * The computation can be subdivided with --parallel.\n"
);
help_print("distance-matrix\tibs-matrix\tmatrix", &help_ctrl, 1,
" --distance-matrix\n"
" --ibs-matrix\n"
" These deprecated commands are equivalent to '--distance 1-ibs flat-missing\n"
" square' and '--distance ibs flat-missing square', respectively, except that\n"
" they generate space- instead of tab-delimited text matrices.\n\n"
);
help_print("make-rel", &help_ctrl, 1,
" --make-rel <square | square0 | triangle> <gz | bin | bin4>\n"
" <cov | ibc2 | ibc3>\n"
" Write a lower-triangular variance-standardized realized relationship matrix\n"
" to {output prefix}.rel, and corresponding IDs to {output prefix}.rel.id.\n"
" * It is usually best to perform this calculation on a marker set in\n"
" approximate linkage equilibrium.\n"
" * 'square', 'square0', 'triangle', 'gz', 'bin', and 'bin4' act as they do\n"
" on --distance.\n"
" * The 'cov' modifier removes the variance standardization step, causing a\n"
" covariance matrix to be calculated instead.\n"
" * By default, the diagonal elements in the relationship matrix are based on\n"
" --ibc's Fhat1; use the 'ibc2' or 'ibc3' modifiers to base them on Fhat2\n"
" or Fhat3 instead.\n"
" * The computation can be subdivided with --parallel.\n"
);
help_print("make-grm\tmake-grm-bin\tgrm\tgrm-bin\tmake-grm-gz", &help_ctrl, 1,
" --make-grm-gz <no-gz> <cov | ibc2 | ibc3>\n"
" --make-grm-bin <cov | ibc2 | ibc3>\n"
" --make-grm-gz writes the relationships in GCTA's original gzipped list\n"
" format, which describes one pair per line, while --make-grm-bin writes them\n"
" in GCTA 1.1+'s single-precision triangular binary format. Note that these\n"
" formats explicitly report the number of valid observations (where neither\n"
" sample has a missing call) for each pair, which is useful input for some\n"
" scripts.\n"
" These computations can be subdivided with --parallel.\n\n"
);
help_print("rel-cutoff\tgrm-cutoff", &help_ctrl, 1,
" --rel-cutoff {val}\n"
" (alias: --grm-cutoff)\n"
" Exclude one member of each pair of samples with relatedness greater than\n"
" the given cutoff value (default 0.025). If no later operation will cause\n"
" the list of remaining samples to be written to disk, this will save it to\n"
" {output prefix}.rel.id.\n"
" Note that maximizing the remaining sample size is equivalent to the NP-hard\n"
" maximum independent set problem, so we use a greedy algorithm instead of\n"
" guaranteeing optimality. (Use the --make-rel and --keep/--remove flags if\n"
" you want to try to do better.)\n\n"
);
help_print("ibs-test\tgroupdist", &help_ctrl, 1,
" --ibs-test {permutation count}\n"
" --groupdist {iters} {d}\n"
" Given case/control phenotype data, these commands consider three subsets of\n"
" the distance matrix: pairs of affected samples, affected-unaffected pairs,\n"
" and pairs of unaffected samples. Each of these subsets has a distribution\n"
" of pairwise genomic distances; --ibs-test uses permutation to estimate\n"
" p-values re: which types of pairs are most similar, while --groupdist\n"
" focuses on the differences between the centers of these distributions and\n"
" estimates standard errors via delete-d jackknife.\n\n"
);
help_print("regress-distance\tregress-rel", &help_ctrl, 1,
" --regress-distance {iters} {d}\n"
" Linear regression of pairwise genomic distances on pairwise average\n"
" phenotypes and vice versa, using delete-d jackknife for standard errors. A\n"
" scalar phenotype is required.\n"
" * With less than two parameters, d is set to {number of people}^0.6 rounded\n"
" down. With no parameters, 100k iterations are run.\n"
" --regress-rel {iters} {d}\n"
" Linear regression of pairwise genomic relationships on pairwise average\n"
" phenotypes, and vice versa. Defaults for iters and d are the same as for\n"
" --regress-distance.\n\n"
);
help_print("genome\tZ-genome\trel-check\timpossible\tnudge\tgenome-full\tunbounded", &help_ctrl, 1,
" --genome <gz> <rel-check> <full> <unbounded> <nudge>\n"
" Generate an identity-by-descent report.\n"
" * It is usually best to perform this calculation on a marker set in\n"
" approximate linkage equilibrium.\n"
" * The 'rel-check' modifier excludes pairs of samples with different FIDs\n"
" from the final report.\n"
" * 'full' adds raw pairwise comparison data to the report.\n"
" * The P(IBD=0/1/2) estimator employed by this command sometimes yields\n"
" numbers outside the range [0,1]; by default, these are clipped. The\n"
" 'unbounded' modifier turns off this clipping.\n"
" * Then, when PI_HAT^2 < P(IBD=2), 'nudge' adjusts the final P(IBD=0/1/2)\n"
" estimates to a theoretically possible configuration.\n"
" * The computation can be subdivided with --parallel.\n\n"
);
help_print("homozyg\thomozyg-snp\thomozyg-kb\thomozyg-density\thomozyg-gap\thomozyg-het\thomozyg-window-snp\thomozyg-window-het\thomozyg-window-missing\thomozyg-window-threshold", &help_ctrl, 1,
" --homozyg <group | group-verbose> <consensus-match> <extend>\n"
" <subtract-1-from-lengths>\n"
" --homozyg-snp [min var count]\n"
" --homozyg-kb [min length]\n"
" --homozyg-density [max inverse density (kb/var)]\n"
" --homozyg-gap [max internal gap kb length]\n"
" --homozyg-het [max hets]\n"
" --homozyg-window-snp [scanning window size]\n"
" --homozyg-window-het [max hets in scanning window hit]\n"
" --homozyg-window-missing [max missing calls in scanning window hit]\n"
" --homozyg-window-threshold [min scanning window hit rate]\n"
" These commands request a set of run-of-homozygosity reports, and allow you\n"
" to customize how they are generated.\n"
" * If you're satisfied with all the default settings described below, just\n"
" use --homozyg with no modifiers. Otherwise, --homozyg lets you change a\n"
" few binary settings:\n"
" * 'group{-verbose}' adds a report on pools of overlapping runs of\n"
" homozygosity. (Automatically set when --homozyg-match is present.)\n"
" * With 'group{-verbose}', 'consensus-match' causes pairwise segmental\n"
" matches to be called based on the variants in the pool's consensus\n"
" segment, rather than the variants in the pairwise intersection.\n"
" * Due to how the scanning window algorithm works, it is possible for a\n"
" reported ROH to be adjacent to a few homozygous variants. The 'extend'\n"
" modifier causes them to be included in the reported ROH if that\n"
" wouldn't cause a violation of the --homozyg-density bound.\n"
" * By default, segment bp lengths are calculated as [end bp position] -\n"
" [start bp position] + 1. Therefore, reports normally differ slightly\n"
" from PLINK 1.07, which does not add 1 at the end. For testing\n"
" purposes, you can use the 'subtract-1-from-lengths' modifier to apply\n"
" the old formula.\n"
" * By default, only runs of homozygosity containing at least 100 variants,\n"
" and of total length >= 1000 kilobases, are noted. You can change these\n"
" minimums with --homozyg-snp and --homozyg-kb, respectively.\n"
" * By default, a ROH must have at least one variant per 50 kb on average;\n"
" change this bound with --homozyg-density.\n"
" * By default, if two consecutive variants are more than 1000 kb apart, they\n"
" cannot be in the same ROH; change this bound with --homozyg-gap.\n"
" * By default, a ROH can contain an unlimited number of heterozygous calls;\n"
" you can impose a limit with --homozyg-het.\n"
" * By default, the scanning window contains 50 variants; change this with\n"
" --homozyg-window-snp.\n"
" * By default, a scanning window hit can contain at most 1 heterozygous\n"
" call and 5 missing calls; change these limits with --homozyg-window-het\n"
" and --homozyg-window-missing, respectively.\n"
" * By default, for a variant to be eligible for inclusion in a ROH, the hit\n"
" rate of all scanning windows containing the variant must be at least\n"
" 0.05; change this threshold with --homozyg-window-threshold.\n\n"
);
help_print("cluster\tcc\tgroup-avg\tgroup-average\tcluster-missing", &help_ctrl, 1,
" --cluster <cc> <group-avg | old-tiebreaks> <missing> <only2>\n"
" Cluster samples using a pairwise similarity statistic (normally IBS).\n"
" * The 'cc' modifier forces every cluster to have at least one case and one\n"
" control.\n"
" * The 'group-avg' modifier causes clusters to be joined based on average\n"
" instead of minimum pairwise similarity.\n"
" * The 'missing' modifier causes clustering to be based on\n"
" identity-by-missingness instead of identity-by-state, and writes a\n"
" space-delimited identity-by-missingness matrix to disk.\n"
" * The 'only2' modifier causes only a .cluster2 file (which is valid input\n"
" for --within) to be written; otherwise 2 other files will be produced.\n"
" * By default, IBS ties are not broken in the same manner as PLINK 1.07, so\n"
" final cluster solutions tend to differ. This is generally harmless.\n"
" However, to simplify testing, you can use the 'old-tiebreaks' modifier to\n"
" force emulation of the old algorithm.\n\n"
);
#ifndef NOLAPACK
help_print("pca\tmake-rel\tmake-grm\tmake-grm-gz\tmake-grm-bin", &help_ctrl, 1,
" --pca {count} <header> <tabs> <var-wts>\n"
" Calculates a variance-standardized relationship matrix (use\n"
" --make-rel/--make-grm-gz/--make-grm-bin to dump it), and extracts the top\n"
" 20 principal components.\n"
" * It is usually best to perform this calculation on a marker set in\n"
" approximate linkage equilibrium.\n"
" * You can change the number of PCs by passing a numeric parameter.\n"
" * The 'header' modifier adds a header line to the .eigenvec output file.\n"
" (For compatibility with the GCTA flag of the same name, the default is no\n"
" header line.)\n"
" * The 'tabs' modifier causes the .eigenvec file(s) to be tab-delimited.\n"
" * The 'var-wts' modifier requests an additional .eigenvec.var file with PCs\n"
" expressed as variant weights instead of sample weights.\n\n"
);
#endif
help_print("neighbour\tneighbor", &help_ctrl, 1,
" --neighbour [n1] [n2]\n"
" (alias: --neighbor)\n"
" Report IBS distances from each sample to their n1th- to n2th-nearest\n"
" neighbors, associated Z-scores, and the identities of those neighbors.\n"
" Useful for outlier detection.\n\n"
);
help_print("assoc\tmodel\tfisher\tperm\tmperm\tperm-count\tcounts\tp2\tset-test\tmodel-dom\tmodel-gen\tmodel-rec\tmodel-trend\tgenedrop\tqt-means\ttrend", &help_ctrl, 1,
" --assoc <perm | mperm=[value]> <perm-count> <fisher | fisher-midp> <counts>\n"
" <set-test>\n"
/*
" --assoc <perm | mperm=[value]> <genedrop> <perm-count> <fisher | fisher-midp>\n"
" <counts> <set-test>\n"
*/
" --assoc <perm | mperm=[value]> <perm-count> <qt-means> <lin> <set-test>\n"
/*
" --model <perm | mperm=[value]> <genedrop> <perm-count>\n"
*/
" --model <perm | mperm=[value]> <perm-count>\n"
" <fisher | fisher-midp | trend-only> <set-test>\n"
" <dom | rec | gen | trend>\n"
" Basic association analysis report.\n"
" Given a case/control phenotype, --assoc performs a 1df chi-square allelic\n"
" test, while --model performs 4 other tests as well (1df dominant gene\n"
" action, 1df recessive gene action, 2df genotypic, Cochran-Armitage trend).\n"
" * With 'fisher'/'fisher-midp', Fisher's exact test is used to generate\n"
" p-values. 'fisher-midp' also applies Lancaster's mid-p adjustment.\n"
" * 'perm' causes an adaptive permutation test to be performed.\n"
" * 'mperm=[value]' causes a max(T) permutation test with the specified\n"
" number of replications to be performed.\n"
/*
" * 'genedrop' causes offspring genotypes to be regenerated via gene-dropping\n"
" in the permutation test.\n"
*/
" * 'perm-count' causes the permutation test report to include counts instead\n"
" of frequencies.\n"
" * 'counts' causes --assoc to report allele counts instead of frequencies.\n"
" * 'set-test' tests the significance of variant sets. Requires permutation;\n"
" can be customized with --set-p/--set-r2/--set-max.\n"
" * 'dom', 'rec', 'gen', and 'trend' force the corresponding test to be used\n"
" as the basis for --model permutation. (By default, the most significant\n"
" result among the allelic, dominant, and recessive tests is used.)\n"
" * 'trend-only' causes only the trend test to be performed.\n"
" Given a quantitative phenotype, --assoc normally performs a Wald test.\n"
" * In this case, the 'qt-means' modifier causes trait means and standard\n"
" deviations stratified by genotype to be reported as well.\n"
" * 'lin' causes the Lin statistic to be computed, and makes it the basis for\n"
" multiple-testing corrections and permutation tests.\n"
" Several other flags (most notably, --aperm) can be used to customize the\n"
" permutation test.\n\n"
);
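/*
 * Example using modifiers from the syntax above (hypothetical filenames):
 * "plink --bfile mydata --assoc fisher-midp mperm=10000 --out cc" runs the
 * allelic test with mid-p Fisher p-values plus a 10000-replicate max(T)
 * permutation test.
 */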
help_print("mh\tbd\tmh2\thomog\tcmh", &help_ctrl, 1,
" --mh <perm | mperm=[value]> <perm-count> <set-test>\n"
" (alias: --cmh)\n"
" --bd <perm | perm-bd | mperm=[value]> <perm-count> <set-test>\n"
" --mh2\n"
" --homog\n"
" Given a case/control phenotype and a set of clusters, --mh computes 2x2xK\n"
" Cochran-Mantel-Haenszel statistics for each variant, while --bd also\n"
" performs the Breslow-Day test for odds ratio homogeneity. Permutation and\n"
" variant set testing based on the CMH (default) or Breslow-Day (when\n"
" 'perm-bd' is present) statistic are supported.\n"
" The following similar analyses are also available:\n"
" * --mh2 swaps the roles of case/control status and cluster membership,\n"
" performing a phenotype-stratified IxJxK Cochran-Mantel-Haenszel test on\n"
" association between cluster assignments and genotypes.\n"
" * --homog executes an alternative to the Breslow-Day test, based on\n"
" partitioning of the chi-square statistic.\n\n"
);
help_print("gxe\tmcovar", &help_ctrl, 1,
" --gxe {covariate index}\n"
" Given both a quantitative phenotype and a case/control covariate loaded\n"
" with --covar defining two groups, --gxe compares the regression coefficient\n"
" derived from considering only members of one group to the regression\n"
" coefficient derived from considering only members of the other. By\n"
" default, the first covariate in the --covar file defines the groups; use\n"
" e.g. '--gxe 3' to base them on the third covariate instead.\n\n"
);
help_print("linear\tlogistic\tperm\tmperm\tperm-count\tset-test\tgenotypic\thethom\tdominant\trecessive\tno-snp\thide-covar\tsex\tno-x-sex\tinteraction\tstandard-beta\tbeta", &help_ctrl, 1,
/*
" --linear <perm | mperm=[value]> <genedrop> <perm-count> <set-test>\n"
*/
#ifndef NOLAPACK
" --linear <perm | mperm=[value]> <perm-count> <set-test>\n"
" <genotypic | hethom | dominant | recessive | no-snp> <hide-covar>\n"
" <sex | no-x-sex> <interaction> <beta> <standard-beta> <intercept>\n"
#endif
/*
" --logistic <perm | mperm=[value]> <genedrop> <perm-count> <set-test>\n"
*/
" --logistic <perm | mperm=[value]> <perm-count> <set-test>\n"
" <genotypic | hethom | dominant | recessive | no-snp> <hide-covar>\n"
" <sex | no-x-sex> <interaction> <beta>\n"
" Multi-covariate association analysis on a quantitative (--linear) or\n"
" case/control (--logistic) phenotype. Normally used with --covar.\n"
" * 'perm' normally causes an adaptive permutation test to be performed on\n"
" the main effect, while 'mperm=[value]' starts a max(T) permutation test.\n"
/*
" * 'genedrop' causes offspring genotypes to be regenerated via gene-dropping\n"
" in the permutation test.\n"
*/
" * 'perm-count' causes the permutation test report to include counts instead\n"
" of frequencies.\n"
" * 'set-test' tests the significance of variant sets. Requires permutation;\n"
" can be customized with --set-p/--set-r2/--set-max.\n"
" * The 'genotypic' modifier adds an additive effect/dominance deviation 2df\n"
" joint test (0/1/2 and 0/1/0 coding), while 'hethom' uses 0/0/1 and 0/1/0\n"
" coding instead. If permutation is also requested, these modifiers cause\n"
" permutation to be based on the joint test.\n"
" * 'dominant' and 'recessive' specify a model assuming full dominance or\n"
" recessiveness, respectively, for the A1 allele.\n"
" * 'no-snp' causes regression to be performed only on the phenotype and the\n"
" covariates, without reference to genomic data. If permutation is also\n"
" requested, results are reported for all covariates.\n"
" * 'hide-covar' removes covariate-specific lines from the report.\n"
" * By default, sex (male = 1, female = 0) is automatically added as a\n"
" covariate on X chromosome variants, and nowhere else. The 'sex' modifier\n"
" causes it to be added everywhere, while 'no-x-sex' excludes it.\n"
" * 'interaction' adds genotype x covariate interactions to the model. This\n"
" cannot be used with the usual permutation tests; use --tests to define\n"
" the permutation test statistic instead.\n"
" * For logistic regressions, the 'beta' modifier causes regression\n"
" coefficients instead of odds ratios to be reported.\n"
" * With --linear, the 'standard-beta' modifier standardizes the phenotype\n"
" and all predictors to zero mean and unit variance before regression, and\n"
" the 'intercept' modifier adds intercepts to the main report.\n\n"
);
help_print("dosage\twrite-dosage", &help_ctrl, 1,
" --dosage [allele dosage file] <noheader> <skip0=[i]> <skip1=[j]> <skip2=[k]>\n"
" <dose1> <format=[m]> <Zout> <occur | standard-beta> <sex>\n"
" --dosage [list file] list <sepheader | noheader> <skip0=[i]> <skip1=[j]>\n"
" <skip2=[k]> <dose1> <format=[m]> <Zout> <occur | standard-beta>\n"
" <sex>\n"
" --write-dosage\n"