forked from jrflab/modules
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile.inc
263 lines (216 loc) · 10.3 KB
/
Makefile.inc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# Layered configuration, read in this order:
#   1. project_config.inc   optional, taken from the working directory
#   2. modules/config.inc   always read
#   3. $(PROJECT_CONFIG)    only when that variable is set
#   4. config.inc           optional, taken from the working directory
ifneq ($(wildcard project_config.inc),)
include project_config.inc
endif

include modules/config.inc

ifdef PROJECT_CONFIG
include $(PROJECT_CONFIG)
endif

ifneq ($(wildcard config.inc),)
include config.inc
endif

# Reference genome: b37 unless REF was already set. The matching settings are
# read from modules/genome_inc/<REF>.inc.
REF ?= b37
include modules/genome_inc/$(REF).inc
ifndef MAKEFILE_INC
# Sample manifests. Each path keeps a value assigned before this point.
SAMPLE_SET_FILE ?= sample_sets.txt
SAMPLE_FILE ?= samples.txt
# generate using scripts/prepareFastq.sh [remove underscores from sample names]
# (The note sits on its own line: as a trailing comment, the blanks in front
# of the '#' became part of the file name.)
SAMPLE_SPLIT_FILE ?= samples.split.txt
# Project label taken from the working directory: everything up to and
# including ".../projects/", ".../data/" or "...kinglab/" is removed and the
# remaining slashes become underscores. Evaluated once (:=) so the pipeline is
# not re-run each time the name is expanded.
PROJECT_NAME := $(shell pwd | sed 's:.*/projects/::; s:.*/data/::; s:.*kinglab/::; s:/:_:g')
# SAMPLES: the contents of SAMPLE_FILE minus the lines that start with '#'.
# Only defined when the file exists and SAMPLES is not set already.
ifneq ($(wildcard $(SAMPLE_FILE)),)
SAMPLES ?= $(shell sed '/^\#/d' $(SAMPLE_FILE))
endif
# $(call get_tumors,list): every word of list except the last one.
# A dummy word is put in front so that words 2..N of "x list" are words
# 1..N-1 of list. No shell is needed, and a list of zero or one word gives an
# empty result.
get_tumors = $(wordlist 2,$(words $1),x $1)
# $(call get_normal,list): the last word of list.
get_normal = $(lastword $1)
# $(call get_space_sets,n): the n-th non-comment line of SAMPLE_SET_FILE with
# every run of whitespace collapsed to one blank and trailing blanks removed.
get_space_sets = $(shell sed '/^\#/d; s/\s\+/ /g; s/\s\+$$//;' $(SAMPLE_SET_FILE) | sed -n '$(1)p')
# $(call get_underscore_sets,n): the same line with its fields joined by '_'.
# Blanks at either end of the line are trimmed BEFORE the join; joining first
# turned trailing whitespace into a trailing '_' in the set name.
get_underscore_sets = $(shell sed '/^\#/d; s/^\s\+//; s/\s\+$$//; s/\s\+/_/g;' $(SAMPLE_SET_FILE) | sed -n '$(1)p')
# Lookup tables built from SAMPLE_SET_FILE; skipped when the file is absent.
# The statements are order dependent: later loops read variables that earlier
# loops create through $(eval).
ifneq ($(wildcard $(SAMPLE_SET_FILE)),)
# Number of non-comment lines (one sample set per line) and the sequence 1..N.
NUM_SETS := $(shell sed '/^\#/d' $(SAMPLE_SET_FILE) | wc -l | cut -f 1 -d' ')
SETS_SEQ := $(shell seq 1 $(NUM_SETS))
# set.<i>: samples of the i-th set, separated by blanks
$(foreach i,$(SETS_SEQ), \
$(eval set.$i := $(call get_space_sets,$i)))
# define set_lookup.$(sample)
# (index of the set that lists the sample; the highest index wins when a
# sample is listed in more than one set)
$(foreach i,$(SETS_SEQ), \
$(foreach sample,$(set.$i), \
$(eval set_lookup.$(sample) := $i)))
# define SAMPLE_SETS to contain sample_sample_normal
$(foreach i,$(SETS_SEQ), \
$(eval SAMPLE_SETS += $(call get_underscore_sets,$i)))
# tumor.SET => tumors
$(foreach i,$(SETS_SEQ), \
$(eval tumor.$(call get_underscore_sets,$i) := $(call get_tumors,$(set.$i))))
# normal.SET => normal
$(foreach i,$(SETS_SEQ), \
$(eval normal.$(call get_underscore_sets,$i) := $(call get_normal,$(set.$i))))
# Flat lists over all sets; duplicates are kept.
NORMAL_SAMPLES = $(foreach i,$(SETS_SEQ),$(call get_normal,$(set.$i)))
TUMOR_SAMPLES = $(foreach i,$(SETS_SEQ),$(call get_tumors,$(set.$i)))
# One tumor_normal word per tumor of every set.
SAMPLE_PAIRS = $(foreach set,$(SAMPLE_SETS),$(foreach tumor,$(tumor.$(set)),$(tumor)_$(normal.$(set))))
# define tumor.pair and normal.pair to retrieve tumor/normal from pairs
$(foreach set,$(SAMPLE_SETS), \
$(foreach tumor,$(tumor.$(set)), \
$(eval tumor.$(tumor)_$(normal.$(set)) := $(tumor))))
$(foreach set,$(SAMPLE_SETS), \
$(foreach tumor,$(tumor.$(set)), \
$(eval normal.$(tumor)_$(normal.$(set)) := $(normal.$(set)))))
# tumor_normal.$(tumor) to retrieve tumor_normal
$(foreach set,$(SAMPLE_SETS), \
$(foreach tumor,$(tumor.$(set)), \
$(eval tumor_normal.$(tumor) := $(tumor)_$(normal.$(set)))))
# The same lookup keyed by the normal; it is overwritten on every iteration,
# so it ends up holding the pair of the last tumor processed for that normal.
$(foreach set,$(SAMPLE_SETS), \
$(foreach tumor,$(tumor.$(set)), \
$(eval tumor_normal.$(normal.$(set)) := $(tumor)_$(normal.$(set)))))
# set.$(sample) to retrieve sample sets (underscores)
$(foreach i,$(SETS_SEQ), \
$(foreach sample,$(set.$i), \
$(eval set.$(sample) := $(word $i,$(SAMPLE_SETS)))))
# Words that occur exactly once across SAMPLE_FILE and SAMPLE_SET_FILE
# (uniq -u); evaluated on use.
UNMATCHED_SAMPLES = $(shell sed '/^\#/d' $(SAMPLE_FILE) $(SAMPLE_SET_FILE) | tr ' ' '\n' | sort | uniq -u)
# Sorted, de-duplicated union of the pair names and the set names.
SAMPLE_SET_PAIRS = $(shell echo "$(SAMPLE_PAIRS) $(SAMPLE_SETS)" | tr ' ' '\n' | sort | uniq)
# samples.<set or pair>: the name split back into its members at every '_'.
# Done with $(subst) in-process; the earlier echo | sed call forked a shell for
# each name to produce the same text.
$(foreach set,$(SAMPLE_SET_PAIRS), \
$(eval samples.$(set) := $(subst _, ,$(set))))
endif
# Tables built from SAMPLE_SPLIT_FILE; skipped when the file is absent.
ifneq ($(wildcard $(SAMPLE_SPLIT_FILE)),)
# A / B: first / second tab-separated column of the file. Both are read once
# (:=); as recursive variables every $(A) / $(B) reference in the loop below
# re-ran cut, i.e. two extra shell calls per row.
A := $(shell cut -f1 $(SAMPLE_SPLIT_FILE))
B := $(shell cut -f2 $(SAMPLE_SPLIT_FILE))
# split.<a>: all column-2 entries whose row has <a> in column 1.
$(foreach i,$(shell seq 1 $(words $(A))),$(eval split.$(word $i,$(A)) += $(word $i,$(B))))
# Column 2 as read above. Captured now so that a later reuse of the short
# name B elsewhere cannot change this list.
UNSPLIT_SAMPLES := $(B)
# Column 1, sorted and de-duplicated.
SPLIT_SAMPLES := $(shell cut -f1 $(SAMPLE_SPLIT_FILE) | sort | uniq)
endif
# Tumor_normal pair names followed by the plain sample names; expanded on use.
ALL_SAMPLES = $(SAMPLE_PAIRS) $(SAMPLES)

# Shell that make uses to run commands.
SHELL := /bin/bash

# Scratch directory, exported to the environment of every command.
TMPDIR := $(HOME)/share/data/$(USER)/tmp
export TMPDIR
# PICARD tools
# Every FOO_MEM macro takes the JVM heap size as $(1), for example
# $(call SORT_SAM_MEM,4G); the matching FOO variable is the same command line
# at the default heap size $(PICARD_MEM). All of them start from $(JAVA) and a
# jar in $(PICARD_DIR) and end with $(PICARD_OPTS).
PICARD_MEM = 10G
PICARD_OPTS = VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=4000000
PICARD_DIR ?= $(HOME)/share/usr/lib/java
ANALYZE_COV = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/AnalyzeCovariates.jar $(PICARD_OPTS)
SORT_SAM = $(call SORT_SAM_MEM,$(PICARD_MEM))
SORT_SAM_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/SortSam.jar $(PICARD_OPTS) TMP_DIR=$(TMPDIR)
REORDER_SAM = $(call REORDER_SAM_MEM,$(PICARD_MEM))
REORDER_SAM_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/ReorderSam.jar $(PICARD_OPTS)
# Previously written as $(call $(MARK_DUP_MEM,$(PICARD_MEM))): the inner
# $(...) named a variable that does not exist, so MARK_DUP expanded to nothing.
MARK_DUP = $(call MARK_DUP_MEM,$(PICARD_MEM))
MARK_DUP_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/MarkDuplicates.jar $(PICARD_OPTS)
MERGE_SAMS = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/MergeSamFiles.jar $(PICARD_OPTS)
INTERVAL_LIST_TOOL = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/IntervalListTools.jar $(PICARD_OPTS)
CREATE_SEQ_DICT = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CreateSequenceDictionary.jar $(PICARD_OPTS)
CALC_HS_METRICS = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CalculateHsMetrics.jar $(PICARD_OPTS)
COLLECT_MULT_METRICS = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CollectMultipleMetrics.jar $(PICARD_OPTS)
COLLECT_TARGETED_METRICS = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CollectTargetedPcrMetrics.jar $(PICARD_OPTS)
FIX_MATE = $(call FIX_MATE_MEM,$(PICARD_MEM))
FIX_MATE_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/FixMateInformation.jar $(PICARD_OPTS) TMP_DIR=$(TMPDIR)
SAM_TO_FASTQ = $(call SAM_TO_FASTQ_MEM,$(PICARD_MEM))
SAM_TO_FASTQ_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/SamToFastq.jar $(PICARD_OPTS)
CLEANBAM = $(call CLEANBAM_MEM,$(PICARD_MEM))
CLEANBAM_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/CleanSam.jar $(PICARD_OPTS)
ADD_RG = $(call ADD_RG_MEM,$(PICARD_MEM))
ADD_RG_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/AddOrReplaceReadGroups.jar $(PICARD_OPTS)
VALIDATE_SAM = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/ValidateSamFile.jar $(PICARD_OPTS)
# $(call PICARD,args,heap): runs $(PICARD2_JAR) under $(JAVA8) with heap size
# $(2); $(1) is placed directly after the jar (presumably the tool name).
PICARD = $(JAVA8) -Xmx$(2) -jar $(PICARD2_JAR) $(1) $(PICARD_OPTS)
# VarScan under $(JAVA8) with a fixed 10G heap.
VARSCAN = $(JAVA8) -Xmx10G -jar $(VARSCAN_JAR)
# SNP EFF
# snpEff and SnpSift launchers under $(JAVA8). The *_MEM forms take the JVM
# heap size as $(1); the plain forms use $(DEFAULT_JAVA_MEM).
SNP_EFF_MEM = $(JAVA8) -Xmx$(1) -jar $(SNP_EFF_JAR)
SNP_EFF = $(call SNP_EFF_MEM,$(DEFAULT_JAVA_MEM))
SNP_SIFT_MEM = $(JAVA8) -Xmx$(1) -jar $(SNP_SIFT_JAR)
SNP_SIFT = $(call SNP_SIFT_MEM,$(DEFAULT_JAVA_MEM))
# Helper scripts for effect annotations.
VCF_EFF_ONE_PER_LINE = $(HOME)/share/usr/snpEff-4.1/scripts/vcfEffOnePerLine.pl
VCF_JOIN_EFF = modules/scripts/joinEff.pl
# $(call COUNT_SAMPLES,path): number of underscores, plus one, in what is left
# of path after cutting everything from its first '.' onward and then dropping
# any leading directory part. Uses a bash here-string.
COUNT_SAMPLES = $(shell expr $$(sed 's/\..*//; s:.*/::' <<< $(1) | grep -o "_" | wc -l) + 1)
# Arguments shared by every JVM launcher: java.io.tmpdir points at $(TMPDIR).
JAVA_ARGS = -Djava.io.tmpdir=$(TMPDIR)
# Plain "java" as found on PATH, with the shared arguments (expanded here).
JAVA := java $(JAVA_ARGS)
# JAVA6, JAVA7, JAVA8: the binary named by JAVA<n>_BIN followed by the shared
# arguments. Each is assigned with := and therefore expanded at this point.
$(foreach v,6 7 8,$(eval JAVA$(v) := $$(JAVA$(v)_BIN) $$(JAVA_ARGS)))
# With NO_RM=true the removal commands are replaced by touch; any other value
# selects the real rm commands.
NO_RM ?= false
ifneq ($(NO_RM),true)
RM := rm -f
RMR := rm -r
else
RM := touch
RMR = touch
endif
# define $(,) and $( ) for escaping commas and spaces
, := ,
# $(space) holds exactly one blank, built from two empty expansions.
# The former idiom ("space :=" followed by "space +=") relied on += inserting
# a blank even when nothing is appended. GNU Make 4.3 and later no longer do
# that, so $(space) stayed empty there and the "$(space) :=" line aborted with
# "empty variable name".
empty :=
space := $(empty) $(empty)
# A variable whose NAME is one blank, so that $( ) also expands to a blank.
$(space) := $(space)
# $(call strip-suffix,filename): the part of filename before its first '.'
strip-suffix = $(word 1,$(subst ., ,$(1)))
# $(call LINK,file,linkname): shell command that creates or replaces the
# symlink linkname pointing at the bare file name of `file` (directory part
# removed) and then touches `file`.
LINK = ln -svf $(notdir $(1)) $(2) && touch $(1)
# Date at the time this file is read, in date's %F format.
NOW := $(shell date +"%F")
# Directory creation with mode 775; MKDIRS makes the directory of the current
# target and the matching directory below $(LOGDIR).
MKDIR = mkdir -p -m 775
MKDIRS = $(MKDIR) $(LOGDIR)/$(@D) $(@D)
# Log file of the current target.
LOG = $(PWD)/$(LOGDIR)/$(@).log
# umask value used by the recipe prologue.
UMASK = 002
# Put the bin directory of $(JRFLAB_MODULES_ENV) first on PATH and export it
# to every command.
export PATH := $(JRFLAB_MODULES_ENV)/bin:$(PATH)
# Same command sequence as ACTIVATE_ENV below, with the dollar signs, the
# backticks and the '*' additionally escaped by backslashes.
# NOTE(review): presumably meant for text that is echoed into a script and
# interpreted later; confirm against the callers.
ECHO_ACTIVATE_ENV = unset PYTHONPATH; OLDPATH=\$$PATH; tries=0; until [[ \$$tries -gt 10 ]] || source $1/bin/activate $1; do export PATH=\$$OLDPATH; tries=\`expr \$$tries + 1\`; jitter=\`expr \$$RANDOM % 10 + 1\`; sleep \`expr \$$jitter \\* \$$tries\`; done
# $(call ACTIVATE_ENV,env_dir): shell snippet that unsets PYTHONPATH, saves
# PATH, and then keeps running "source env_dir/bin/activate env_dir" until it
# succeeds or the failure counter exceeds 10. After each failure PATH is reset
# to the saved value and the shell sleeps (random 1..10) x (failures so far)
# seconds before the next attempt.
ACTIVATE_ENV = unset PYTHONPATH; OLDPATH=$$PATH; tries=0; until [[ $$tries -gt 10 ]] || source $1/bin/activate $1; do export PATH=$$OLDPATH; tries=`expr $$tries + 1`; jitter=`expr $$RANDOM % 10 + 1`; sleep `expr $$jitter \* $$tries`; done
# SGE variables
#QUEUE ?= jrf.q all.q
# Queue selection; empty unless set beforehand.
QUEUE ?=
WALLTIMES = MEM SHORT LONG
### PBS
# Host list for the PBS engine. RUN_OPTS passes $(<CLUSTER_ENGINE>_NODES) to
# the run script as --servers.
PBS_NODES = gpu-2-14 cpu-6-1 gpu-1-4
#### SGE
SGE_PARALLEL_ENV = smp
# Host list for the SGE engine (see the note on PBS_NODES).
SGE_NODES = e02 e06
SGE_QSUB_SCRIPT = python modules/scripts/qsub.py
export SGE_ROOT = /common/sge
# PID of this make process: the parent PID seen by the shell that make spawns.
MAKE_PID := $(shell echo $$PPID)
# The -j flag of this make invocation, recovered from the "ps T" line that
# starts with MAKE_PID and mentions $(MAKE). "-j N" is first joined to "-jN"
# so that $(filter) can pick it out as a single word; empty when no such flag
# is found.
JOB_FLAG := $(filter -j%, $(subst -j ,-j,$(shell ps T | grep "^\s*$(MAKE_PID).*$(MAKE)")))
# JOB_FLAG without the "-j" prefix.
JOBS := $(subst -j,,$(JOB_FLAG))
# Job dispatch defaults; both keep a value assigned beforehand.
CLUSTER_ENGINE ?= SGE
USE_CLUSTER ?= true

# Recipe prologue: set the umask; when USE_CLUSTER contains "true" and a -j
# job count was detected, sleep a random 1-20 seconds (presumably to stagger
# submissions); then create the target's output and log directories.
RUN_FIRST = umask $(UMASK); $(if $(and $(findstring true,$(USE_CLUSTER)),$(JOBS)),sleep $$[ ( $$RANDOM % 20 ) + 1 ]s;) $(MKDIRS)

# $(call INIT_ENV,env_dir): prologue, activation of env_dir, umask again, and
# pipefail for the commands that follow on the same shell line.
INIT_ENV = $(RUN_FIRST); $(call ACTIVATE_ENV,$(1)); umask $(UMASK); set -o pipefail;
# INIT: the same for the default environment $(JRFLAB_MODULES_ENV).
INIT = $(call INIT_ENV,$(JRFLAB_MODULES_ENV))

# Options handed to modules/scripts/run.py for the current target:
#   -l           only when USE_CLUSTER contains "false"
#   -o           the target
#   -g           the cluster engine, when one is set
#   --servers    $(<engine>_NODES), when that list is not empty
#   -p           the project name, when it is not empty
RUN_OPTS = $(if $(findstring false,$(USE_CLUSTER)),-l) -o $@ \
    $(if $(CLUSTER_ENGINE),-g $(CLUSTER_ENGINE)) \
    --default_env $(JRFLAB_MODULES_ENV) \
    -S $(SHELL) \
    --log_file $(LOG) \
    $(if $($(CLUSTER_ENGINE)_NODES),--servers $($(CLUSTER_ENGINE)_NODES)) \
    $(if $(PROJECT_NAME),-p $(PROJECT_NAME))

#usage $(call RUN,args,script)
# The script text is echoed into run.py; args follow the standard options.
RUN = echo $(2) | python modules/scripts/run.py $(RUN_OPTS) $(1)
# Recipe body for a *.md5 target: checksum the file named like the target
# without its .md5 suffix and write the result to the target.
MD5 = md5sum $(patsubst %.md5,%,$@) > $@
# Check every *.md5 prerequisite with md5sum -c, at most 21 times with 5 s
# between attempts, stopping at the first success.
# NOTE(review): the loop ends on "sleep 5" when every attempt fails, so the
# snippet itself does not report the failure - confirm this is intended.
CHECK_MD5 = for i in {0..20}; do if md5sum -c $(filter %.md5,$^); then break; fi; sleep 5; done;
# Automatic variables with a trailing .md5 removed from every word:
# $(<M) first prerequisite, $(^M) all prerequisites, $(@M) target,
# $(<<M) / $(<<<M) second / third word of $(^M).
<M = $(patsubst %.md5,%,$<)
^M = $(patsubst %.md5,%,$^)
@M = $(patsubst %.md5,%,$@)
<<M = $(word 2,$(^M))
<<<M = $(word 3,$(^M))
# $(call CHECK_VCF,command)
# Recipe helper that works on the first prerequisite ($<) and the target ($@):
# the target directory is created, then a header-only VCF (no line without a
# leading '#', exactly one line starting with #CHROM) is copied to the target
# unchanged; for any other input `command` is run instead.
CHECK_VCF = mkdir -p $(@D); if [ `grep -v '^\#' $< | wc -l` -eq 0 ] && [ `grep '^\#CHROM' $< | wc -l` -eq 1 ]; then cp $< $@; else $1; fi
# Same scheme for MAF files: the input is copied when it has exactly one line
# without a leading '#' and exactly one line starting with Hugo_Symbol.
CHECK_MAF = mkdir -p $(@D); if [ `grep -v '^\#' $< | wc -l` -eq 1 ] && [ `grep '^Hugo_Symbol' $< | wc -l` -eq 1 ]; then cp $< $@; else $1; fi
# Like CHECK_VCF, except that a header-only input is written to the target
# with an UPS-COORDINATE column name inserted between FILTER and INFO.
CHECK_UVCF = mkdir -p $(@D); if [ `grep -v '^\#' $< | wc -l` -eq 0 ] && [ `grep '^\#CHROM' $< | wc -l` -eq 1 ]; then sed 's/FILTER\tINFO/FILTER\tUPS-COORDINATE\tINFO/' $< > $@; else $1; fi
# usage: $(call VERIFY_VCF,tmp,final_dest)
# Moves tmp to final_dest (and a non-empty tmp.idx to final_dest.idx) when tmp
# contains a line starting with #CHROM; otherwise the snippet ends in `false`.
VERIFY_VCF = if grep -q '^\#CHROM' $1; then mv $1 $2 && if [ -s $1.idx ]; then mv $1.idx $2.idx; fi ; else false; fi
# Variant for gzipped input: searches with zgrep and moves the .tbi file.
VERIFY_VCF_GZ = if zgrep -q '^\#CHROM' $1; then mv $1 $2 && if [ -s $1.tbi ]; then mv $1.tbi $2.tbi; fi ; else false; fi
# Positional prerequisites: $(<<) is the 2nd word of $^, $(<<<) the 3rd,
# $(<<<<) and $(4<) the 4th, $(<<<<<) and $(5<) the 5th.
<< = $(word 2,$(^))
<<< = $(word 3,$(^))
4< = $(word 4,$(^))
<<<< = $(4<)
5< = $(word 5,$(^))
<<<<< = $(5<)
# Script launchers.
KNIT = $(RSCRIPT) modules/scripts/knit.R
PASS_FILTER_VCF = python modules/vcf_tools/pass_filter_vcf.py
# R preamble as multi-line text: creates the directory of the current target
# (recursively, without warnings) and defines qw(), which splits a string at
# runs of whitespace. $(@D) is filled in when R_INIT is expanded.
define R_INIT
dir.create('$(@D)', showWarnings = F, recursive = T)
qw <- function(x) unlist(strsplit(x, "[[:space:]]+"))
endef
# DRMAA shared library for the selected scheduler. Set and exported only when
# USE_CLUSTER is exactly "true": one path for SGE, one for LSF, and the torque
# path for every other engine value.
ifeq ($(USE_CLUSTER),true)
ifeq ($(CLUSTER_ENGINE),SGE)
export DRMAA_LIBRARY_PATH = /common/sge/lib/lx24-amd64/libdrmaa.so.1.0
else ifeq ($(CLUSTER_ENGINE),LSF)
export DRMAA_LIBRARY_PATH = /admin/lsf/10.1/linux3.10-glibc2.17-x86_64/lib/libdrmaa.so
else
export DRMAA_LIBRARY_PATH = /opt/torque/lib/libdrmaa.so.1
endif
endif
endif
MAKEFILE_INC = true