-
Notifications
You must be signed in to change notification settings - Fork 6
/
get_read_length_per_gene.php
executable file
·318 lines (285 loc) · 13 KB
/
get_read_length_per_gene.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/php
<?php
/*******************************************************************************
*
* GetReadLengthPerGene takes Gene IDs and determines the read length
* distribution for its transcripts per sample, from both transcriptome and
* genome alignment SAM files. It requires the mm10_gene_list.txt file to see
* which transcripts belongs to which gene, the mm10_transcript_positions.txt
* file to check the strand and the positions of the transcripts, and the SAM
* files.
* The script creates one file per transcript, the read lengths in the first
* column, and the number of reads with this length per sample in the next
* columns.
* It also creates a summary file, with all read lengths summed up.
*
* Created : 2015-01-13
* Modified : 2015-09-12
* Version : 0.2
*
* Copyright : 2015-2016 Leiden University Medical Center; http://www.LUMC.nl/
* Programmer : Ing. Ivo F.A.C. Fokkema <[email protected]>
*
* Changelog : 0.2 2016-09-12
* Added link to inc-lib-json.php to be compatible with PHP
* versions < 5.2.0.
* 0.1 2015-01-15
* First version.
*
*
* This work is licensed under the Creative Commons
* Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
* copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
* or send a letter to:
* Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
*
*************/
$_SETT =
array(
'version' => '0.2',
'output_file_format' => '{{GENE}}_{{TRANSCRIPT}}_read_length_distribution.txt',
'terminal_width' => 100,
);
echo 'GetReadLengthPerTranscript v.' . $_SETT['version'] . "\n";
// 'PLEASE DO NOT USE THIS SCRIPT ON A NETWORK DRIVE; IT CAN BE INCREDIBLY SLOW THERE.' . "\n\n";
$aFiles = $_SERVER['argv'];
$sScriptName = array_shift($aFiles);
$sCWD = dirname($sScriptName);
if (!function_exists('json_encode') && is_readable($sCWD . '/inc-lib-json.php')) {
require $sCWD . '/inc-lib-json.php'; // For PHP <= 5.2.0.
}
if (count($aFiles) < 4) {
die('Usage: ' . $sScriptName . ' FILE_WITH_GENES_TO_ANALYSE GENE_LIST_FILE TRANSCRIPT_POSITION_FILE SAM_FILE1 [SAM_FILE2 [SAM_FILE3 [...]]]' . "\n\n");
}
// Check if all files can be read.
foreach ($aFiles as $sFile) {
if (!is_readable($sFile)) {
die('Unable to read ' . $sFile . '.' . "\n");
}
}
// First, load file with gene symbols to analyse.
$sGeneFile = array_shift($aFiles);
$aGeneFile = file($sGeneFile, FILE_IGNORE_NEW_LINES);
$aGenesToAnalyze = array(); // '<gene>' => array('<transcript>', '<transcript>', ...);
print('Parsing genes to analyze... ');
foreach ($aGeneFile as $sLine) {
if (!trim($sLine) || $sLine{0} == '#') {
// Comment, header.
continue; // Skip this line, continue to next.
}
// We don't check the file much.
$aGenesToAnalyze[$sLine] = array();
}
unset($aGeneFile);
print('done, loaded ' . count($aGenesToAnalyze) . ' genes in memory.' . "\n");
// Load transcript/gene info file.
$sGeneFile = array_shift($aFiles); // Shifts next argument off the array (the gene info file's name), we don't need it anymore after this.
$aGeneFile = file($sGeneFile, FILE_IGNORE_NEW_LINES);
$aTranscriptsToAnalyze = array(); // transcript => array(chr, strand, transcript_start, transcript_end)
print('Parsing gene/transcript information file... ');
foreach ($aGeneFile as $sLine) {
if (!trim($sLine) || $sLine{0} == '#') {
// Comment, header.
continue; // Skip this line, continue to next.
}
list($sTranscript, $sStrand, $sGene) = explode("\t", $sLine);
if (isset($aGenesToAnalyze[$sGene])) {
$aGenesToAnalyze[$sGene][] = $sTranscript;
$aTranscriptsToAnalyze[$sTranscript] = array();
}
}
unset($aGeneFile);
// If the wrong file has been passed, we have no valid transcripts. Then it will
// make no sense at all to continue.
if (!count($aTranscriptsToAnalyze)) {
die("\n" .
'Didn\'t find any valid transcripts. Make sure you passed the correct gene list and gene information file as the first two arguments.' . "\n\n");
}
print('done, loaded ' . count($aTranscriptsToAnalyze) . ' transcripts in memory.' . "\n");
// Prepare transcript locations file, read into memory.
$sTranscriptPositionsFile = array_shift($aFiles);
$aTranscriptPositionsFile = file($sTranscriptPositionsFile, FILE_IGNORE_NEW_LINES);
$nTranscripts = 0;
print('Parsing transcript locations file... ');
foreach ($aTranscriptPositionsFile as $nLine => $sLine) {
$nLine ++;
if (!trim($sLine) || $sLine{0} == '#') {
continue;
}
if (preg_match('/^([NX][MR]_\d+\.\d+)\t(\d{1,2}|[XY])\t([+-])\t([\[\]0-9,]+)$/', $sLine, $aRegs)) {
// Valid transcript position found.
list(,$sTranscriptWithVersion, $sChr, $sStrand, $sExonPositions) = $aRegs;
$sTranscript = substr($sTranscriptWithVersion, 0, strpos($sTranscriptWithVersion . '.', '.'));
if (isset($aTranscriptsToAnalyze[$sTranscript])) {
if (!($aExonPositions = json_decode($sExonPositions))) {
die("\n" .
'Can\'t parse line ' . $nLine . ' in file ' . $sTranscriptPositionsFile . '.' . "\n\n");
}
$nStart = min($aExonPositions[0]);
$nEnd = max($aExonPositions[count($aExonPositions)-1]);
$aTranscriptsToAnalyze[$sTranscript] = array('chr' . $sChr, $sStrand, $nStart, $nEnd);
$nTranscripts ++;
}
}
}
unset($aTranscriptPositionsFile);
// If the wrong file has been passed, we have no valid transcripts. Then it will
// make no sense at all to continue.
if (!$nTranscripts) {
die("\n" .
'Didn\'t find any valid transcript positions. Make sure you passed the correct transcript location file as the third argument.' . "\n\n");
}
print('done, loaded ' . $nTranscripts . ' transcript positions in memory.' . "\n");
// If this number does not match the number of total transcripts, should we die. We're going to get errors, otherwise...
if (count($aTranscriptsToAnalyze) != $nTranscripts) {
die('Not all transcripts we should analyze, have annotation. Please update the transcripts annotation file: ' . $sTranscriptPositionsFile . "\n\n");
}
print("\n");
$aData = array('' => array()); // 'NR_000001' => array('<length>' => array('<sample>' => '<coverage>'); (transcript '' is for a big summary)
$aSamples = array(); // To easily see which samples we have.
foreach ($aFiles as $sFile) {
$nLine = 0;
// SAM files are BIG. Very, very, very, BIG. Really. HUGE, actually. GBs. We need to read this line by line, to prevent a memory error.
$fIn = @fopen($sFile, 'r');
if (!$fIn) {
die('Unable to open ' . $sFile . '.' . "\n\n");
}
$nFileSize = filesize($sFile);
$nBytesRead = 0;
// Try and see the sample name, and the type of file. This could be done more generally, but for now we just want to be fast.
$sBasename = basename($sFile);
if (preg_match('/^merged_(..).fastq.trunc(.+)_M25.sam$/', $sBasename, $aRegs)) {
$sSample = $aRegs[1];
if (strpos($aRegs[2], 'genome') !== false) {
$sType = 'genome';
} else {
$sType = 'transcriptome';
}
} else {
$sSample = $sBasename;
if (strpos($sBasename, 'genome') !== false) {
$sType = 'genome';
} else {
$sType = 'transcriptome';
}
}
print('Parsing ' . $sSample . ' ' . $sType . ' (' . $sFile . ')' . "\n");
$sSample = $sSample{0}; // Keep 'A' for 'A1' to 'A3'.
$aSamples[] = $sSample;
while ($sLine = fgets($fIn)) {
$nLine ++;
$nBytesRead += strlen($sLine);
$sLine = rtrim($sLine);
if ($sLine{0} == '@') {
// The genomic alignment files come with a long list of headers, all starting with a @.
continue;
}
if ($sType == 'transcriptome') {
list(,, $sReference,,,,,,, $sRead) = explode("\t", $sLine);
$lRead = strlen($sRead);
list(,,, $sTranscriptWithVersion) = explode('|', $sReference);
$sTranscript = substr($sTranscriptWithVersion, 0, strpos($sTranscriptWithVersion . '.', '.'));
$aTranscriptsMatching = array($sTranscript); // Because we need a loop for the genomic SAM file.
} else {
list(, $nBitFlag, $sChr, $nStartPosition,,,,,, $sRead) = explode("\t", $sLine);
$sStrand = ($nBitFlag & 16? '-' : '+');
$lRead = strlen($sRead);
$nEndPosition = $nStartPosition + $lRead;
// Now check if any of the transcripts we're looking for, overlaps with this location.
// One genomic location, may match multiple transcripts.
$aTranscriptsMatching = array();
foreach ($aTranscriptsToAnalyze as $sTranscript => $aTranscript) {
list($sChrTranscript, $sStrandTranscript, $nTranscriptStart, $nTranscriptEnd) = $aTranscript;
if ($sChr == $sChrTranscript && $sStrand == $sStrandTranscript && (($nStartPosition >= $nTranscriptStart && $nStartPosition <= $nTranscriptEnd) || ($nEndPosition >= $nTranscriptStart && $nEndPosition <= $nTranscriptEnd))) {
$aTranscriptsMatching[] = $sTranscript;
}
}
}
if (!$sRead) {
die('Can\'t parse line ' . $nLine . ' in file ' . $sFile . '.' . "\n\n");
}
foreach ($aTranscriptsMatching as $sTranscript) {
if (!isset($aTranscriptsToAnalyze[$sTranscript])) {
// We only count transcripts that we are interested in!
continue;
}
if (!isset($aData[$sTranscript])) {
$aData[$sTranscript] = array();
}
if (!isset($aData[$sTranscript][$lRead])) {
$aData[$sTranscript][$lRead] = array();
}
if (!isset($aData[$sTranscript][$lRead][$sSample])) {
$aData[$sTranscript][$lRead][$sSample] = 0;
}
$aData[$sTranscript][$lRead][$sSample] ++;
}
if (!($nLine % 50000)) {
$nPercentageRead = round($nBytesRead/$nFileSize, 2);
$nAvailableWidth = $_SETT['terminal_width'] - 8 - strlen($nLine);
$lDone = round($nPercentageRead*$nAvailableWidth);
print(str_repeat(chr(8), $_SETT['terminal_width']) .
'[' . str_repeat('=', $lDone) . str_repeat(' ', $nAvailableWidth - $lDone) . '] ' . $nLine . ' ' . str_pad(round($nPercentageRead*100), 3, ' ', STR_PAD_LEFT) . '%');
}
}
$nAvailableWidth = $_SETT['terminal_width'] - 8 - strlen($nLine);
print(str_repeat(chr(8), $_SETT['terminal_width']) .
'[' . str_repeat('=', $nAvailableWidth) . '] ' . $nLine . ' 100%');
fclose($fIn);
print("\n" .
'Done reading ' . $nLine . ' lines.' . "\n");
}
print("\n" .
'All files done, writing output...' . "\n");
$aSamples = array_unique($aSamples); // Needed when we group replicates.
// Loop through genes, loop through transcripts, write output.
foreach ($aGenesToAnalyze as $sGene => $aTranscripts) {
print($sGene . '...');
if (!$aTranscripts) {
print(' (no transcripts found)' . "\n");
continue;
}
foreach ($aTranscripts as $sTranscript) {
print(' ' . $sTranscript);
if (!isset($aData[$sTranscript])) {
print(' (no reads found)');
} else {
ksort($aData[$sTranscript]); // Sort on read length.
$sFile = str_replace(array('{{GENE}}', '{{TRANSCRIPT}}'), array($sGene, $sTranscript), $_SETT['output_file_format']);
$fOut = @fopen($sFile, 'w');
fputs($fOut, 'read_length' . "\t" . implode("\t", $aSamples) . "\r\n");
foreach ($aData[$sTranscript] as $lRead => $aCoverage) {
// Prepare total summary, first.
if (!isset($aData[''][$lRead])) {
$aData[''][$lRead] = 0;
}
// Then the outfile.
fputs($fOut, $lRead);
foreach ($aSamples as $sSample) {
// Total summary first.
$aData[''][$lRead] += (!isset($aCoverage[$sSample])? 0 : $aCoverage[$sSample]);
// Then the output.
fputs($fOut, "\t" . (!isset($aCoverage[$sSample])? 0 : $aCoverage[$sSample]));
}
fputs($fOut, "\r\n");
}
fclose($fOut);
print(' (total coverage: ' . array_sum(array_map("array_sum", $aData[$sTranscript])) . ')');
}
}
print("\n");
}
// Total summary.
print('Total summary...');
ksort($aData['']); // Sort on read length.
$sFile = str_replace(array('{{GENE}}', '{{TRANSCRIPT}}'), array('ALL', 'ALL'), $_SETT['output_file_format']);
$fOut = @fopen($sFile, 'w');
fputs($fOut, 'read_length' . "\t" . 'coverage' . "\r\n");
foreach ($aData[''] as $lRead => $nCoverage) {
// Then the outfile.
fputs($fOut, $lRead . "\t" . $nCoverage . "\r\n");
}
fclose($fOut);
print(' (' . array_sum($aData['']) . ' total coverage)' . "\n");
die('All files done.' . "\n");
?>