From d7f0b9930a364528c0698dd8679765b35b6b7cb3 Mon Sep 17 00:00:00 2001
From: snayfach <snayfach@gmail.com>
Date: Thu, 15 Sep 2016 13:50:06 -0700
Subject: [PATCH] Fixed bug with bz2 files. Added check for file compression

---
 midas/utility.py     | 12 +++++++-
 scripts/run_midas.py | 70 ++++++++++++++++++++++++++------------------
 2 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/midas/utility.py b/midas/utility.py
index e9afd7b..6de989b 100644
--- a/midas/utility.py
+++ b/midas/utility.py
@@ -4,7 +4,7 @@
 # Copyright (C) 2015 Stephen Nayfach
 # Freely distributed under the GNU General Public License (GPLv3)
 
-import io, os, stat, sys, resource, gzip, platform, subprocess
+import io, os, stat, sys, resource, gzip, platform, subprocess, bz2
 
 __version__ = '1.0.0'
 
@@ -113,6 +113,16 @@ def auto_detect_file_type(inpath):
 		else: sys.exit("Filetype [fasta, fastq] of %s could not be recognized" % inpath)
 	infile.close()
 
+def check_compression(inpath):
+	""" Check that file extension matches expected compression """
+	ext = inpath.split('.')[-1]
+	file = iopen(inpath)
+	try:
+		next(file)
+		file.close()
+	except:
+		sys.exit("\nError: File extension '%s' does not match expected compression" % ext)
+
 def iopen(inpath, mode='r'):
 	""" Open input file for reading regardless of compression [gzip, bzip] or python version """
 	ext = inpath.split('.')[-1]
diff --git a/scripts/run_midas.py b/scripts/run_midas.py
index 8b0f713..80daa41 100755
--- a/scripts/run_midas.py
+++ b/scripts/run_midas.py
@@ -125,9 +125,12 @@ def species_arguments():
 	parser.add_argument('program', help=argparse.SUPPRESS)
 	parser.add_argument('outdir', type=str, help='Path to directory to store results. Name should correspond to sample identifier.')
 	parser.add_argument('-1', type=str, dest='m1', required=True,
-		help="FASTA/FASTQ file containing 1st mate if paired or unpaired reads")
+		help="""FASTA/FASTQ file containing 1st mate if using paired-end reads.
+Otherwise FASTA/FASTQ containing unpaired reads.
+Can be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)""")
 	parser.add_argument('-2', type=str, dest='m2',
-		help="FASTA/FASTQ file containing 2nd mate if paired")
+		help="""FASTA/FASTQ file containing 2nd mate if using paired-end reads.
+Can be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)""")
 	parser.add_argument('-n', type=int, dest='max_reads',
 		help="""Number of reads to use from input file(s) (use all)""")
 	parser.add_argument('-t', dest='threads', default=1,
@@ -170,14 +173,17 @@ def check_species(args):
 	if not os.path.isdir('%s/species' % args['outdir']):
 		os.makedirs('%s/species' % args['outdir'])
 	if args['word_size'] < 12:
-		sys.exit("\nInvalid word size: %s. Must be greater than or equal to 12" % args['word_size'])
+		sys.exit("\nError: Invalid word size: %s. Must be greater than or equal to 12" % args['word_size'])
 	if args['mapid'] and (args['mapid'] < 0 or args['mapid'] > 100):
-		sys.exit("\nInvalid mapping identity: %s. Must be between 0 and 100" % args['mapid'])
+		sys.exit("\nError: Invalid mapping identity: %s. Must be between 0 and 100" % args['mapid'])
 	if args['aln_cov'] < 0 or args['aln_cov'] > 1:
-		sys.exit("\nInvalid alignment coverage: %s. Must be between 0 and 1" % args['aln_cov'])
+		sys.exit("\nError: Invalid alignment coverage: %s. Must be between 0 and 1" % args['aln_cov'])
 	for arg in ['m1', 'm2']:
 		if args[arg] and not os.path.isfile(args[arg]):
-			sys.exit("\nInput file does not exist: '%s'" % args[arg])
+			sys.exit("\nError: Input file does not exist: '%s'" % args[arg])
+	# check input compression
+	if args['m1']: utility.check_compression(args['m1'])
+	if args['m2']: utility.check_compression(args['m2'])
 
 def create_directories(program, args):
 	dirs = [args['outdir']]
@@ -254,9 +260,9 @@ def gene_arguments():
 	db.add_argument('--species_id', type=str, dest='gc_id', metavar='CHAR', help='One or more species identifiers to include in database. Separate ids with a comma')
 	align = parser.add_argument_group('Read alignment options (if using --align)')
 	align.add_argument('-1', type=str, dest='m1',
-		help='FASTA/FASTQ file containing 1st mate if paired or unpaired reads')
+		help='FASTA/FASTQ file containing 1st mate if using paired-end reads.\nOtherwise FASTA/FASTQ containing unpaired reads.\nCan be gzip\'ed (extension: .gz) or bzip2\'ed (extension: .bz2)')
 	align.add_argument('-2', type=str, dest='m2',
-		help='FASTA/FASTQ file containing 2nd mate if paired')
+		help='FASTA/FASTQ file containing 2nd mate if using paired-end reads.\nCan be gzip\'ed (extension: .gz) or bzip2\'ed (extension: .bz2)')
 	align.add_argument('-s', type=str, dest='speed', default='very-sensitive',
 		choices=['very-fast', 'fast', 'sensitive', 'very-sensitive'],
 		help='Alignment speed/sensitivity (very-sensitive)')
@@ -388,8 +394,10 @@ def snp_arguments():
 	db.add_argument('--species_topn', type=int, dest='gc_topn', metavar='INT', help='Include top N most abundant species')
 	db.add_argument('--species_id', type=str, dest='gc_id', metavar='CHAR', help='One or more species identifiers to include in database. Separate ids with a comma')
 	align = parser.add_argument_group('Read alignment options (if using --align)')
-	align.add_argument('-1', type=str, dest='m1', help='FASTA/FASTQ file containing 1st mate if paired or unpaired reads')
-	align.add_argument('-2', type=str, dest='m2', help='FASTA/FASTQ file containing 2nd mate if paired')
+	align.add_argument('-1', type=str, dest='m1',
+		help='FASTA/FASTQ file containing 1st mate if using paired-end reads.\nOtherwise FASTA/FASTQ containing unpaired reads.\nCan be gzip\'ed (extension: .gz) or bzip2\'ed (extension: .bz2)')
+	align.add_argument('-2', type=str, dest='m2',
+		help='FASTA/FASTQ file containing 2nd mate if using paired-end reads.\nCan be gzip\'ed (extension: .gz) or bzip2\'ed (extension: .bz2)')
 	align.add_argument('-s', type=str, dest='speed', default='very-sensitive',
 		choices=['very-fast', 'fast', 'sensitive', 'very-sensitive'],
 		help='Bowtie2 alignment speed/sensitivity (very-sensitive)')
@@ -474,36 +482,39 @@ def check_genes(args):
 	profile='%s/species/species_profile.txt' % args['outdir']
 	if not os.path.isfile(profile):
 		if (args['gc_topn'] or args['gc_cov']) and args['build_db']:
-			sys.exit("\nCould not find species abundance profile: %s\nTo specify species with --species_topn or --species_cov you must have run: run_midas.py species" % profile)
+			sys.exit("\nError: Could not find species abundance profile: %s\nTo specify species with --species_topn or --species_cov you must have run: run_midas.py species" % profile)
 	# no database but --align specified
 	if (args['align']
 		and not args['build_db']
 		and not os.path.isfile('%s/genes/temp/pangenomes.fa' % args['outdir'])):
-		error = "\nYou've specified --align, but no database has been built"
+		error = "\nError: You've specified --align, but no database has been built"
 		error += "\nTry running with --build_db"
 		sys.exit(error)
 	# no bamfile but --cov specified
 	if (args['cov']
 		and not args['align']
 		and not os.path.isfile('%s/genes/temp/pangenome.bam' % args['outdir'])):
-		error = "\nYou've specified --call_genes, but no alignments were found"
+		error = "\nError: You've specified --call_genes, but no alignments were found"
 		error += "\nTry running with --align"
 		sys.exit(error)
 	# no reads
 	if args['align'] and not args['m1']:
-		sys.exit("\nTo align reads, you must specify path to input FASTA/FASTQ")
+		sys.exit("\nError: To align reads, you must specify path to input FASTA/FASTQ")
 	# check input file paths
 	for arg in ['m1', 'm2']:
 		if args[arg] and not os.path.isfile(args[arg]):
-			sys.exit("\nInput file does not exist: '%s'" % args[arg])
+			sys.exit("\nError: Input file does not exist: '%s'" % args[arg])
+	# check input compression
+	if args['m1']: utility.check_compression(args['m1'])
+	if args['m2']: utility.check_compression(args['m2'])
 	# input options
 	if args['m2'] and not args['m1']:
-		sys.exit("\nMust specify -1 and -2 if aligning paired end reads")
+		sys.exit("\nError: Must specify -1 and -2 if aligning paired end reads")
 	# sanity check input values
 	if args['mapid'] < 1 or args['mapid'] > 100:
-		sys.exit("\nMAPID must be between 1 and 100")
+		sys.exit("\nError: MAPID must be between 1 and 100")
 	if args['aln_cov'] < 0 or args['aln_cov'] > 1:
-		sys.exit("\nALN_COV must be between 0 and 1")
+		sys.exit("\nError: ALN_COV must be between 0 and 1")
 
 def check_snps(args):
 	""" Check validity of command line arguments """
@@ -522,12 +533,12 @@ def check_snps(args):
 	profile='%s/species/species_profile.txt' % args['outdir']
 	if not os.path.isfile(profile):
 		if (args['gc_topn'] or args['gc_cov']) and args['build_db']:
-			sys.exit("\nCould not find species abundance profile: %s\nTo specify species with --species_topn or --species_cov you must have run: run_midas.py species" % profile)
+			sys.exit("\nError: Could not find species abundance profile: %s\nTo specify species with --species_topn or --species_cov you must have run: run_midas.py species" % profile)
 	# no database but --align specified
 	if (args['align']
 		and not args['build_db']
 		and not os.path.isfile('%s/snps/temp/genomes.fa' % args['outdir'])):
-		error = "\nYou've specified --align, but no database has been built"
+		error = "\nError: You've specified --align, but no database has been built"
 		error += "\nTry running with --build_db"
 		sys.exit(error)
 	# no bamfile but --call specified
@@ -535,7 +546,7 @@ def check_snps(args):
 		and not args['align']
 		and not os.path.isfile('%s/snps/temp/genomes.bam' % args['outdir'])
 		):
-		error = "\nYou've specified --call_snps, but no alignments were found"
+		error = "\nError: You've specified --call_snps, but no alignments were found"
 		error += "\nTry running with --align"
 		sys.exit(error)
 	# no genomes but --call specified
@@ -543,26 +554,29 @@ def check_snps(args):
 		and not args['build_db']
 		and not os.path.isfile('%s/snps/temp/genomes.fa' % args['outdir'])
 		):
-		error = "\nYou've specified --call_snps, but the no genome database was found"
+		error = "\nError: You've specified --call_snps, but the no genome database was found"
 		error += "\nTry running with --build_db"
 		sys.exit(error)
 	# no reads
 	if args['align'] and not args['m1']:
-		sys.exit("\nTo align reads, you must specify path to input FASTA/FASTQ")
+		sys.exit("\nError: To align reads, you must specify path to input FASTA/FASTQ")
 	# check input file paths
 	for arg in ['m1', 'm2']:
 		if args[arg] and not os.path.isfile(args[arg]):
-			sys.exit("\nInput file does not exist: '%s'" % args[arg])
+			sys.exit("\nError: Input file does not exist: '%s'" % args[arg])
+	# check input compression
+	if args['m1']: utility.check_compression(args['m1'])
+	if args['m2']: utility.check_compression(args['m2'])
 	# input options
 	if args['m2'] and not args['m1']:
-		sys.exit("\nMust specify -1 and -2 if aligning paired end reads")
+		sys.exit("\nError: Must specify -1 and -2 if aligning paired end reads")
 	# sanity check input values
 	if args['mapid'] < 1 or args['mapid'] > 100:
-		sys.exit("\nMAPQ must be between 1 and 100")
+		sys.exit("\nError: MAPQ must be between 1 and 100")
 	if args['mapq'] < 0 or args['mapq'] > 100:
-		sys.exit("\nMAPQ must be between 0 and 100")
+		sys.exit("\nError: MAPQ must be between 0 and 100")
 	if args['baseq'] < 0 or args['baseq'] > 100:
-		sys.exit("\nBASEQ must be between 0 and 100")
+		sys.exit("\nError: BASEQ must be between 0 and 100")
 
 if __name__ == '__main__':