From 29ff0ccf55992d8fa2c34507352b9150ac4c0a0b Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Tue, 28 Nov 2023 14:59:51 +0000
Subject: [PATCH] fix assert which fails when no matches are found

---
 pgscatalog_utils/match/combine_matches.py | 33 ++++++++++++++++++-----
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py
index a716f3f..32597c6 100644
--- a/pgscatalog_utils/match/combine_matches.py
+++ b/pgscatalog_utils/match/combine_matches.py
@@ -23,9 +23,12 @@ def combine_matches():
     config.OUTDIR = args.outdir
 
     with pl.StringCache():
-        scorefile = read_scorefile(path=args.scorefile, chrom=None)  # chrom=None to read all variants
+        scorefile = read_scorefile(path=args.scorefile,
+                                   chrom=None)  # chrom=None to read all variants
         logger.debug("Reading matches")
-        matches = pl.concat([pl.scan_ipc(x, memory_map=False, rechunk=False) for x in args.matches], rechunk=False)
+        matches = pl.concat(
+            [pl.scan_ipc(x, memory_map=False, rechunk=False) for x in args.matches],
+            rechunk=False)
 
         logger.debug("Labelling match candidates")
         params: dict[str, bool] = make_params_dict(args)
@@ -49,7 +52,20 @@ def _check_duplicate_vars(matches: pl.LazyFrame):
                       .collect()
                       .get_column('count')
                       .to_list())
-    assert max_occurrence == [1], "Duplicate IDs in final matches"
+
+    match n := max_occurrence[0]:
+        case None:
+            logger.critical("No variant matches found")
+            logger.critical(
+                "Did you set the correct genome build? Did you impute your genomes?")
+            raise ValueError
+        case _ if n > 1:
+            logger.critical("Duplicate IDs in final matches")
+            logger.critical(
+                "Please double check your genomes for duplicates and try again")
+            raise ValueError
+        case _:
+            logger.info("No duplicate variants found")
 
 
 def _parse_args(args=None):
@@ -61,18 +77,21 @@ def _parse_args(args=None):
     parser.add_argument('-m', '--matches', dest='matches', required=True, nargs='+',
                         help='<Required> List of match files')
     parser.add_argument('--min_overlap', dest='min_overlap', required=True,
-                        type=float, help='<Required> Minimum proportion of variants to match before error')
+                        type=float,
+                        help='<Required> Minimum proportion of variants to match before error')
     parser.add_argument('-IDs', '--filter_IDs', dest='filter',
                         help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
                              '[useful for limiting scoring files to variants present in multiple datasets]')
-    parser = add_match_args(parser) # params for labelling matches
+    parser = add_match_args(parser)  # params for labelling matches
     parser.add_argument('--outdir', dest='outdir', required=True,
                         help='<Required> Output directory')
     parser.add_argument('--split', dest='split', default=False, action='store_true',
                         help='<Optional> Write scorefiles split per chromosome?')
-    parser.add_argument('--combined', dest='combined', default=False, action='store_true',
+    parser.add_argument('--combined', dest='combined', default=False,
+                        action='store_true',
                         help='<Optional> Write scorefiles in combined format?')
-    parser.add_argument('-n', dest='n_threads', default=1, help='<Optional> n threads for matching', type=int)
+    parser.add_argument('-n', dest='n_threads', default=1,
+                        help='<Optional> n threads for matching', type=int)
     parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                         help='<Optional> Extra logging information')
     return parser.parse_args(args)
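
The replacement logic can be exercised on its own. Below is a minimal standalone sketch (not part of the patch): the helper name check_max_occurrence and the use of Python's built-in logging in place of the module's logger are illustrative assumptions, but the match statement mirrors the new _check_duplicate_vars branching, which distinguishes no matches at all ([None]), duplicate IDs ([n] with n > 1), and a clean result ([1]). Note that structural pattern matching requires Python >= 3.10.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def check_max_occurrence(max_occurrence: list) -> None:
    # max_occurrence[0] is the maximum number of times any (accession, ID) pair
    # occurs among matched variants, or None when nothing matched at all
    # (hypothetical helper mirroring the control flow added by this patch)
    match n := max_occurrence[0]:
        case None:
            logger.critical("No variant matches found")
            logger.critical("Did you set the correct genome build? Did you impute your genomes?")
            raise ValueError
        case _ if n > 1:
            logger.critical("Duplicate IDs in final matches")
            logger.critical("Please double check your genomes for duplicates and try again")
            raise ValueError
        case _:
            logger.info("No duplicate variants found")


if __name__ == '__main__':
    check_max_occurrence([1])  # clean result: logs "No duplicate variants found"
    for bad in ([None], [2]):
        try:
            check_max_occurrence(bad)
        except ValueError:
            # the old assert reported "Duplicate IDs in final matches" even for [None];
            # the new branches log a message that names the actual problem before raising
            pass

Raising ValueError instead of relying on assert also keeps the check active when Python runs with -O, which strips assert statements.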