diff --git a/pgscatalog_utils/ancestry/read.py b/pgscatalog_utils/ancestry/read.py index 79b1b84..77e6476 100644 --- a/pgscatalog_utils/ancestry/read.py +++ b/pgscatalog_utils/ancestry/read.py @@ -18,7 +18,7 @@ def read_pcs(loc_pcs: list[str],dataset: str, loc_related_ids=None, nPCs=None): for i, path in enumerate(loc_pcs): logger.debug("Reading PCA projection: {}".format(path)) - df = pd.read_csv(path, sep='\t') + df = pd.read_csv(path, sep='\t', converters={"IID": str}) df['sampleset'] = dataset df.set_index(['sampleset', 'IID'], inplace=True) @@ -46,7 +46,10 @@ def read_pcs(loc_pcs: list[str],dataset: str, loc_related_ids=None, nPCs=None): IDs_related = [x.strip() for x in infile.readlines()] proj.loc[proj.index.get_level_values(level=1).isin(IDs_related), 'Unrelated'] = False else: - proj['Unrelated'] = np.nan + # if unrelated is all nan -> dtype is float64 + # if unrelated is only true / false -> dtype is bool + # if unrelated contains None -> dtype is object + proj['Unrelated'] = None return proj @@ -76,7 +79,7 @@ def read_pgs(loc_aggscore, onlySUM: bool): :return: """ logger.debug('Reading aggregated score data: {}'.format(loc_aggscore)) - df = pd.read_csv(loc_aggscore, sep='\t', index_col=['sampleset', 'IID']) + df = pd.read_csv(loc_aggscore, sep='\t', index_col=['sampleset', 'IID'], converters={"IID": str}) if onlySUM: df = df[[x for x in df.columns if x.endswith('_SUM')]] rn = [x.rstrip('_SUM') for x in df.columns]