diff --git a/scripts/biomedical/NCBI_dbSNP/background_process.sh b/scripts/biomedical/NCBI_dbSNP/background_process.sh
new file mode 100644
index 000000000..3ce645150
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/background_process.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Run <python script> once per *_shard_*.vcf file under <input file path>, as
+# background jobs, sleeping whenever too many jobs are already running.
+# Both arguments are needed: an empty <input file path> would make the shard
+# listing further down silently scan the current directory instead.
+if [ $# -lt 2 ]; then
+  echo "Usage: $0 <python script path> <input file path>"
+  exit 1
+fi
+PYTHON_PATH=$1
+INPUT_PATH=$2
+echo " $PYTHON_PATH - $INPUT_PATH"
+
+# Count the number of background jobs
+function num_jobs {
+  local name="$PYTHON_PATH"; shift;
+  if [[ "$name" == "" ]]; then
+    echo  $(jobs -r | wc -l)
+  else
+    echo $(ps -ef | egrep "$name" | wc -l)
+  fi
+}
+
+# Get the number of cores
+function num_cores {
+  [[ -z "$cores" ]] && cores=$(cat /proc/cpuinfo | grep "processor" | wc -l)
+  cores=${cores:-"10"}
+  echo $cores
+}
+
+# Block until at most max_jobs (6) jobs are reported by num_jobs.
+function sleep_while_active {
+  # Locals keep the polling state out of the script's global namespace.
+  local max_jobs=6 job_name="$PYTHON_PATH" j
+  j=$(num_jobs "$job_name")
+  echo "No of jobs $j and job name $job_name"
+  # Strictly greater: exactly max_jobs reported jobs does not block.
+  while (( j > max_jobs )); do
+    sleep 1
+    j=$(num_jobs "$job_name")
+  done
+}
+
+# Run processes in background
+csv_files=$(ls $INPUT_PATH*_shard_*.vcf | xargs -n 1 basename)
+#echo "files $csv_files"
+for file in $csv_files; do
+  # Run the process per file in background
+  # python $2--input_file=$file & sleep_while_active
+  echo "$PYTHON_PATH --input_file=$file"
+  python3 $PYTHON_PATH --input_file=$file & sleep_while_active
+done
+wait
+
diff --git a/scripts/biomedical/NCBI_dbSNP/download.sh b/scripts/biomedical/NCBI_dbSNP/download.sh
new file mode 100644
index 000000000..9d29c2d84
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/download.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+mkdir -p input; cd input
+
+# download genome assemblies files
+curl -L -O https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.14_GRCh37.p13/GCA_000001405.14_GRCh37.p13_assembly_report.txt
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_assembly_report.txt
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/NAMES.RRF.gz
+gunzip NAMES.RRF.gz
+mv NAMES.RRF NAMES.txt
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/MGDEF.RRF.gz
+gunzip MGDEF.RRF.gz
+mv MGDEF.RRF MGDEF.txt
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/MGSTY.RRF.gz
+gunzip MGSTY.RRF.gz
+mv MGSTY.RRF MGSTY.txt
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz
+gunzip MedGenIDMappings.txt.gz
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id
+gunzip gene_condition_source_id
+mv gene_condition_source_id gene_condition_source_id.txt
+
+mkdir -p GCF25; cd GCF25
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF/GCF_000001405.25.gz
+gunzip GCF_000001405.25.gz
+mv GCF_000001405.25 GCF_000001405.25.vcf
+
+cd ..
+mkdir -p GCF40; cd GCF40
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz
+gunzip GCF_000001405.40.gz
+mv GCF_000001405.40 GCF_000001405.40.vcf
+
+cd ..
+mkdir -p freq; cd freq
+
+curl -L -O https://ftp.ncbi.nlm.nih.gov/snp/population_frequency/latest_release/freq.vcf.gz
+gunzip freq.vcf.gz
+cd ..
diff --git a/scripts/biomedical/NCBI_dbSNP/run.sh b/scripts/biomedical/NCBI_dbSNP/run.sh
new file mode 100644
index 000000000..a77ec2144
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/run.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Driver for the NCBI dbSNP import; all paths are relative to the current
+# directory. Creates the output tree, runs split_files.sh and the preparatory
+# python scripts, then processes the shards of each input directory.
+# The shebang is only honoured on the very first line of a file, which is why
+# it precedes the license header.
+
+# make all required directories (-p also creates the parents output/ and
+# output/GCF40)
+mkdir -p output/GCF25 output/freq \
+  output/GCF40/hg38 output/GCF40/hg38alleledisease output/GCF40/hg38alleledrug \
+  output/GCF40/hg38alleles output/GCF40/hg38freq
+
+echo "File split started"
+sh split_files.sh
+echo "Splitting  Completed"
+
+echo "Running python script"
+python3 scripts/process_medgen.py
+python3 scripts/process_genome_assembly_report.py
+python3 scripts/process_gene_condition_source.py
+
+# background_process.sh uses bash-only syntax (function, [[ ]], (( ))), so it
+# is started with bash: plain sh may be a strict POSIX shell such as dash.
+echo "Start background process"
+bash background_process.sh scripts/process_dbsnp_hg19_positions.py input/GCF25/
+bash background_process.sh scripts/process_dbsnp_hg38.py input/GCF40/
+bash background_process.sh scripts/process_dbsnp_freq.py input/freq/
+echo "Background process completed"
diff --git a/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_freq.py b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_freq.py
new file mode 100644
index 000000000..ae632a450
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_freq.py
@@ -0,0 +1,249 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Author: Pradeep Kumar Krishnaswamy
+Date: 13-Oct-2024
+Name: process_dbsnp_freq
+Description: cleaning the NCBI dbSNP freq input file.
+@source data: Download NCBI dbSNP data from FTP location. Refer to download.sh for details
+"""
+
+import csv
+import os
+import sys
+import time
+from copy import deepcopy
+from absl import flags
+from absl import logging
+from datetime import datetime as dt
+
+MODULE_DIR = os.path.dirname(os.path.dirname(__file__))
+
+# Setup path for import from data/util
+# or set `export PYTHONPATH="./:<repo>/data/util"` in bash
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_DIR)
+_DATA_DIR = _SCRIPT_DIR.split('/data/')[0]
+sys.path.append(os.path.join(_DATA_DIR, 'data/util'))
+
+import file_util
+from counters import Counters
+
+# for local testing purpose only
+# from Utils.counters import Counters
+# import Utils.file_util as file_util
+
+_FLAGS = flags.FLAGS
+
+flags.DEFINE_string('input_file', 'freq_shard_aa.vcf',
+                    'Input file to process. Mandatory to pass this argument')
+flags.DEFINE_string('output_dir', 'output/freq', 'Output directory for generated files.')
+flags.DEFINE_string('input_dir', 'input/freq', 'Input directory where .vcf files downloaded.')
+
+_FLAGS(sys.argv)
+
+CSV_DICT = {  # blank output row: copied per record; its keys are the CSV header
+    'dcid_gv': '',
+    'dcid': '',
+    'name': '',
+    'alleleFrequency': '',
+    'alternativeAllele': '',
+    'genotypeHeterozygousFrequency': '',
+    'genotypeHomozygousAlternativeFrequency': '',
+    'genotypeHomozygousReferenceFrequency': '',
+    'hardyWeinbergEquationPValue': '',
+    'isGlobalPopulation': '',
+    'measuredPopulation': '',
+    'referenceAllele': '',
+    'rsID': '',
+    'sampleSize': ''
+}
+
+COLUMN_CODES_DICT = {  # input column index (9-20) -> population that column holds
+    9: {
+        'name': 'European',
+        'isGlobal': False
+    },
+    10: {
+        'name': 'African Others',
+        'isGlobal': False
+    },
+    11: {
+        'name': 'East Asian',
+        'isGlobal': False
+    },
+    12: {
+        'name': 'African American',
+        'isGlobal': False
+    },
+    13: {
+        'name': 'Latin American 1',
+        'isGlobal': False
+    },
+    14: {
+        'name': 'Latin American 2',
+        'isGlobal': False
+    },
+    15: {
+        'name': 'Other Asian',
+        'isGlobal': False
+    },
+    16: {
+        'name': 'South Asian',
+        'isGlobal': False
+    },
+    17: {
+        'name': 'Other',
+        'isGlobal': False
+    },
+    18: {
+        'name': 'African',
+        'isGlobal': False
+    },
+    19: {
+        'name': 'Asian',
+        'isGlobal': False
+    },
+    20: {
+        'name': 'Total',
+        'isGlobal': True
+    }
+}
+
+
+def process_input_csv(input_file: str, output_freq_file_path: str) -> None:
+    """ Row by row processing of NCBI dbSNP freq input file
+    Args:
+        input_file (str): file path to process
+        output_freq_file_path (str): output file path to save cleaned csv
+    """
+    with open(output_freq_file_path, 'w') as output_hg38_freq:
+        writer_hg38_freq = csv.DictWriter(output_hg38_freq, CSV_DICT, extrasaction='ignore')
+        writer_hg38_freq.writeheader()
+        counters = Counters()
+        counters.add_counter('total', file_util.file_estimate_num_rows(input_file))
+
+        with open(input_file, 'r') as input_file_csv:
+            for line in input_file_csv:
+                # skip row
+                if line[0] == '#':
+                    continue
+                # process this row
+                else:
+                    input_row = line.replace('\n', '').split('\t')
+
+                    dciv_gv = f'bio/{input_row[2]}'
+                    rsID = input_row[2]
+                    #print(rsID, end='\r')
+                    if rsID == '.':
+                        continue
+                    refAllele = input_row[3]
+                    altAllele = input_row[4]
+
+                    for i in range(9, 21):
+                        try:
+                            row = deepcopy(CSV_DICT)
+                            row['dcid_gv'] = dciv_gv
+                            #print(dciv_gv, end='\r')
+                            row['rsID'] = rsID
+                            row['referenceAllele'] = refAllele
+                            row['alternativeAllele'] = altAllele
+                            row['dcid'] = f"bio/{rsID}_{COLUMN_CODES_DICT[i]['name'].replace(' ', '_')}"
+                            row['name'] = f'"{rsID} {COLUMN_CODES_DICT[i]["name"]} Population Frequency"'
+                            row['isGlobalPopulation'] = COLUMN_CODES_DICT[i]['isGlobal']
+                            row['measuredPopulation'] = COLUMN_CODES_DICT[i]['name']
+                            row['genotypeHeterozygousFrequency'] = "0.00000"
+                            row['genotypeHomozygousAlternativeFrequency'] = "0.00000"
+                            row['genotypeHomozygousReferenceFrequency'] = "0.00000"
+                            if input_row[i].count(':') != 5:
+                                # Skip this record as it cannot be unpacked to desired format
+                                continue
+                            # column format “AN:AC:HWEP:GR:GV:GA”.
+                            row = parse_freq_row(input_row[i], refAllele, altAllele, row)
+                            writer_hg38_freq.writerow(row)
+
+                        except Exception as e:
+                            print(input_row[i], e, rsID)
+                counters.add_counter('processed', 1)
+
+
+def parse_freq_row(freq_value, refAllele, altAllele, row):
+    """ parser for freq input value for the given row dict
+
+    Args:
+        freq_value (_type_): freq string e.g. format “AN:AC:HWEP:GR:GV:GA”.
+        refAllele (_type_): referenceAllele
+        altAllele (_type_): alternativeAllele
+        row (_type_): input row dict
+
+    Returns:
+        _type_: row dict
+    """
+    AN, AC, HWEP, GR, GV, GA = [
+        int(x) if x.lstrip("-").isdigit() else x for x in freq_value.split(':')
+    ]
+    if isinstance(AC, str):
+        ref_val = 0.0
+        alt_alleles = AC.split(',')
+        if AN != 0:
+            ref_val = eval(f"({AN}-{'-'.join(alt_alleles)})/{AN}")
+
+        alt_lst = altAllele.split(',')
+        alt_val_lst = []
+
+        for idx, alt_allele in enumerate(alt_alleles):
+            alt_val = "0.00000"
+            if AN != 0:
+                alt_val = f"{(int(alt_allele)/AN):.5f}"
+            alt_val_lst.append(f"{alt_lst[idx]}:{alt_val}")
+        row['alleleFrequency'] = f"{refAllele}:{ref_val:.5f}, {', '.join(alt_val_lst)}"
+
+    else:
+        if AN == 0:
+            row['alleleFrequency'] = f"{refAllele}:0.00000, {altAllele}:0.00000"
+
+        else:
+            row['alleleFrequency'] = f"{refAllele}:{((AN-AC)/AN):.5f}, {altAllele}:{(AC/AN):.5f}"
+
+    if AN != 0:
+        row['genotypeHeterozygousFrequency'] = f"{(GV / (AN / 2)):.5f}"
+        row['genotypeHomozygousAlternativeFrequency'] = f"{(GA / (AN / 2)):.5f}"
+        row['genotypeHomozygousReferenceFrequency'] = f"{(GR / (AN / 2)):.5f}"
+
+    row['hardyWeinbergEquationPValue'] = HWEP
+    row['sampleSize'] = AN
+    return row
+
+
+def main(input_file: str) -> None:
+    """ Main method
+
+    Args:
+        input_file (str): file path to process
+    """
+    start_time = time.time()
+    logging.set_verbosity('info')
+    logging.info(f"Freq processing input file {input_file} - {dt.now()}")
+    input_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir, input_file)
+
+    output_file = input_file.split('.')[0] + '.csv'
+    output_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, output_file)
+    logging.info(f"output_csv {output_csv}")
+
+    process_input_csv(input_file_path, output_csv)
+    logging.info(f"Time taken to process {((time.time() - start_time)/60):.2f} - {dt.now()}")
+
+
+if __name__ == '__main__':
+    main(_FLAGS.input_file)
diff --git a/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg19_positions.py b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg19_positions.py
new file mode 100644
index 000000000..0f9099f32
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg19_positions.py
@@ -0,0 +1,154 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Author: Pradeep Kumar Krishnaswamy
+Date: 13-Oct-2024
+Name: process_dbsnp_hg19_positions
+Description: cleaning the NCBI dbSNP HG19 positions input file.
+@source data: Download NCBI dbSNP data from FTP location. Refer to download.sh for details
+"""
+
+import csv
+import os
+import sys
+import copy
+import json
+import random
+import time
+from absl import flags
+from absl import logging
+
+MODULE_DIR = os.path.dirname(os.path.dirname(__file__))
+
+# Setup path for import from data/util
+# or set `export PYTHONPATH="./:<repo>/data/util"` in bash
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_DIR)
+_DATA_DIR = _SCRIPT_DIR.split('/data/')[0]
+sys.path.append(os.path.join(_DATA_DIR, 'data/util'))
+
+import file_util
+from counters import Counters
+
+# for local testing purpose only
+# from Utils.counters import Counters
+# import Utils.file_util as file_util
+
+_FLAGS = flags.FLAGS
+flags.DEFINE_string('input_file', 'gcf25_shard_aa.vcf',
+                    'Input file to process. Mandatory to pass this argument')
+flags.DEFINE_string('output_dir', 'output/GCF25', 'Output directory for generated files.')
+flags.DEFINE_string('input_dir', 'input/GCF25', 'Input directory where .vcf files downloaded.')
+flags.DEFINE_string('json_dir', 'output', 'Directory of json file generated from genome_assembly')
+
+_FLAGS(sys.argv)
+
+CSV_DICT = {  # blank output row: copied per record; its keys are the CSV header
+    'dcid': '',
+    'name': '',
+    'dcid_pos': '',
+    'name_pos': '',
+    'inChromosome': '',
+    'position': '',
+    'rsID': ''
+}
+
+hg19_genome_assembly_file_name = 'hg19_genome_assembly_report.json'
+HG19_REFSEQ_DICT = {}
+
+
+def load_json():
+    """ load hg19 genome assembly file file
+    """
+    global HG19_REFSEQ_DICT
+    hg19_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.json_dir,
+                                  hg19_genome_assembly_file_name)
+
+    hg19_dict = None
+    with open(hg19_file_path, 'r') as f:
+        hg19_dict = json.load(f)
+    for hg in hg19_dict:
+        HG19_REFSEQ_DICT[hg['refSeqAccession']] = hg['dcid']
+
+
+def parse_hg19_row(input_row, hg19_ref_seq):
+    """ parse hg19 row
+
+    Args:
+        input_row (_type_): input dict
+        hg19_ref_seq (_type_): hg19 genome assembly
+
+    Returns:
+        _type_: row dict
+    """
+    current_row = copy.deepcopy(CSV_DICT)
+    current_row['dcid'] = f'bio/{input_row[2]}'
+    current_row['name'] = input_row[2]
+    if hg19_ref_seq:
+        current_row['dcid_pos'] = f'{hg19_ref_seq}_{input_row[1]}'
+        current_row['name_pos'] = f'"hg19 {hg19_ref_seq.replace("bio/hg19_", "")} {input_row[1]}"'
+        current_row['inChromosome'] = hg19_ref_seq
+    current_row['position'] = input_row[1]
+    current_row['rsID'] = input_row[2]
+    return current_row
+
+
+def main(input_file_name: str) -> None:
+    """ Main method
+
+    Args:
+        input_file (str): file path to process
+    """
+    logging.set_verbosity('info')
+    logging.info(f"HG18 processing input file {input_file_name}")
+    start_time = time.time()
+
+    load_json()
+    global HG19_REFSEQ_DICT
+    logging.set_verbosity('info')
+    input_file = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir, input_file_name)
+    logging.info(f"HG19 processing input file {input_file}")
+    output_file = input_file_name.split('.')[0] + '.csv'
+    output_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, output_file)
+    logging.info(f"output_csv {output_csv}")
+
+    counters = Counters()
+    counters.add_counter('total', file_util.file_estimate_num_rows(input_file))
+
+    with open(input_file, 'r') as input_file_csv:
+        with open(output_csv, 'w') as output_file_csv:
+            writer = csv.DictWriter(output_file_csv, CSV_DICT)
+            # write header
+            writer.writeheader()
+            for line in input_file_csv:
+                # skip row
+                if line[0] == '#':
+                    continue
+                # process this row
+                else:
+                    input_row = line.replace('\n', '').split('\t')
+                    hg19_ref_seq = None
+                    if input_row[0] in HG19_REFSEQ_DICT:
+                        hg19_ref_seq = HG19_REFSEQ_DICT[input_row[0]]
+                    current_row = parse_hg19_row(input_row, hg19_ref_seq)
+                    if current_row:
+                        # write to output
+                        writer.writerow(current_row)
+                counters.add_counter('processed', 1)
+
+    logging.info(f"Time taken to process {((time.time() - start_time)/60):.2f}")
+
+
+if __name__ == '__main__':
+    main(_FLAGS.input_file)
diff --git a/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg38.py b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg38.py
new file mode 100644
index 000000000..c82fe7292
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg38.py
@@ -0,0 +1,944 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Author: Pradeep Kumar Krishnaswamy
+Date: 13-Oct-2024
+Name: process_dbsnp_hg38
+Description: cleaning the NCBI dbSNP HG38 input file.
+@source data: Download NCBI dbSNP data from FTP location. Refer to download.sh for details
+"""
+
+import csv
+import os
+import sys
+import re
+import json
+import struct
+import typing
+import time
+from copy import deepcopy
+from absl import flags
+from absl import logging
+from datetime import datetime as dt
+
+MODULE_DIR = os.path.dirname(os.path.dirname(__file__))
+
+# Setup path for import from data/util
+# or set `export PYTHONPATH="./:<repo>/data/util"` in bash
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_DIR)
+_DATA_DIR = _SCRIPT_DIR.split('/data/')[0]
+sys.path.append(os.path.join(_DATA_DIR, 'data/util'))
+
+import file_util
+from counters import Counters
+
+# for local testing purpose only
+# from Utils.counters import Counters
+# import Utils.file_util as file_util
+
+_FLAGS = flags.FLAGS
+# flag dict
+flags.DEFINE_string('input_file', 'gcf40_shard_aa.vcf',
+                    'Input file to process. Mandatory to pass this argument')
+flags.DEFINE_string('output_dir', 'output/GCF40', 'Output directory for generated files.')
+flags.DEFINE_string('input_dir', 'input/GCF40', 'Input directory where .vcf files downloaded.')
+flags.DEFINE_string('mapping_file_dir', 'output', 'path of the cui_dcid_mapping.csv file.')
+flags.DEFINE_string('json_dir', 'output', 'Directory of json file generated from genome_assembly')
+flags.DEFINE_string(
+    'gene_id_dcid_mapping', 'ncbi_gene_id_dcid_mapping.csv',
+    'Please specify the path to the "ncbi_gene_id_dcid_mapping.csv" file generated in Gene import. If not provided, the script will default to the current working directory.'
+)
+
+_FLAGS(sys.argv)
+
+# Declare Universal Variables
+_BASE_32_MAP = [
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l',
+    'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z', 'e'
+]
+_NUM_BITS_32 = 5
+_LONG_ID_LEN = 13
+
+HG38_DICT = {
+    'dcid': '',
+    'name': '',
+    'dcid_pos': '',
+    'name_pos': '',
+    'chrom': '',
+    'position': '',
+    'alleleOrigin': '',
+    'alternativeAllele': '',
+    'dbSNPBuildID': '',
+    'geneID': '',
+    'geneID_2': '',
+    'referenceAllele': '',
+    'rsID': '',
+    'suspectReasonCode': '',
+    'variantClass': '',
+    'genotypesAvailable': False,
+    'hasNonSynonymousFrameShift': False,
+    'hasNonSynonymousMissenseMutation': False,
+    'hasNonSynonymousNonsenseMutation': False,
+    'hasSynonymousMutation': False,
+    'isCommonVariant': False,
+    'isInAcceptorSpliceSite': False,
+    'isInDonorSpliceSite': False,
+    'isInIntron': False,
+    'isInFivePrimeGeneRegion': False,
+    'isInFivePrimeUTR': False,
+    'isInThreePrimeGeneRegion': False,
+    'isInThreePrimeUTR': False,
+    'isPublished': False
+}
+HG38_ALLELES_DICT = {
+    'dcid': '',
+    'dcid_allele': '',
+    'name_allele': '',
+    'CLNHGVS': '',
+    'variant': '',
+    'ARUP_Laboratories\x2c_Molecular_Genetics_and_Genomics\x2cARUP_Laboratories': '',
+    'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID': '',
+    'HGMD': '',
+    'OMIM': '',
+    'PharmGKB': '',
+    'PharmGKB_Clinical_Annotation': '',
+    'UniProtKB': ''
+}
+HG38_ALLELES_DISEASE_DICT = {
+    'dcid': '',
+    'dcid_allele': '',
+    'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID': '',
+    'geneticTestingRegistryID': '',
+    'humanGeneMutationDatabaseID': '',
+    'omimID': '',
+    'pharmGKBID': '',
+    'uniProtID': '',
+    'dcid_disease': '',
+    'name_disease': '',
+    'experimentalFactorOntologyID': '',
+    'geneReviewsID': '',
+    'humanPhenotypeOntologyID': '',
+    'medicalGeneticsSummariesID': '',
+    'medicalSubjectHeadingID': '',
+    'officeOfRareDiseasesId': '',
+    'orphaNumber': '',
+    'snomedCT': '',
+    'medGenID': '',
+    'dcid_disease_allele_association': '',
+    'name_disease_allele_association': '',
+    'CLNORIGIN': '',
+    'CLNSIG': '',
+    'CLNREVSTAT': '',
+    'geneID': '',
+    'geneticTestingRegistryID': '',
+    'pharmGKBID': '',
+    'CLNACC': ''
+}
+HG38_ALLELES_DRUG_DICT = {
+    'dcid': '',
+    'dcid_allele': '',
+    'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID': '',
+    'geneticTestingRegistryID': '',
+    'humanGeneMutationDatabaseID': '',
+    'omimID': '',
+    'pharmGKBID': '',
+    'uniProtID': '',
+    'dcid_disease': '',
+    'name_disease': '',
+    'compound_dcid': '',
+    'experimentalFactorOntologyID': '',
+    'geneReviewsID': '',
+    'humanPhenotypeOntologyID': '',
+    'medicalGeneticsSummariesID': '',
+    'medicalSubjectHeadingID': '',
+    'officeOfRareDiseasesId': '',
+    'orphaNumber': '',
+    'snomedCT': '',
+    'medGenID': '',
+    'dcid_disease_allele_association': '',
+    'name_disease_allele_association': '',
+    'CLNORIGIN': '',
+    'CLNSIG': '',
+    'CLNREVSTAT': '',
+    'geneID': '',
+    'geneticTestingRegistryID': '',
+    'pharmGKBID': '',
+    'CLNACC': ''
+}
+HG38_FREQ_DICT = {
+    'dcid': '',
+    'dcid_freq': '',
+    'name_freq': '',
+    'alleleFrequency': '',
+    'measuredPopulation': '',
+    'rsID': ''
+}
+
+CUI_DCID_MAPPING_DICT = {}
+
+dbsnp_hg38_file_name = 'hg38/{0}'
+dbsnp_hg38_alleles_file_name = 'hg38alleles/{0}'
+dbsnp_hg38_allele_disease_file_name = 'hg38alleledisease/{0}'
+dbsnp_hg38_allele_drug_file_name = 'hg38alleledrug/{0}'
+dbsnp_hg38_freq_file_name = 'hg38freq/{0}'
+cui_dict_mapping_file_name = 'cui_dcid_mappings.csv'
+writer_hg38 = None
+writer_hg38_alleles = None
+writer_hg38_allele_disease = None
+writer_hg38_allele_drug = None
+writer_hg38_freq = None
+hg19_genome_assembly_file_name = 'hg19_genome_assembly_report.json'
+hg38_genome_assembly_file_name = 'hg38_genome_assembly_report.json'
+gene_id_dcid_mapping_file_name = 'ncbi_gene_id_dcid_mapping.csv'
+HG19_REFSEQ_DICT = {}
+HG38_DCID_DICT = {}
+GENE_ID_DCID_MAPPING = {}
+DB_NOT_AVAILABLE = set()
+
+HG38_FLAG_PROPS = {
+    'NSF': 'hasNonSynonymousFrameShift',
+    'NSM': 'hasNonSynonymousMissenseMutation',
+    'NSN': 'hasNonSynonymousNonsenseMutation',
+    'SYN': 'hasSynonymousMutation',
+    'U3': 'isInThreePrimeUTR',
+    'U5': 'isInFivePrimeUTR',
+    'ASS': 'isInAcceptorSpliceSite',
+    'DSS': 'isInDonorSpliceSite',
+    'INT': 'isInIntron',
+    'R3': 'isInThreePrimeGeneRegion',
+    'R5': 'isInFivePrimeGeneRegion',
+    'GNO': 'genotypesAvailable',
+    'PUB': 'isPublished',
+    'COMMON': 'isCommonVariant',
+    'PM': 'isPublished'
+}
+
+# database dict
+HG38_DB_DICT = {
+    'ARUP_Laboratories\x2c_Molecular_Genetics_and_Genomics\x2cARUP_Laboratories':
+        'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID',
+    'Genetic_Testing_Registry_(GTR)':
+        'geneticTestingRegistryID',
+    'HGMD':
+        'humanGeneMutationDatabaseID',
+    'OMIM':
+        'omimID',
+    'PharmGKB':
+        'pharmGKBID',
+    'PharmGKB_Clinical_Annotation':
+        'pharmGKBID',
+    'UniProtKB':
+        'uniProtID'
+}
+
+# database to column mapping
+
+HG38_DB_COL_MAPPING = {
+    'MedGen': 'medGenID',
+    'Orphanet': 'orphaNumber',
+    'OMIM': 'omimID',
+    'SNOMED_CT': 'snomedCT',
+    'MeSH': 'medicalSubjectHeadingID',
+    'Gene': 'geneID',
+    'EFO': 'experimentalFactorOntologyID',
+    'GeneReviews': 'geneReviewsID',
+    'GeneReviews\\x2c': 'geneReviewsID',
+    'Genetics_Home_Reference': 'geneticsHomeReferenceID',
+    'Genetic_Testing_Registry_(GTR)': 'geneticTestingRegistryID',
+    'Medical_Genetics_Summaries': 'medicalGeneticsSummariesID',
+    'Office_of_Rare_Diseases': 'officeOfRareDiseasesId',
+    'PharmGKB': 'pharmGKBID',
+    'PharmGKB_Clinical_Annotation': 'pharmGKBID'
+}
+
+# clinicalSignificance dict: significance code (as a string) -> dcs enum reference(s)
+HG38_SIG_DICT = {
+    '0': 'dcs:ClinSigUncertain',  # Uncertain significance
+    '1': 'dcs:ClinSigNotProvided',  # not provided
+    '2': 'dcs:ClinSigBenign',  # Benign
+    '3': 'dcs:ClinSigLikelyBenign',  # Likely benign
+    '4': 'dcs:ClinSigLikelyPathogenic',  # Likely pathogenic
+    '5': 'dcs:ClinSigPathogenic',  # Pathogenic
+    '6': 'dcs:ClinSigDrugResponse',  # Drug response
+    '8': 'dcs:ClinSigConfersSensitivity',  # Confers sensitivity
+    '9': 'dcs:ClinSigRiskFactor',  # Risk factor
+    '10': 'dcs:ClinSigAssociation',  # Association
+    '11': 'dcs:ClinSigProtective',  # Protective
+    '12': 'dcs:ClinSigConflictingPathogenicity',  # Conflicting interpretations of pathogenicity
+    '13': 'dcs:ClinSigAffects',  # Affects
+    '14': 'dcs:ClinSigAssociationNotFound',  # Association not found
+    '15': 'dcs:ClinSigBenign, dcs:ClinSigLikelyBenign',  # Benign/Likely benign
+    '16': 'dcs:ClinSigPathogenic, dcs:ClinSigLikelyPathogenic',  # Pathogenic/Likely pathogenic
+    '17': 'dcs:ClinSigConflicting',  # Conflicting data from submitters
+    '18': 'dcs:ClinSigPathogenic, dcs:ClinSigLowPenetrance',  # Pathogenic, low penetrance
+    '19': 'dcs:ClinSigPathogenic, dcs:ClinSigLowPenetrance',  # NOTE(review): identical to '18' - confirm
+    '20': 'dcs:ClinSigEstablishedRiskAllele',  # Established risk allele
+    '21': 'dcs:ClinSigLikelyRiskAllele',  # Likely risk allele
+    '22': 'dcs:ClinSigUncertainRiskAllele',  # Uncertain risk allele
+    '255': 'dcs:ClinSigOther'  # other
+}
+
+# enums
+
+SAO_ENUM = {
+    '0': 'dcs:VariantAlleleOriginUnspecified',
+    '1': 'dcs:VariantAlleleOriginGermline',
+    '2': 'dcs:VariantAlleleOriginSomatic',
+    '3': 'dcs:VariantAlleleOriginGermline, dcs:VariantAlleleOriginSomatic'
+}
+
+VC_ENUM = {
+    '1': 'dcs:VariationTypeSNV',
+    'SNV': 'dcs:VariationTypeSNV',
+    '2': 'dcs:VariationTypeDIV',
+    'DIV': 'dcs:VariationTypeDIV',
+    '3': 'dcs:VariationTypeHeterozygous',
+    'HETEROZYGOUS': 'dcs:VariationTypeHeterozygous',
+    '4': 'dcs:VariationTypeSTR',
+    'STR': 'dcs:VariationTypeSTR',
+    '5': 'dcs:VariationTypeNamed',
+    'NAMED': 'dcs:VariationTypeNamed',
+    '6': 'dcs:VariationTypeNoVariation',
+    'NO VARIATION': 'dcs:VariationTypeNoVariation',
+    '7': 'dcs:VariationTypeMixed',
+    'MIXED': 'dcs:VariationTypeMixed',
+    '8': 'dcs:VariationTypeMNV',
+    'MNV': 'dcs:VariationTypeMNV',
+    '9': 'dcs:VariationTypeException',
+    'Exception': 'dcs:VariationTypeException',
+    'INS': 'dcs:VariationTypeINS',
+    'DEL': 'dcs:VariationTypeDEL',
+    'INDEL': 'dcs:VariationTypeINDEL'
+}
+
+REVIEW_STATUS_ENUM = {
+    'no_assertion': 'dcs:ClinVarReviewStatusNoAssertion',  # No asserition provided by submitter
+    'no_assertion_criteria_provided': 'dcs:ClinVarReviewStatusNoAssertion',
+    'no_criteria':
+        'dcs:ClinVarReviewStatusNoCriteria',  # No assertion criteria provided by submitter
+    'no_assertion_criteria_provided': 'dcs:ClinVarReviewStatusNoCriteria',
+    'no_assertion': 'dcs:ClinVarReviewStatusNoCriteria',
+    'no_assertion_provided': 'dcs:ClinVarReviewStatusNoCriteria',
+    'Single': 'dcs:ClinVarReviewStatusSingleSubmitter',  # Classified by single submitter
+    '_single_submitter': 'dcs:ClinVarReviewStatusSingleSubmitter',
+    'single_submitter': 'dcs:ClinVarReviewStatusSingleSubmitter',
+    'mult': 'dcs:ClinVarReviewStatusMultipleSubmitters',  # Classified by multiple submitters
+    '_multiple_submitters': 'dcs:ClinVarReviewStatusMultipleSubmitters',
+    'multiple_submitters': 'dcs:ClinVarReviewStatusMultipleSubmitters',
+    'conf':
+        'dcs:ClinVarReviewStatusConflictingInterpretations',  # Criteria provided conflicting interpretations
+    'conflicting_interpretations': 'dcs:ClinVarReviewStatusConflictingInterpretations',
+    '_conflicting_interpretations': 'dcs:ClinVarReviewStatusConflictingInterpretations',
+    'exp': 'dcs:ClinVarReviewStatusReviewed',  # Reviewed by expert panel
+    'reviewed_by_expert_panel': 'dcs:ClinVarReviewStatusReviewed',
+    'guideline': 'dcs:ClinVarReviewStatusPracticeGuideline',  # Practice guideline
+    'practice_guideline': 'dcs:ClinVarReviewStatusPracticeGuideline',
+    'criteria_provided': 'dcs:ClinVarReviewStatusCriteriaProvided',
+    'no_conflicts': 'dcs:ClinVarReviewStatusNoConflicts',
+    '_no_conflicts': 'dcs:ClinVarReviewStatusNoConflicts',
+    'non_interpretation_for_the_single_variant': 'dcs:ClinVarReviewStatusNoInterpretation',
+    'no_interpretation_for_the_single_varian': 'dcs:ClinVarReviewStatusNoInterpretation'
+}
+
+
+def load_json(hg38_file_path: str, hg19_file_path: str, gene_id_dcid_mapping_path) -> None:
+    global HG19_REFSEQ_DICT, HG38_DCID_DICT, GENE_ID_DCID_MAPPING
+    start_time = time.time()
+    hg19_dict = None
+    with open(hg19_file_path, 'r') as f:
+        hg19_dict = json.load(f)
+    for hg in hg19_dict:
+        HG19_REFSEQ_DICT[hg['refSeqAccession']] = hg['dcid']
+
+    hg38_dict = None
+    with open(hg38_file_path, 'r') as f:
+        hg38_dict = json.load(f)
+    for hg in hg38_dict:
+        HG38_DCID_DICT[hg['refSeqAccession']] = [hg['dcid'], hg['name']]
+
+    with open(gene_id_dcid_mapping_path) as f:
+        next(f)  # Skip the header
+        reader = csv.reader(f, skipinitialspace=True)
+        GENE_ID_DCID_MAPPING = dict(reader)
+
+    logging.info(f"Count of GENE_ID_DCID_MAPPING loaded {len(GENE_ID_DCID_MAPPING)}")
+    logging.info(f"Time take to load mapping files {int((time.time() - start_time))} sec")
+
+
+def load_mapping_data(file_path: str) -> None:
+    global CUI_DCID_MAPPING_DICT
+    with open(file_path, 'r') as csv_file:
+        next(csv_file)
+        for input_row in csv_file:
+            line = input_row.split(',')
+            CUI_DCID_MAPPING_DICT[line[1]] = {
+                "dcid": line[0],
+                'name': line[2],
+                'is_drug_response': line[3]
+            }
+    logging.info(f"CUI DCID MAPPING records {len(CUI_DCID_MAPPING_DICT)}")
+
+
+def process_input_csv(input_file: str, dbsnp_hg38_file_path: str, dbsnp_hg38_alleles_file_path: str,
+                      dbsnp_hg38_allele_disease_file_path: str,
+                      dbsnp_hg38_allele_drug_file_path: str,
+                      dbsnp_hg38_freq_file_path: str) -> None:
+
+    global CUI_DCID_MAPPING_DICT, writer_hg38, writer_hg38_alleles, writer_hg38_allele_disease, writer_hg38_allele_drug, writer_hg38_freq
+
+    # open all output file and write header
+    with open(dbsnp_hg38_file_path, 'w') as output_hg38, open(
+            dbsnp_hg38_alleles_file_path, 'w') as output_hg38_alleles, open(
+                dbsnp_hg38_allele_disease_file_path, 'w') as output_hg38_allele_disease, open(
+                    dbsnp_hg38_allele_drug_file_path,
+                    'w') as output_hg38_allele_drug, open(dbsnp_hg38_freq_file_path,
+                                                          'w') as output_hg38_freq:
+        writer_hg38 = csv.DictWriter(output_hg38, HG38_DICT, extrasaction='ignore')
+        writer_hg38.writeheader()
+
+        writer_hg38_alleles = csv.DictWriter(output_hg38_alleles,
+                                             HG38_ALLELES_DICT,
+                                             extrasaction='ignore')
+        writer_hg38_alleles.writeheader()
+
+        writer_hg38_allele_disease = csv.DictWriter(output_hg38_allele_disease,
+                                                    HG38_ALLELES_DISEASE_DICT,
+                                                    extrasaction='ignore')
+        writer_hg38_allele_disease.writeheader()
+
+        writer_hg38_allele_drug = csv.DictWriter(output_hg38_allele_drug,
+                                                 HG38_ALLELES_DRUG_DICT,
+                                                 extrasaction='ignore')
+        writer_hg38_allele_drug.writeheader()
+
+        writer_hg38_freq = csv.DictWriter(output_hg38_freq, HG38_FREQ_DICT, extrasaction='ignore')
+        writer_hg38_freq.writeheader()
+
+        counters = Counters()
+        counters.add_counter('total', file_util.file_estimate_num_rows(input_file))
+
+        with open(input_file, 'r') as input_file_csv:
+            for line in input_file_csv:
+                # skip row
+                if line[0] == '#':
+                    continue
+
+                # core process starts here
+                else:
+                    input_row = line.replace('\n', '').split('\t')
+
+                    # dcid
+                    dcid = f'bio/{input_row[2]}'
+                    rsID = input_row[2]
+                    hg38_dcid = None
+                    if input_row[0] in HG38_DCID_DICT:
+                        hg38_dcid = HG38_DCID_DICT[input_row[0]]
+
+                    hg38_row, dict_info = parse_hg38_row(input_row, dcid, rsID, hg38_dcid)
+
+                    # process hg38_freq
+                    process_hg38_freq(dcid, input_row[3], input_row[4], dict_info, rsID,
+                                      writer_hg38_freq)
+
+                    # Process hg38_alleles
+                    process_hg38_alleles(dcid, input_row[3], input_row[4], dict_info,
+                                         writer_hg38_alleles)
+
+                    # Process hg38_alleles_disease_association & hg38_allele_drug_response_associations
+                    process_hg38_alleles_disease_drug(dcid, dict_info, writer_hg38_allele_disease,
+                                                      writer_hg38_allele_drug)
+
+                    writer_hg38.writerow(hg38_row)
+                counters.add_counter('processed', 1)
+
+
+def parse_hg38_row(input_row, dcid, rsID, hg38_dcid):
+    hg38_row = deepcopy(HG38_DICT)
+    hg38_row['dcid'] = dcid
+    hg38_row['rsID'] = rsID
+    # name
+    hg38_row['name'] = input_row[2]
+
+    if hg38_dcid:
+        hg38_row['dcid_pos'] = f'hg38_{dcid}_{input_row[1]}'
+        hg38_row['name_pos'] = f'"hg38 {hg38_dcid[1]} {input_row[1]}"'
+        hg38_row['chrom'] = hg38_dcid[1]
+        hg38_row['inChromosome'] = hg38_dcid[1]
+
+    hg38_row['position'] = input_row[1]
+    hg38_row['alternativeAllele'] = input_row[4]
+    hg38_row['referenceAllele'] = input_row[3]
+
+    dict_info = {}
+    l = input_row[7].split(';')
+    for item in l:
+        entry = item.split('=', maxsplit=1)
+        if len(entry) == 2:
+            dict_info[entry[0]] = entry[1]
+        else:
+            dict_info[entry[0]] = ''
+
+    if 'GENEINFO' in dict_info:
+        writeGeneInfo(dict_info['GENEINFO'], 'geneID', hg38_row)
+
+    if 'PSEUDOGENEINFO' in dict_info:
+        writeGeneInfo(dict_info['PSEUDOGENEINFO'], 'geneID_2', hg38_row)
+
+    if 'dbSNPBuildID' in dict_info:
+        hg38_row['dbSNPBuildID'] = dict_info['dbSNPBuildID']
+
+    if 'SAO' in dict_info:
+        hg38_row['alleleOrigin'] = SAO_ENUM[dict_info['SAO']]
+
+    if 'SSR' in dict_info:
+        hg38_row['suspectReasonCode'] = ','.join(write_reason_code(int(dict_info['SSR'])))
+
+    if 'VC' in dict_info:
+        hg38_row['variantClass'] = VC_ENUM[dict_info['VC']]
+
+        # update flags
+    for k, v in HG38_FLAG_PROPS.items():
+        if k in dict_info:
+            hg38_row[v] = True
+    return hg38_row, dict_info
+
+
+def writeGeneInfo(value, prop, row):
+    genes = value.split('|')
+    geneIDs = []
+    for g in genes:
+        geneIDs.append(f'dcid:bio/{g.split(":", maxsplit=1)[0]}')
+
+    row[prop] = ','.join(geneIDs)
+    return
+
+
+def write_reason_code(value):
+    global DB_NOT_AVAILABLE
+    original = deepcopy(value)
+    line = []
+    try:
+        if value == 0:
+            line.append(f'dcs:VariantSuspectReasonCodesUnspecified')
+            return (line)
+        if value >= 1024:
+            line.append(f'dcs:VariantSuspectReasonCodesOther')
+            value -= 1024
+        if value >= 512:
+            value -= 512
+        if value >= 256:
+            value -= 256
+        if value >= 128:
+            value -= 128
+        if value >= 64:
+            value -= 64
+        if value >= 32:
+            value -= 32
+        if value >= 16:
+            line.append(f'dcs:VariantSuspectReasonCodes1kgFailed')
+            value -= 16
+        if value >= 8:
+            line.append(f'dcs:VariantSuspectReasonCodesParaEST')
+            value -= 8
+        if value >= 4:
+            line.append(f'dcs:VariantSuspectReasonCodesOldAlign')
+            value -= 4
+        if value >= 2:
+            line.append(f'dcs:VariantSuspectReasonCodesByEST')
+            value -= 2
+        if value >= 1:
+            line.append(f'dcs:VariantSuspectReasonCodesParalog')
+            value -= 1
+        if value > 0:
+            logging.info(f'Suspect Reason Code Error: value = {value}, original = {original}')
+    except:
+        logging.info(f"Error parsing Reason Code {value}")
+    return (line)
+
+
+def process_hg38_alleles(dcid, ref, alt, dict_info, file) -> None:
+    alleles = str(ref + ',' + alt).split(',')
+    hgvs = []
+    db_lst = []
+    db_set = {}
+    if 'CLNHGVS' in dict_info and len(dict_info['CLNHGVS']) > 0:
+        hgvs = dict_info['CLNHGVS'].split(",")
+
+    if 'CLNVI' in dict_info:
+        db_entries = [x for x in dict_info['CLNVI'].split(',') if len(x) > 1]
+
+        for dbs in db_entries:
+            for dbs_level1 in dbs.split('|'):
+                for dbs_level2 in dbs_level1.split('/'):
+                    if ':' in dbs_level2:
+                        dbs = dbs_level2.split(':', maxsplit=1)
+                        if dbs[0] in HG38_DB_DICT.keys():
+                            db_set[dbs[0]] = HG38_DB_DICT[dbs[0]]
+
+    for idx, alle in enumerate(alleles):
+        # bio/rs199509194_Allele_<hashed ‘G'>)
+        row = deepcopy(HG38_ALLELES_DICT)
+        dcid_allele = f'{dcid}_Allele_{generate_short_id(alle)}'
+        name_allele = f'"{dcid} Allele {alle}"'
+        row['dcid'] = dcid
+        row['dcid_allele'] = dcid_allele
+        row['name_allele'] = name_allele
+        row['variant'] = alle
+        if len(hgvs) > idx:
+            row['CLNHGVS'] = hgvs[idx]
+        if db_set:
+            for db in db_set:
+                row[db] = db_set[db]
+        file.writerow(row)
+
+
+def process_hg38_alleles_disease_drug(dcid, dict_info, disease_file, drug_file) -> None:
+    dcid_disease = None
+    name_disease = None
+    is_drug_response = False
+    dcid_compound = []
+    global CUI_DCID_MAPPING_DICT
+    if 'CLNDISDB' in dict_info:
+        values = [x for x in dict_info['CLNDISDB'].split(',') if len(x) > 1]
+        for val in values:
+            if 'MedGen' in val:
+                cui = val.split(':')[1]
+                dcid_disease = f'bio/{cui}'
+                try:
+                    name_disease = CUI_DCID_MAPPING_DICT[cui]['name']
+                    is_drug_response = CUI_DCID_MAPPING_DICT[cui]['is_drug_response']
+                    dcid_compound.append(CUI_DCID_MAPPING_DICT[cui]['dcid'])
+                except:
+                    pass
+
+    if not dcid_disease or not name_disease:
+        if 'CLNDN' in dict_info:
+            if not dcid_disease:
+                dcid_disease = f'bio/{get_disease_pascal_case(dict_info["CLNDN"])}'
+            if not name_disease:
+                name_disease = dict_info['CLNDN']
+                name_disease = name_disease.replace('_', ' ').replace('x2c_', '').replace(
+                    '-', '').replace(',', ' ').replace('\\', ' ')
+
+    row = deepcopy(HG38_ALLELES_DISEASE_DICT)
+    row['dcid'] = dcid
+    row['dcid_disease'] = dcid_disease
+    row['name_disease'] = name_disease
+
+    if 'CLNACC' in dict_info:
+        acc = [d for d in dict_info['CLNACC'].split(",") if len(d) > 1]
+        row['dcid_allele_disease_association'] = f'bio/{acc[0]}'
+        row['name_allele_disease_association'] = acc[0]
+        row['CLNACC'] = ",".join(acc)
+
+    if 'CLNORIGIN' in dict_info:
+        row['alleleOrigin'] = writeOrigin(dict_info['CLNORIGIN'])
+
+    if 'CLNSIG' in dict_info:
+        sigs_lst = set()
+        for sigs in dict_info['CLNSIG'].split(","):
+            for sig in sigs.split('|'):
+                for s in sig.split('/'):
+                    if len(s) > 0 and s != '.':
+                        sigs_lst.add(HG38_SIG_DICT[s])
+
+        row['clinicalSignificance'] = ','.join(sigs_lst)
+
+    if 'CLNDISDB' in dict_info:
+        db_dict = getDatabasetoColMapping(dict_info['CLNDISDB'])
+        for db in db_dict:
+            row[db] = ",".join(db_dict[db])
+
+    if 'CLNREVSTAT' in dict_info:
+        stats = set(dict_info['CLNREVSTAT'].replace('.,', '').split(','))
+        row['clinVarReviewStatus'] = ",".join(stats)
+
+    if is_drug_response:
+        row['dcid_compound'] = ','.join(dcid_compound)
+        drug_file.writerow(row)
+    else:
+        disease_file.writerow(row)
+
+
+def process_hg38_freq(dcid, ref, alt, dict_info, rsID, file):
+    freq_lst = []
+    if 'FREQ' in dict_info:
+        freq_lst = dict_info['FREQ'].split('|')
+        for freq in freq_lst:
+            row = parse_hg38_freq_row(dcid, ref, alt, rsID, freq)
+            if row:
+                file.writerow(row)
+
+
+def parse_hg38_freq_row(dcid, ref, alt, rsID, freq):
+    """ parse freq entry to row dict
+
+    Args:
+        dcid (_type_): dcid
+        ref (_type_): referenceAllele
+        alt (_type_): alternativeAllele
+        rsID (_type_): rsID
+        freq (_type_): freq entry
+
+    Returns:
+        _type_: row dict
+    """
+    row = deepcopy(HG38_FREQ_DICT)
+    row['dcid'] = dcid
+    row['rsID'] = rsID
+    key, val = freq.split(':')
+    row['dcid_freq'] = f'{dcid}_{key}'
+    row['name_freq'] = f'"{rsID} {key} Population Frequency"'
+    freq_val = val.split(',')
+    ref_freq = f'{ref}:{freq_val[0]}'
+    alt_freq_lst = []
+    for idx, a in enumerate(alt.split(',')):
+        if idx == 0:
+            alt_freq_lst.append(f'{a}:{freq_val[1]}')
+        else:
+            alt_freq_lst.append(f'{a}:0.0')
+
+    row['alleleFrequency'] = f'{ref_freq},{",".join(alt_freq_lst)}'
+    row['measuredPopulation'] = key
+    return row
+
+
+def get_disease_pascal_case(s: str, sep=None) -> str:
+    s = s.replace('x2c_', '').replace('-', '').replace(',', '')
+
+    if sep and sep in s:
+        if '\\' in s:
+            s = s.replace('\\', sep)
+        return "".join(map(lambda x: x[:1].upper() + x[1:], s.split(sep)))
+    else:
+        return s[:1].upper() + s[1:]
+
+
+def getDatabasetoColMapping(value):
+    global GENE_ID_DCID_MAPPING
+    values = [i for i in value.split(',') if i != "." and len(i) > 0]  # split into entries
+    result_dict = {}
+    for v in values:
+        db_lst = []
+        if '\\' in v:
+            db_lst.extend(v.split('\\'))
+        elif '/' in v:
+            db_lst.extend(v.split('/'))
+        else:
+            db_lst.append(v)
+
+        for dbs in db_lst:
+            if ':' in dbs:
+                db, val = dbs.split(':', maxsplit=1)
+                if db == 'Human_Phenotype_Ontology':
+                    if 'humanPhenotypeOntologyID' in result_dict:
+                        result_dict['humanPhenotypeOntologyID'].append(val)
+                    else:
+                        result_dict['humanPhenotypeOntologyID'] = [val]
+                elif db == 'MeSH':
+                    if 'medicalSubjectHeadingID' in result_dict:
+                        result_dict['medicalSubjectHeadingID'].append(f'bio/{val}')
+                    else:
+                        result_dict['medicalSubjectHeadingID'] = [f'bio/{val}']
+                elif db == 'Gene':
+                    try:
+                        if 'Gene' in result_dict:
+                            result_dict['geneID'].append(GENE_ID_DCID_MAPPING[val])
+                        else:
+                            result_dict['geneID'] = [GENE_ID_DCID_MAPPING[val]]
+                    except:
+                        logging.info(f"Gene {val} not available in  GENE_ID_DCID_MAPPING")
+
+                else:
+                    try:
+                        if HG38_DB_COL_MAPPING[db] in result_dict:
+                            result_dict[HG38_DB_COL_MAPPING[db]].append(val)
+                        else:
+                            result_dict[HG38_DB_COL_MAPPING[db]] = [val]
+                    except:
+                        if db in result_dict:
+                            result_dict[db].append(val)
+                        else:
+                            result_dict[db] = [val]
+
+    return result_dict
+
+
+def write_review_status(value):
+    # extract entries
+    values = re.findall(r"[\w']+", value)
+    # remove duplicates
+    values = set(list(values))
+    line = []
+    for value in values:
+        if value in REVIEW_STATUS_ENUM:
+            line.append(REVIEW_STATUS_ENUM[value])
+        else:
+            logging.info(f'Review Status Error: {value}')
+    return line
+
+
+def writeOrigin(value):
+    # name = 'alleleOrigin'
+    original = deepcopy(value)  # save copy of original value
+    values = [i for i in re.split('\||,', value) if i != "." and len(i) > 0]  # split into entries
+    line = []
+    for v in values:
+        v = int(v)  # convert to integer
+        if v >= 1073741824:
+            line.append('dcs:VariantAlleleOriginOther')
+            v -= 1073741824
+        if v == 0:
+            line.append('dcs:VariantAlleleOriginUnspecified')
+            next
+        if v >= 1024:
+            line.append('dcs:VariantAlleleOriginOther')
+            v -= 1024
+        if v >= 512:
+            line.append('dcs:VariantAlleleOriginTestedInconclusive')
+            v -= 512
+        if v >= 256:
+            line.append('dcs:VariantAlleleOriginNotTested')
+            v -= 256
+        if v >= 128:
+            line.append('dcs:VariantAlleleOriginUniParental')
+            v -= 128
+        if v >= 64:
+            line.append('dcs:VariantAlleleOriginBiParenal')
+            v -= 64
+        if v >= 32:
+            line.append('dcs:VariantAlleleOriginDeNovo')
+            v -= 32
+        if v >= 16:
+            line.append('dcs:VariantAlleleOriginMaternal')
+            v -= 16
+        if v >= 8:
+            line.append('dcs:VariantAlleleOriginPaternal')
+            v -= 8
+        if v >= 4:
+            line.append('dcs:VariantAlleleOriginInherited')
+            v -= 4
+        if v >= 2:
+            line.append('dcs:VariantAlleleOriginSomatic')
+            v -= 2
+        if v >= 1:
+            line.append('dcs:VariantAlleleOriginGermline')
+            v -= 1
+        if v > 0:
+            logging.info(f'Allele Origin Error: value = {value}, original = {original}')
+    return ','.join(line)
+
+
+def generate_short_id(input_str):
+    fp = robust_farm_fingerprint_64(input_str)
+    res = []
+    for i in range(0, _LONG_ID_LEN):
+        idx = fp & 0x1f
+        res.append(_BASE_32_MAP[idx])
+        fp = fp >> _NUM_BITS_32
+        if fp == 0:
+            break
+    return u''.join(res)
+
+
+# define functions
+def robust_farm_fingerprint_64(data: typing.Union[str, bytes]) -> int:
+    """Calculates a 64-bit FarmHash fingerprint, robust against different input types.
+
+    Args:
+        data: The data to fingerprint (either a string or bytes).
+
+    Returns:
+        The 64-bit fingerprint as an integer.
+    """
+
+    if isinstance(data, str):
+        data = data.encode("utf-8")  # Ensure bytes for hashing consistency
+
+    # Modified from FarmHash (reference: https://github.com/google/farmhash)
+    size = len(data)
+    h = size * 0x811c9dc5
+
+    if size >= 8:
+        h = (h ^ struct.unpack("<Q", data[:8])[0]) * 0x811c9dc5
+        data = data[8:]
+        size -= 8
+
+    if size >= 4:
+        h = (h ^ struct.unpack("<I", data[:4])[0]) * 0x811c9dc5
+        data = data[4:]
+        size -= 4
+
+    if size >= 2:
+        h = (h ^ struct.unpack("<H", data[:2])[0]) * 0x811c9dc5
+        data = data[2:]
+        size -= 2
+
+    if size == 1:
+        h = (h ^ data[0]) * 0x811c9dc5
+
+    h = (h ^ h >> 33) * 0xc2b2ae35
+    h = h ^ h >> 29
+    return h
+
+
+def main(input_file_name: str) -> None:
+    """ Main method
+
+    Args:
+        input_file (str): file path to process
+    """
+    logging.set_verbosity('info')
+    logging.info(f"HG38 processing input file {input_file_name}  - {dt.now()}")
+    start_time = time.time()
+
+    input_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir, input_file_name)
+
+    output_file_name = input_file_name.split('.')[0] + '.csv'
+    dbsnp_hg38_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir,
+                                        dbsnp_hg38_file_name.format(output_file_name))
+    dbsnp_hg38_alleles_file_path = os.path.join(
+        MODULE_DIR + '/' + _FLAGS.output_dir, dbsnp_hg38_alleles_file_name.format(output_file_name))
+    dbsnp_hg38_allele_disease_file_path = os.path.join(
+        MODULE_DIR + '/' + _FLAGS.output_dir,
+        dbsnp_hg38_allele_disease_file_name.format(output_file_name))
+    dbsnp_hg38_allele_drug_file_path = os.path.join(
+        MODULE_DIR + '/' + _FLAGS.output_dir,
+        dbsnp_hg38_allele_drug_file_name.format(output_file_name))
+    dbsnp_hg38_freq_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir,
+                                             dbsnp_hg38_freq_file_name.format(output_file_name))
+    logging.info("load mapping data")
+    load_mapping_data(
+        os.path.join(MODULE_DIR + '/' + _FLAGS.mapping_file_dir, cui_dict_mapping_file_name))
+    logging.info("load JSON data")
+    load_json(os.path.join(MODULE_DIR + '/' + _FLAGS.json_dir, hg19_genome_assembly_file_name),
+              os.path.join(MODULE_DIR + '/' + _FLAGS.json_dir, hg38_genome_assembly_file_name),
+              os.path.join(MODULE_DIR, _FLAGS.gene_id_dcid_mapping))
+
+    process_input_csv(input_file_path, dbsnp_hg38_file_path, dbsnp_hg38_alleles_file_path,
+                      dbsnp_hg38_allele_disease_file_path, dbsnp_hg38_allele_drug_file_path,
+                      dbsnp_hg38_freq_file_path)
+
+    global DB_NOT_AVAILABLE
+    if len(DB_NOT_AVAILABLE) > 0:
+        logging.info("Database not available in the DB DICT..")
+        for db in DB_NOT_AVAILABLE:
+            logging.info(db)
+
+    logging.info(f"Time taken to process {((time.time() - start_time)/60):.2f} - {dt.now()}")
+
+
+if __name__ == '__main__':
+    main(_FLAGS.input_file)
diff --git a/scripts/biomedical/NCBI_dbSNP/scripts/process_gene_condition_source.py b/scripts/biomedical/NCBI_dbSNP/scripts/process_gene_condition_source.py
new file mode 100644
index 000000000..60b2fca05
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_gene_condition_source.py
@@ -0,0 +1,73 @@
+import csv
+import os
+import sys
+import copy
+from absl import flags
+from absl import logging
+from dateutil import parser as ds
+
+# Parent of the directory that contains this script; the input and output
+# directories are resolved relative to it.
+MODULE_DIR = os.path.dirname(os.path.dirname(__file__))
+
+_FLAGS = flags.FLAGS
+
+flags.DEFINE_string('output_dir', 'output', 'Output directory for generated files.')
+flags.DEFINE_string('input_dir', 'input', 'Input directory where .vcf files downloaded.')
+
+# Flags are parsed when the module is loaded, not inside main().
+_FLAGS(sys.argv)
+# Name of the tab-separated input file inside the input directory.
+gene_condition_source_id_file_name = 'gene_condition_source_id'
+# Name of the CSV written to the output directory.
+output_csv_file_name = 'clinvar_diesease_gene.csv'
+# Output columns in header order; main() also deep-copies this dict as the
+# empty template for every row.
+CSV_DICT = {
+    'dcid': '',
+    'dcid_disease': '',
+    'dcid_gene': '',
+    'name': '',
+    'isCausal': '',
+    'sourceName': '',
+    'LastUpdated': ''
+}
+
+
+def main() -> None:
+    global HG19_REFSEQ_DICT, HG38_DCID_DICT
+    input_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir,
+                             gene_condition_source_id_file_name)
+    output_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, output_csv_file_name)
+    date_patterns = ['%b %d %Y', "%d %b %Y"]
+    with open(input_csv, 'r') as input_file_csv:
+        with open(output_csv, 'w') as output_file_csv:
+            writer = csv.DictWriter(output_file_csv, CSV_DICT)
+            writer.writeheader()
+
+            # skip first row
+            next(input_file_csv)
+            for line in input_file_csv:
+                input_row = line.replace('\n', '').split('\t')
+                current_row = copy.deepcopy(CSV_DICT)
+                geneSymbol = ''
+                isCausal = False
+
+                if len(input_row[1]) > 0:
+                    geneSymbol = input_row[1]
+                    isCausal = True
+                else:
+                    geneSymbol = input_row[2]
+
+                current_row['dcid'] = f'bio/{input_row[3]}_{geneSymbol}'
+                current_row['dcid_disease'] = f'bio/{input_row[3]}'
+                current_row['dcid_gene'] = f'bio/{geneSymbol}'
+                current_row['name'] = f'"{input_row[4]} and {geneSymbol} Association"'
+                current_row['isCausal'] = isCausal
+                current_row['sourceName'] = input_row[5]
+                if len(input_row[8]) > 0:
+                    try:
+                        LastUpdated = ds.parse(input_row[8])
+                        current_row['LastUpdated'] = LastUpdated.strftime('%Y-%m-%d')
+                    except:
+                        print(f"LastUpdated date format issue {input_row[8]}")
+
+                # write to output
+                writer.writerow(current_row)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/biomedical/NCBI_dbSNP/scripts/process_genome_assembly_report.py b/scripts/biomedical/NCBI_dbSNP/scripts/process_genome_assembly_report.py
new file mode 100644
index 000000000..84688ca4d
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_genome_assembly_report.py
@@ -0,0 +1,109 @@
+import csv
+import os
+import sys
+import copy
+import json
+from absl import flags
+from absl import logging
+
+# Output columns in header order; main() also deep-copies this dict as the
+# empty template for every row.
+CSV_DICT = {
+    'dcid': '',
+    'dcid_quantity': '',
+    'name_quantity': '',
+    'Sequence-Length': '',
+    'Sequence-Role': '',
+    'GenBank-Accn': '',
+    'Assembly-Unit': '',
+    'Assigned-Molecule': '',
+    'RefSeq-Accn': '',
+    'Sequence-Name': '',
+    'UCSC-style-name': ''
+}
+
+# Sequence-role text found in the report -> enum name written to the
+# 'Sequence-Role' column. main() indexes this dict directly, so a role that
+# is not listed raises KeyError.
+SEQUENCE_ROLE_DICT = {
+    'assembled-molecule': 'DNASequenceRoleAssembledMolecule',
+    'chromosome': 'DNASequenceRoleChromosome',
+    'unlocalized-scaffold': 'DNASequenceRoleUnlocalizedScaffold',
+    'unplaced-scaffold': 'DNASequenceRoleUnplacedScaffold',
+    'alt-scaffold': 'DNASequenceRoleAltScaffold',
+    'fix-patch': 'DNASequenceRoleFixPatch',
+    'novel-patch': 'DNASequenceRoleNovelPatch'
+}
+
+# Parent of the directory that contains this script; the input and output
+# directories are resolved relative to it.
+MODULE_DIR = os.path.dirname(os.path.dirname(__file__))
+
+_FLAGS = flags.FLAGS
+flags.DEFINE_string('output_dir', 'output', 'Output directory for generated files.')
+flags.DEFINE_string('input_dir', 'input', 'Input directory where .dmp files downloaded.')
+# Flags are parsed when the module is loaded, not inside main().
+_FLAGS(sys.argv)
+
+# Assembly report files expected in the input directory.
+GRCh37_input_file_name = 'GCA_000001405.14_GRCh37.p13_assembly_report.txt'
+GRCh38_input_file_name = 'GCA_000001405.29_GRCh38.p14_assembly_report.txt'
+
+# CSV files written to the output directory, one per assembly.
+GRCh37_output_file_name = 'ncbi_GRCh37_genome_assembly_report.csv'
+GRCh38_output_file_name = 'ncbi_GRCh38_genome_assembly_report.csv'
+
+# JSON lookup files (dcid, name, refSeqAccession per sequence) written to the
+# output directory, one per assembly.
+hg19_genome_assembly_file_name = 'hg19_genome_assembly_report.json'
+hg38_genome_assembly_file_name = 'hg38_genome_assembly_report.json'
+
+
+def main(input_file: str, output_file: str, json_file_name: str, assembly_type: str) -> None:
+    input_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir, input_file)
+    output_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, output_file)
+    json_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, json_file_name)
+    # write header
+    # with open(output_csv, 'w') as output_file_csv:
+    #     writer = csv.DictWriter(output_file_csv, CSV_DICT)
+    #     writer.writeheader()
+    genome_assembly_json = []
+    with open(input_csv, 'r') as input_file_csv:
+        with open(output_csv, 'w') as output_file_csv:
+            writer = csv.DictWriter(output_file_csv, CSV_DICT)
+            writer.writeheader()
+
+            for line in input_file_csv:
+                # skip row
+                if line[0] == '#':
+                    continue
+                # process this row
+                else:
+                    input_row = line.replace('\n', '').split('\t')
+                    current_row = copy.deepcopy(CSV_DICT)
+                    # synonym
+                    current_row['Sequence-Name'] = input_row[1]
+                    # dnaSequenceRole
+                    current_row['Sequence-Role'] = SEQUENCE_ROLE_DICT[input_row[1]]
+                    # inChromosome
+                    if len(input_row[2]) > 1 and input_row[2] != input_row[0]:
+                        # bio/<genome_assembly>_chr1)
+                        current_row['Assigned-Molecule'] = f'bio/{assembly_type}_{input_row[9]}'
+
+                    current_row['GenBank-Accn'] = input_row[4]
+                    current_row['RefSeq-Accn'] = input_row[6]
+                    current_row['Assembly-Unit'] = input_row[7]
+                    current_row['Sequence-Length'] = input_row[8]
+                    current_row['UCSC-style-name'] = input_row[9]
+                    dcid = f"bio/{assembly_type}_{input_row[9]}"
+                    current_row['dcid'] = dcid
+                    current_row['dcid_quantity'] = f'BasePairs{input_row[8]}'
+                    current_row['name_quantity'] = f'"BasePairs {input_row[8]}"'
+
+                    # write to output
+                    writer.writerow(current_row)
+
+                    # write to json object
+                    genome_assembly_json.append({
+                        "dcid": dcid,
+                        "name": input_row[9],
+                        "refSeqAccession": input_row[6]
+                    })
+
+    with open(json_file_path, 'w', encoding='utf-8') as json_file:
+        json.dump(genome_assembly_json, json_file, ensure_ascii=False, indent=1)
+
+    print(f"Assembly file {assembly_type} completed.")
+
+
+if __name__ == '__main__':
+    main(GRCh37_input_file_name, GRCh37_output_file_name, hg19_genome_assembly_file_name, 'hg19')
+    main(GRCh38_input_file_name, GRCh38_output_file_name, hg38_genome_assembly_file_name, 'hg38')
diff --git a/scripts/biomedical/NCBI_dbSNP/scripts/process_medgen.py b/scripts/biomedical/NCBI_dbSNP/scripts/process_medgen.py
new file mode 100644
index 000000000..820392b0a
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_medgen.py
@@ -0,0 +1,511 @@
+import os
+import sys
+import copy
+import csv
+import time
+from absl import flags
+from absl import logging
+
+MODULE_DIR = os.path.dirname(os.path.dirname(__file__))  # parent of this file's directory
+_FLAGS = flags.FLAGS
+
+flags.DEFINE_string('output_dir', 'output', 'Output directory for generated files.')
+flags.DEFINE_string('input_dir', 'input', 'Input directory with the downloaded MedGen files.')
+
+_FLAGS(sys.argv)  # parse the command line now; both flags have to be defined before this call
+
+MGSTY_file_name = 'MGSTY.txt'  # the four inputs below are read from --input_dir
+NAMES_file_name = 'NAMES.txt'
+MGDEF_file_name = 'MGDEF.txt'
+MedGenIDMappings_file_name = 'MedGenIDMappings.txt'
+output_file_name = 'medgen.csv'  # the two outputs below are written to --output_dir
+cui_dcid_mappings_file_name = 'cui_dcid_mappings.csv'
+
+CSV_DICT = {  # default output row; its keys are the columns (in order) of the main output CSV
+    'dcid': '',
+    'name': '',
+    'CUI': '',
+    'source': '',
+    'DEF': '',
+    'source_definition': '',
+    'STY': '',
+    'GARD': '',
+    'HPO': '',
+    'MONDO': '',
+    'MeSH': '',
+    'MedGen': '',
+    'OMIM': '',
+    'OMIM_Phenotypic_Series': '',
+    'OMIM_Allelic_Variant': '',
+    'Orphanet': '',
+    'SNOMEDCT_US': '',
+    'dcid_compound': '',
+    'dcid_atc_code': '',
+    'dcid_mesh': '',
+    'is_drug_response': False
+}
+
+CUI_DCID_MAPPING_DICT = {'dcid': '', 'CUI': '', 'name': '', 'is_drug_response': ''}  # mapping CSV columns
+
+CUI_ID_SET = set()  # CUIs seen in the input files; filled through add_to_cui_set
+
+SOURCE_DICT = {  # source key of a NAMES row -> text written to the 'source' column
+    'GTR': 'Genetic Testing Registry',
+    'MSH': 'Medical Subject Headings',
+    'NCI': 'NCI Thesaurus',
+    'OMIM': 'Online Mendelian Inheritance in Man',
+    'ORDO': 'Orphanet Rare Disease Ontology (ORDO)',
+    'SNOMEDCT_US': 'US Edition of SNOMED CT'
+}
+
+SOURCE_DEFINITION_DICT = {  # source key of an MGDEF row -> 'source_definition' column value
+    'AIR':
+        'dcs:DiseaseSourceDefinitionAiRheum',
+    'AOT':
+        'dcs:DiseaseSourceDefinitionAuthorizedOsteopathicThesaurus',
+    'CCC':
+        'dcs:DiseaseSourceDefinitionClinicalCareClassificationTwoPointFive',
+    'CHV':
+        'dcs:DiseaseSourceDefinitionConsumerHealthVocabulary',
+    'CSP':
+        'dcs:DiseaseSourceDefinitionCrispThesaurus',
+    'Clinical Pharmacogenetics Implementation Consortium':
+        'dcs:DiseaseSourceDefinitionClinicalPharmacogeneticsImplementationConsortium',
+    'GO':
+        'dcs:DiseaseSourceDefinitionGeneOntology',
+    'GeneReviews':
+        'dcs:DiseaseSourceDefinitionGeneReviews',
+    'HL7V3.0':
+        'dcs:DiseaseSourceDefinitionHL7VocabularyVersionThreePointZero',
+    'HPO':
+        'dcs:DiseaseSourceDefinitionHumanPhenotypeOntology',
+    'ICF-CY':  # NOTE(review): the value below reads "FunctioninDisability" - confirm with the schema
+        'dcs:DiseaseSourceDefinitionInternationalClassificationOfFunctioninDisabilityAndHealthForChildrenAndYouth',
+    'JABL':
+        'dcs:DiseaseSourceDefinitionOnlineCongenitalMultipleAnomalyMentalRetardationSyndromes',
+    'LNC':
+        'dcs:DiseaseSourceDefinitionLoinc',
+    'MEDLINEPLUS':
+        'dcs:DiseaseSourceDefinitionMedlinePlus',
+    'MONDO':
+        'dcs:DiseaseSourceDefinitionMonarchInitiative',
+    'MSH':
+        'dcs:DiseaseSourceDefinitionMedicalSubjectHeading',
+    'Medical Genetics Summaries':
+        'dcs:DiseaseSourceDefinitionMedicalGeneticsSummaries',
+    'MedlinePlus Genetics':
+        'dcs:DiseaseSourceDefinitionMedlinePlusGenetics',
+    'NANDA-I':
+        'dcs:DiseaseSourceDefinitionNANDAITaxonomyII',
+    'NCBI curation':
+        'dcs:DiseaseSourceDefinitionNCBI',
+    'NCI':
+        'dcs:DiseaseSourceDefinitionNCIThesaurus',
+    'NOC':
+        'dcs:DiseaseSourceDefinitionNursingOutcomesClassificationThirdEdition',
+    'OMIM':
+        'dcs:DiseaseSourceDefinitionOnlineMendelianInheritanceInMan',
+    'OMS':
+        'dcs:DiseaseSourceDefinitionOmahaSystem',
+    'ORDO':
+        'dcs:DiseaseSourceDefinitionOrphanetRareDiseaseOntology',
+    'ORPHANET':
+        'dcs:DiseaseSourceDefinitionOrphanet',
+    'PDQ':
+        'dcs:DiseaseSourceDefinitionPhysicianDataQuery',
+    'PNDS':
+        'dcs:DiseaseSourceDefinitionPerioperativeNursingDataSetSecondEdition',
+    'PSY':
+        'dcs:DiseaseSourceDefinitionThesaurusOfPsychologicalIndexTerms',
+    'PharmGKB':
+        'dcs:DiseaseSourceDefinitionPharmGKB',
+    'SNOMEDCT_US':
+        'dcs:DiseaseSourceDefinitionSnomedCtUs'
+}
+
+DCID_CUI_ASSOCIATE = {  # drug-response CUI -> name plus compound / ATC code / MeSH dcids
+    'C0568062': {
+        'name': 'methotrexate response - Toxicity',
+        'dcid_compound': 'dcs:chem/CID126941',
+        'dcid_atc_code': 'dcs:chem/L04AX03',
+        'dcid_mesh': 'dcs:bio/D008727'
+    },
+    'CN236531': {
+        'name': 'fentanyl response - Dosage',
+        'dcid_compound': 'dcs:chem/CID3345',
+        'dcid_atc_code': 'dcs:chem/N01AH01',
+        'dcid_mesh': 'dcs:bio/D005283'
+    },
+    'CN236536': {
+        'name': 'methadone response - Dosage',
+        'dcid_compound': 'dcs:chem/CID4095',
+        'dcid_atc_code': 'dcs:chem/N07BC02',
+        'dcid_mesh': 'dcs:bio/D008691'
+    },
+    'CN236588': {
+        'name': 'warfarin response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID54678486',
+        'dcid_atc_code': 'dcs:chem/B01AA03',
+        'dcid_mesh': 'dcs:bio/D014859'
+    },
+    'CN262133': {
+        'name': 'vincristine response - Toxicity/ADR',
+        'dcid_compound': 'dcs:chem/CID5978',
+        'dcid_atc_code': 'dcs:chem/L01CA02',
+        'dcid_mesh': 'dcs:bio/D014750'
+    },
+    'CN322717': {
+        'name':
+            'interferons, peginterferon alfa-2a, peginterferon alfa-2b, and ribavirin response - Efficacy',
+        'dcid_compound':
+            'dcs:bio/CHEMBL1201560, dcs:bio/CHEMBL1201561, dcs:chem/CID37542',
+        'dcid_atc_code':
+            'dcs:chem/L03AB11, dcs:chem/L03AB10, dcs:chem/J05AP01',
+        'dcid_mesh':
+            'dcs:bio/C100416, dcs:bio/C417083,dcs:bio/D012254'
+    },
+    'CN322718': {
+        'name': 'peginterferon alfa-2a, peginterferon alfa-2b, and ribavirin response - Efficacy',
+        'dcid_compound': 'dcs:bio/CHEMBL1201560, dcs:bio/CHEMBL1201561, dcs:chem/CID37542',
+        'dcid_atc_code': 'dcs:chem/L03AB11, dcs:chem/L03AB10, dcs:chem/J05AP01',
+        'dcid_mesh': 'dcs:bio/C100416, dcs:bio/C417083,dcs:bio/D012254'
+    },
+    'CN322719': {
+        'name':
+            'peginterferon alfa-2a, peginterferon alfa-2b, ribavirin, and telaprevir response - Efficacy',
+        'dcid_compound':
+            'dcs:bio/CHEMBL1201560, dcs:bio/CHEMBL1201561, dcs:chem/CID37542, dcs:chem/CID3010818',
+        'dcid_atc_code':
+            'dcs:chem/L03AB11, dcs:chem/L03AB10, dcs:chem/J05AP01, dcs:chem/J05AP02',
+        'dcid_mesh':
+            'dcs:bio/C100416, dcs:bio/C417083,dcs:bio/D012254, dcs:bio/C486464'
+    },
+    'CN322720': {
+        'name': 'Ace Inhibitors, Plain response - Toxicity/ADR',
+        'dcid_compound': '',
+        'dcid_atc_code': 'dcs:chem/C09A',
+        'dcid_mesh': 'dcs:bio/D000806'
+    },
+    'CN322721': {
+        'name': 'acenocoumarol response - Dosage',
+        'dcid_compound': 'dcs:chem/CID54676537',
+        'dcid_atc_code': 'dcs:chem/B01AA07',
+        'dcid_mesh': 'dcs:bio/D000074'
+    },
+    'CN322722': {
+        'name': 'adalimumab response - Efficacy',
+        'dcid_compound': 'dcs:bio/CHEMBL1201580',
+        'dcid_atc_code': 'dcs:chem/L04AB04',
+        'dcid_mesh': 'dcs:bio/D000068879'
+    },
+    'CN322723': {
+        'name': 'alfentanil response - Metabolism/PK',
+        'dcid_compound': 'dcs:chem/CID51263',
+        'dcid_atc_code': 'dcs:chem/N01AH02',
+        'dcid_mesh': 'dcs:bio/D015760'
+    },
+    'CN322724': {
+        'name': 'atorvastatin response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID60823',
+        'dcid_atc_code': 'dcs:chem/C10AA05',
+        'dcid_mesh': 'dcs:bio/D000069059'
+    },
+    'CN322725': {
+        'name': 'captopril response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID44093',
+        'dcid_atc_code': 'dcs:chem/C09AA01',
+        'dcid_mesh': 'dcs:bio/D002216'
+    },
+    'CN322726': {
+        'name': 'carbamazepine response - Dosage',
+        'dcid_compound': 'dcs:chem/CID2554',
+        'dcid_atc_code': 'dcs:chem/N03AF01',
+        'dcid_mesh': 'dcs:bio/D002220'
+    },
+    'CN322727': {
+        'name': 'clopidogrel response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID60606',
+        'dcid_atc_code': 'dcs:chem/B01AC04',
+        'dcid_mesh': 'dcs:bio/D000077144'
+    },
+    'CN322728': {
+        'name': 'phenprocoumon response - Dosage',
+        'dcid_compound': 'dcs:chem/CID54680692',
+        'dcid_atc_code': 'dcs:chem/B01AA04',
+        'dcid_mesh': 'dcs:bio/D010644'
+    },
+    'CN322729': {
+        'name': 'warfarin response - Dosage',
+        'dcid_compound': 'dcs:chem/CID54678486',
+        'dcid_atc_code': 'dcs:chem/B01AA03',
+        'dcid_mesh': 'dcs:bio/D014859'
+    },
+    'CN322730': {
+        'name': 'efavirenz response - Metabolism/PK',
+        'dcid_compound': 'dcs:chem/CID64139',
+        'dcid_atc_code': 'dcs:chem/J05AG03',
+        'dcid_mesh': 'dcs:bio/C098320'
+    },
+    'CN322731': {
+        'name': 'erlotinib response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID176870',
+        'dcid_atc_code': 'dcs:chem/L01EB02',
+        'dcid_mesh': 'dcs:bio/D000069347'
+    },
+    'CN322732': {
+        'name': 'etanercept response - Efficacy',
+        'dcid_compound': 'dcs:bio/CHEMBL1201572',
+        'dcid_atc_code': 'dcs:chem/L04AB01',
+        'dcid_mesh': 'dcs:bio/D000068800'
+    },
+    'CN322733': {
+        'name': 'gefitinib response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID123631',
+        'dcid_atc_code': 'dcs:chem/L01EB01',
+        'dcid_mesh': 'dcs:bio/D000077156'
+    },
+    'CN322734': {
+        'name': 'hydrochlorothiazide response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID3639',
+        'dcid_atc_code': 'dcs:chem/C03AA03',
+        'dcid_mesh': 'dcs:bio/D006852'
+    },
+    'CN322735': {
+        'name': 'ivacaftor response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID16220172',
+        'dcid_atc_code': 'dcs:chem/R07AX02',
+        'dcid_mesh': 'dcs:bio/C545203'
+    },
+    'CN322736': {
+        'name': 'methotrexate response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID126941',
+        'dcid_atc_code': 'dcs:chem/L04AX03',
+        'dcid_mesh': 'dcs:bio/D008727'
+    },
+    'CN322737': {
+        'name': 'pravastatin response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID54687',
+        'dcid_atc_code': 'dcs:chem/C10AA03',
+        'dcid_mesh': 'dcs:bio/D017035'
+    },
+    'CN322738': {
+        'name': 'rosuvastatin response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID446157',
+        'dcid_atc_code': 'dcs:chem/C10AA07',
+        'dcid_mesh': 'dcs:bio/D000068718'
+    },
+    'CN322739': {
+        'name': 'salmeterol response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID5152',
+        'dcid_atc_code': 'dcs:chem/R03AC12',
+        'dcid_mesh': 'dcs:bio/D000068299'
+    },
+    'CN322746': {
+        'name': 'ivacaftor / lumacaftor response',
+        'dcid_compound': 'dcs:chem/CID71494926',
+        'dcid_atc_code': 'dcs:chem/R07AX30',
+        'dcid_mesh': 'dcs:bio/C000599212'
+    },
+    'CN322747': {
+        'name': 'peginterferon alfa-2a response - Efficacy',
+        'dcid_compound': 'dcs:bio/CHEMBL1201560',
+        'dcid_atc_code': 'dcs:chem/L03AB11',
+        'dcid_mesh': 'dcs:bio/C100416'
+    },
+    'CN322748': {
+        'name': 'peginterferon alfa-2b response - Efficacy',
+        'dcid_compound': 'dcs:bio/CHEMBL1201561',
+        'dcid_atc_code': 'dcs:chem/L03AB10',
+        'dcid_mesh': 'dcs:bio/C417083'
+    },
+    'CN322749': {
+        'name': 'ribavirin response - Efficacy',
+        'dcid_compound': 'dcs:chem/CID37542',
+        'dcid_atc_code': 'dcs:chem/J05AP01',
+        'dcid_mesh': 'dcs:bio/D012254'
+    }
+}
+
+
+def get_pascal_case(s: str, sep=None):
+    """Upper-case the first character of `s`, or of each `sep`-separated piece (joined without `sep`)."""
+    # A missing/empty `sep`, or one that does not occur in `s`, leaves `s` as a single piece.
+    pieces = s.split(sep) if sep and sep in s else [s]
+    return "".join(piece[:1].upper() + piece[1:] for piece in pieces)
+
+
+def main_process_csv(MGSTY_file_path: str, NAMES_file_path: str, MGDEF_file_path,
+                     MedGenIDMappings_file_path: str, output_file_path: str,
+                     cui_dcid_mapping_file_path: str) -> None:
+    """Merge the MedGen source files into one CSV row per concept unique identifier (CUI).
+
+    Every input is a '|'-delimited text file whose first line is a header. The CUIs found in the
+    inputs are added to the module-level CUI_ID_SET, one record (a copy of CSV_DICT) is created
+    per CUI, and the records are then filled in from each file in turn. When several rows set
+    the same column of the same CUI, the last row read wins.
+
+    Args:
+        MGSTY_file_path: field 0 is the CUI, field 3 is turned into the 'STY' value.
+        NAMES_file_path: fields are CUI, name, source key; rows are used only when field 3
+            is 'N'.
+        MGDEF_file_path: fields are CUI, definition, source key; rows are used only when
+            field 3 is 'N'. A definition may contain raw newlines, so a record may span lines.
+        MedGenIDMappings_file_path: field 0 is the CUI, field 2 the external id and field 3
+            the name of the vocabulary the id belongs to.
+        output_file_path: CSV file to write, with the CSV_DICT columns.
+        cui_dcid_mapping_file_path: CSV file to write, with the CUI_DCID_MAPPING_DICT columns.
+    """
+    # MGDEF is needed in both passes; keep its re-joined records in memory instead of re-reading.
+    mgdef_rows = list(_read_mgdef_rows(MGDEF_file_path))
+
+    # Pass 1: collect the CUIs that get an output row.
+    for val in _read_medgen_rows(MGSTY_file_path):
+        add_to_cui_set(val[0])
+    for val in _read_medgen_rows(NAMES_file_path):
+        if _is_flagged_n(val):
+            add_to_cui_set(val[0])
+    for val in mgdef_rows:
+        if _is_flagged_n(val):
+            add_to_cui_set(val[0])
+    for val in _read_medgen_rows(MedGenIDMappings_file_path):
+        add_to_cui_set(val[0])
+
+    # One record per CUI. Sorting makes the row order of the output files reproducible; plain
+    # set iteration order can change from run to run because str hashing is randomized.
+    records = {}
+    for cui in sorted(CUI_ID_SET):
+        record = copy.deepcopy(CSV_DICT)
+        record['CUI'] = cui
+        record['dcid'] = f"bio/{cui}"
+        drug_response = DCID_CUI_ASSOCIATE.get(cui)
+        if drug_response is not None:
+            for column in ('dcid_compound', 'dcid_atc_code', 'dcid_mesh'):
+                record[column] = drug_response[column]
+            record['is_drug_response'] = True
+        records[cui] = record
+
+    # Pass 2: fill the records from each file.
+    for val in _read_medgen_rows(MGSTY_file_path):
+        record = records.get(val[0])
+        if record is None:
+            continue
+        if len(val) > 3:
+            # NOTE(review): the prefix reads "MedGene", not "MedGen" - confirm against the schema.
+            record['STY'] = f"dcs:MedGeneSemanticType{get_pascal_case(val[3], ' ')}"
+        else:
+            print(f"Error at {val[0]} in MGSTY file")
+
+    for val in _read_medgen_rows(NAMES_file_path):
+        record = records.get(val[0]) if _is_flagged_n(val) else None
+        if record is None:
+            continue
+        name = val[1].replace('"', "'")
+        record['name'] = f'"{name}"'
+        if val[2] in SOURCE_DICT:
+            record['source'] = f'"{SOURCE_DICT[val[2]]}"'
+        else:
+            print(f"source {val[2]} not in SOURCE_DICT")
+
+    for val in mgdef_rows:
+        record = records.get(val[0]) if _is_flagged_n(val) else None
+        if record is None:
+            continue
+        if val[2] not in SOURCE_DEFINITION_DICT:
+            # Unchanged behaviour: a definition from an unmapped source is reported and skipped.
+            print(f"source_definition {val[2]} not in SOURCE_DEFINITION_DICT")
+            continue
+        record['source_definition'] = f'"{SOURCE_DEFINITION_DICT[val[2]]}"'
+        # Strip literal "\n" sequences and real newlines; the value is wrapped in '"' below.
+        def_str = val[1].replace("\\n", "").replace("\n", "").replace('"', "'")
+        record['DEF'] = f'"{def_str}"'
+
+    # Vocabulary name in MedGenIDMappings -> output column ('OMIM included' shares 'OMIM').
+    id_columns = {
+        'GARD': 'GARD', 'HPO': 'HPO', 'MONDO': 'MONDO', 'MeSH': 'MeSH', 'MedGen': 'MedGen',
+        'OMIM': 'OMIM', 'OMIM included': 'OMIM',
+        'OMIM Phenotypic Series': 'OMIM_Phenotypic_Series',
+        'OMIM Allelic Variant': 'OMIM_Allelic_Variant',
+        'Orphanet': 'Orphanet', 'SNOMEDCT_US': 'SNOMEDCT_US'
+    }
+    for val in _read_medgen_rows(MedGenIDMappings_file_path):
+        record = records.get(val[0])
+        column = id_columns.get(val[3]) if len(val) > 3 else None
+        if record is None or column is None:
+            continue
+        # MeSH ids get the "bio/" prefix; every other id is stored as given.
+        record[column] = f"bio/{val[2]}" if column == 'MeSH' else val[2]
+
+    # newline='' is what the csv module asks for, so that it controls the line endings itself.
+    with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file_csv, \
+            open(cui_dcid_mapping_file_path, 'w', newline='', encoding='utf-8') as cui_dcid_csv:
+        writer = csv.DictWriter(output_file_csv, list(CSV_DICT))
+        writer.writeheader()
+        # Rows carry every CSV_DICT column; the mapping file keeps only its own columns.
+        cui_writer = csv.DictWriter(cui_dcid_csv, list(CUI_DCID_MAPPING_DICT),
+                                    extrasaction="ignore")
+        cui_writer.writeheader()
+        for row in records.values():
+            writer.writerow(row)
+            cui_writer.writerow(row)
+
+
+def _read_medgen_rows(path: str):
+    """Yield the '|'-split fields of every non-blank line of `path` after its header line."""
+    with open(path, mode='r', encoding='utf-8') as f:
+        next(f, None)  # skip the header; the default keeps an empty file from raising
+        for line in f:
+            if line.strip():
+                yield line.split('|')
+
+
+def _read_mgdef_rows(path: str):
+    """Yield the '|'-split fields of every MGDEF record, re-joining records that span lines."""
+    pending = ''
+    with open(path, mode='r', encoding='utf-8') as f:
+        next(f, None)
+        for line in f:
+            pending += line
+            # A record is complete on the line that ends with '|'. Stripping the line break
+            # first also accepts CRLF files and a last line without a trailing newline.
+            if line.rstrip('\r\n').endswith('|'):
+                yield pending.split('|')
+                pending = ''
+
+
+def _is_flagged_n(val) -> bool:
+    """Return True when the row has a field 3 and it equals 'N'."""
+    return len(val) > 3 and val[3] == 'N'
+
+
+def add_to_cui_set(val: str) -> None:
+    """Add the CUI held in `val` to the module-level CUI_ID_SET.
+
+    Only the text before the first space is kept; a value without spaces is stored whole.
+    """
+    # The set is mutated in place, so no `global` declaration is required.
+    CUI_ID_SET.add(val.split(' ', 1)[0])
+
+
+def main():
+    """Resolve the MedGen input/output paths from the flags and run the conversion.
+
+    Relative --input_dir/--output_dir values are resolved against MODULE_DIR, absolute values
+    are used as given, and the output directory is created when it does not exist yet.
+    """
+    logging.set_verbosity('info')
+    logging.info("Started medgen process")
+    start = time.time()
+    # os.path.join drops MODULE_DIR for an absolute flag value and copes with an empty MODULE_DIR.
+    input_dir = os.path.join(MODULE_DIR, _FLAGS.input_dir)
+    output_dir = os.path.join(MODULE_DIR, _FLAGS.output_dir)
+    os.makedirs(output_dir, exist_ok=True)
+    inputs = [os.path.join(input_dir, name) for name in (
+        MGSTY_file_name, NAMES_file_name, MGDEF_file_name, MedGenIDMappings_file_name)]
+    output_file_path = os.path.join(output_dir, output_file_name)
+    cui_dcid_mapping_file_path = os.path.join(output_dir, cui_dcid_mappings_file_name)
+    main_process_csv(*inputs, output_file_path, cui_dcid_mapping_file_path)
+    print(f'Process completed in {round((time.time() - start)/60,2)} mins')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/biomedical/NCBI_dbSNP/split_files.sh b/scripts/biomedical/NCBI_dbSNP/split_files.sh
new file mode 100644
index 000000000..282a5308f
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/split_files.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Split each downloaded VCF input into line-count shards named <prefix><suffix>.vcf.
+set -euo pipefail  # stop at the first failing split instead of carrying on
+echo "File split GCF_000001405.25 started"
+split -l 15000000 input/GCF25/GCF_000001405.25.vcf input/GCF25/gcf25_shard_ --additional-suffix=.vcf
+echo "File split GCF_000001405.40 started"
+split -l 15000000 input/GCF40/GCF_000001405.40.vcf input/GCF40/gcf40_shard_ --additional-suffix=.vcf
+echo "File split freq started"
+split -l 30000000 input/freq/freq.vcf input/freq/freq_shard_ --additional-suffix=.vcf
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh37.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh37.tmcf
new file mode 100644
index 000000000..3b70c3290
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh37.tmcf
@@ -0,0 +1,20 @@
+Node: E:ncbi_GRCh37_genome_assembly_report->E1
+typeOf: schema:Quantity
+dcid: C:ncbi_GRCh37_genome_assembly_report->dcid_quantity
+name: C:ncbi_GRCh37_genome_assembly_report->name_quantity
+unitOfMeasure: dcs:BasePairs
+value:  C:ncbi_GRCh37_genome_assembly_report->Sequence-Length
+
+Node: E:ncbi_GRCh37_genome_assembly_report->E2
+typeOf: dcs:Chromosome
+dcid: C:ncbi_GRCh37_genome_assembly_report->dcid
+chromosomeSize: E:ncbi_GRCh37_genome_assembly_report->E1
+dateCreated: 2013-06-28
+dnaSequenceRole: C:ncbi_GRCh37_genome_assembly_report->Sequence-Role
+genBankAccession: C:ncbi_GRCh37_genome_assembly_report->GenBank-Accn
+genomeAssemblyUnitName: C:ncbi_GRCh37_genome_assembly_report->Assembly-Unit
+inChromosome: C:ncbi_GRCh37_genome_assembly_report->Assigned-Molecule
+inGenomeAssembly: dcs:bio/GCA_000001405.14
+ofSpecies: dcs:bio/HomoSapiens
+refSeqAccession: C:ncbi_GRCh37_genome_assembly_report->RefSeq-Accn
+synonym: C:ncbi_GRCh37_genome_assembly_report->Sequence-Name
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh38.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh38.tmcf
new file mode 100644
index 000000000..3c8bf2b32
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh38.tmcf
@@ -0,0 +1,20 @@
+Node: E:ncbi_GRCh38_genome_assembly_report->E1
+typeOf: schema:Quantity
+dcid: C:ncbi_GRCh38_genome_assembly_report->dcid_quantity
+name: C:ncbi_GRCh38_genome_assembly_report->name_quantity
+unitOfMeasure: dcs:BasePairs
+value:  C:ncbi_GRCh38_genome_assembly_report->Sequence-Length
+
+Node: E:ncbi_GRCh38_genome_assembly_report->E2
+typeOf: dcs:Chromosome
+dcid: C:ncbi_GRCh38_genome_assembly_report->dcid
+chromosomeSize: E:ncbi_GRCh38_genome_assembly_report->E1
+dateCreated: 2022-02-03
+dnaSequenceRole: C:ncbi_GRCh38_genome_assembly_report->Sequence-Role
+genBankAccession: C:ncbi_GRCh38_genome_assembly_report->GenBank-Accn
+genomeAssemblyUnitName: C:ncbi_GRCh38_genome_assembly_report->Assembly-Unit
+inChromosome: C:ncbi_GRCh38_genome_assembly_report->Assigned-Molecule
+inGenomeAssembly: dcs:bio/GCA_000001405.29
+ofSpecies: dcs:bio/HomoSapiens
+refSeqAccession: C:ncbi_GRCh38_genome_assembly_report->RefSeq-Accn
+synonym: C:ncbi_GRCh38_genome_assembly_report->Sequence-Name
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/MedGen.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/MedGen.tmcf
new file mode 100644
index 000000000..6194b7211
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/MedGen.tmcf
@@ -0,0 +1,22 @@
+Node: E:medgen->E1
+typeOf: dcs:MedGenConceptUniqueIdentifier
+dcid: C:medgen->dcid
+name: C:medgen->name
+atcCode: C:medgen->dcid_atc_code
+compoundID: C:medgen->dcid_compound
+conceptUniqueIdentifier: C:medgen->CUI
+description: C:medgen->DEF
+descriptionSource: C:medgen->source_definition
+geneticAndRareDiseasesID: C:medgen->GARD
+humanPhenotypeOntologyID: C:medgen->HPO
+mondoID: C:medgen->MONDO
+medicalSubjectHeadingID: C:medgen->MeSH
+medicalSubjectHeadingID: C:medgen->dcid_mesh
+omimAllelicVariantID: C:medgen->OMIM_Allelic_Variant
+omimID: C:medgen->OMIM
+omimPhenotypicSeriesID: C:medgen->OMIM_Phenotypic_Series
+orphaNumber: C:medgen->Orphanet
+snomedCT: C:medgen->SNOMEDCT_US
+source: C:medgen->source
+
+
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/clinvar_diesease_gene.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/clinvar_diesease_gene.tmcf
new file mode 100644
index 000000000..f2b1e1a77
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/clinvar_diesease_gene.tmcf
@@ -0,0 +1,17 @@
+Node: E:clinvar_disease_gene->E1
+typeOf: dcs:Disease
+dcid:C:clinvar_disease_gene->dcid_disease
+
+Node: E:clinvar_disease_gene->E2
+typeOf: dcs:Gene
+dcid:C:clinvar_disease_gene->dcid_gene
+
+Node: E:clinvar_disease_gene->E3
+typeOf: dcs:DiseaseGeneAssociation
+dcid: C:clinvar_disease_gene->dcid
+name: C:clinvar_disease_gene->name
+dateModified: C:clinvar_disease_gene->LastUpdated
+diseaseID: E:clinvar_disease_gene->E1
+geneID: E:clinvar_disease_gene->E2
+isCausal: C:clinvar_disease_gene->isCausal
+source: C:clinvar_disease_gene->sourceName
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_freq.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_freq.tmcf
new file mode 100644
index 000000000..6a4f046b7
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_freq.tmcf
@@ -0,0 +1,21 @@
+Node: E:dbsnp_freq->E1
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_freq->dcid_gv
+
+Node: E:dbsnp_freq->E2
+dcid: C:dbsnp_freq->dcid
+name: C:dbsnp_freq->name
+typeOf: dcs:GeneticVariantPopulationFrequency
+alleleFrequency: C:dbsnp_freq->alleleFrequency
+alternativeAllele: C:dbsnp_freq->alternativeAllele
+genotypeHeterozygousFrequency: C:dbsnp_freq->genotypeHeterozygousFrequency
+genotypeHomozygousAlternativeFrequency: C:dbsnp_freq->genotypeHomozygousAlternativeFrequency
+genotypeHomozygousReferenceFrequency: C:dbsnp_freq->genotypeHomozygousReferenceFrequency
+geneticVariantID: E:dbsnp_freq->E1
+hardyWeinbergEquationPValue: C:dbsnp_freq->hardyWeinbergEquationPValue
+isGlobalPopulation: C:dbsnp_freq->isGlobalPopulation
+measuredPopulation: C:dbsnp_freq->measuredPopulation
+measuredProperty: schema:frequency
+referenceAllele: C:dbsnp_freq->referenceAllele
+rsID: C:dbsnp_freq->rsID
+sampleSize: C:dbsnp_freq->sampleSize
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38.tmcf
new file mode 100644
index 000000000..6ba446f95
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38.tmcf
@@ -0,0 +1,38 @@
+Node: E:dbsnp_hg38->E1
+typeOf: dcs:GenomicPosition
+dcid: C:dbsnp_hg38->dcid_pos
+name: C:dbsnp_hg38->name_pos
+chrom: C:dbsnp_hg38->chrom
+inChromosome: C:dbsnp_hg38->chrom
+inGenomeAssembly: dcs:bio/GCA_000001405.29
+position: C:dbsnp_hg38->position
+
+Node: E:dbsnp_hg38->E2
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_hg38->dcid
+name: C:dbsnp_hg38->name
+alleleOrigin: C:dbsnp_hg38->alleleOrigin
+alternativeAllele: C:dbsnp_hg38->alternativeAllele
+dbSNPBuildID: C:dbsnp_hg38->dbSNPBuildID
+geneID: C:dbsnp_hg38->geneID
+geneID: C:dbsnp_hg38->geneID_2
+genotypesAvailable: C:dbsnp_hg38->genotypesAvailable
+hasNonSynonymousFrameShift: C:dbsnp_hg38->hasNonSynonymousFrameShift
+hasNonSynonymousMissenseMutation: C:dbsnp_hg38->hasNonSynonymousMissenseMutation
+hasNonSynonymousNonsenseMutation: C:dbsnp_hg38->hasNonSynonymousNonsenseMutation
+hasSynonymousMutation: C:dbsnp_hg38->hasSynonymousMutation
+hg38GenomicPosition: E:dbsnp_hg38->E1
+inGenomeAssembly: dcs:bio/GCA_000001405.29
+isCommonVariant: C:dbsnp_hg38->isCommonVariant
+isInAcceptorSpliceSite: C:dbsnp_hg38->isInAcceptorSpliceSite
+isInDonorSpliceSite: C:dbsnp_hg38->isInDonorSpliceSite
+isInIntron: C:dbsnp_hg38->isInIntron
+isInFivePrimeGeneRegion: C:dbsnp_hg38->isInFivePrimeGeneRegion
+isInFivePrimeUTR: C:dbsnp_hg38->isInFivePrimeUTR
+isInThreePrimeGeneRegion: C:dbsnp_hg38->isInThreePrimeGeneRegion
+isInThreePrimeUTR: C:dbsnp_hg38->isInThreePrimeUTR
+isPublished: C:dbsnp_hg38->isPublished
+referenceAllele: C:dbsnp_hg38->referenceAllele
+rsID: C:dbsnp_hg38->rsID
+suspectReasonCode: C:dbsnp_hg38->suspectReasonCode
+variantClass: C:dbsnp_hg38->variantClass
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_allele_disease_associations.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_allele_disease_associations.tmcf
new file mode 100644
index 000000000..9c5900449
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_allele_disease_associations.tmcf
@@ -0,0 +1,44 @@
+Node: E:dbsnp_hg38_allele_disease_associations->E1
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_hg38_allele_disease_associations->dcid
+
+Node: E:dbsnp_hg38_allele_disease_associations->E2
+typeOf: dcs:Allele
+dcid: C:dbsnp_hg38_allele_disease_associations->dcid_allele
+arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID:  C:dbsnp_hg38_allele_disease_associations->arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID
+geneticTestingRegistryID: C:dbsnp_hg38_allele_disease_associations->geneticTestingRegistryID
+geneticVariantID: E:dbsnp_hg38_allele_disease_associations->E1
+humanGeneMutationDatabaseID: C:dbsnp_hg38_allele_disease_associations->humanGeneMutationDatabaseID
+omimID: C:dbsnp_hg38_allele_disease_associations->omimID
+pharmGKBID: C:dbsnp_hg38_allele_disease_associations->pharmGKBID
+uniProtID: C:dbsnp_hg38_allele_disease_associations->uniProtID
+
+Node: E:dbsnp_hg38_allele_disease_associations->E3
+typeOf: dcs:Disease
+dcid: C:dbsnp_hg38_allele_disease_associations->dcid_disease
+name: C:dbsnp_hg38_allele_disease_associations->name_disease
+experimentalFactorOntologyID: C:dbsnp_hg38_allele_disease_associations->experimentalFactorOntologyID
+geneticReviewsID: C:dbsnp_hg38_allele_disease_associations->geneReviewsID
+humanPhenotypeOntologyID: C:dbsnp_hg38_allele_disease_associations->humanPhenotypeOntologyID
+medicalGeneticsSummariesID: C:dbsnp_hg38_allele_disease_associations->medicalGeneticsSummariesID
+medicalSubjectHeadingID: C:dbsnp_hg38_allele_disease_associations->medicalSubjectHeadingID
+officeOfRareDiseasesId: C:dbsnp_hg38_allele_disease_associations->officeOfRareDiseasesId
+omimID: C:dbsnp_hg38_allele_disease_associations->omimID
+orphaNumber: C:dbsnp_hg38_allele_disease_associations->orphaNumber
+snomedCT: C:dbsnp_hg38_allele_disease_associations->snomedCT
+umlsConceptUniqueIdentifier: C:dbsnp_hg38_allele_disease_associations->medGenID
+
+Node: E:dbsnp_hg38_allele_disease_associations->E4
+typeOf: dcs:DiseaseAlleleAssociation
+dcid: C:dbsnp_hg38_allele_disease_associations->dcid_disease_allele_association
+name:  C:dbsnp_hg38_allele_disease_associations->name_disease_allele_association
+alleleID: E:dbsnp_hg38_allele_disease_associations->E2
+alleleOrigin: C:dbsnp_hg38_allele_disease_associations->CLNORIGIN
+clinicalSignificance: C:dbsnp_hg38_allele_disease_associations->CLNSIG
+clinVarReviewStatus: C:dbsnp_hg38_allele_disease_associations->CLNREVSTAT
+diseaseID: E:dbsnp_hg38_allele_disease_associations->E3
+geneID: C:dbsnp_hg38_allele_disease_associations->geneID
+geneticVariantID: E:dbsnp_hg38_allele_disease_associations->E1
+geneticTestingRegistryID: C:dbsnp_hg38_allele_disease_associations->geneticTestingRegistryID
+pharmGKBID: C:dbsnp_hg38_allele_disease_associations->pharmGKBID
+referenceClinVarRecord: C:dbsnp_hg38_allele_disease_associations->CLNACC
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alleles.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alleles.tmcf
new file mode 100644
index 000000000..e55497a93
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alleles.tmcf
@@ -0,0 +1,11 @@
+Node: E:dbsnp_hg38_alleles->E1
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_hg38_alleles->dcid
+
+Node: E:dbsnp_hg38_alleles->E2
+typeOf: dcs:Allele
+dcid: C:dbsnp_hg38_alleles->dcid_allele
+name: C:dbsnp_hg38_alleles->name_allele
+geneticVariantID: E:dbsnp_hg38_alleles->E1
+hgvsNomenclature: C:dbsnp_hg38_alleles->CLNHGVS
+variant: C:dbsnp_hg38_alleles->variant
\ No newline at end of file
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf
new file mode 100644
index 000000000..3751da8fd
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf
@@ -0,0 +1,54 @@
+# Maps columns of the dbsnp_hg38_allele_drug_response_associations table to
+# four linked nodes: GeneticVariant (E1), Allele (E2), DrugResponse (E3) and
+# the DiseaseAlleleAssociation (E4) that references the other three.
+Node: E:dbsnp_hg38_allele_drug_response_associations->E1
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid
+
+# Allele of the variant in E1, with its cross-database identifiers.
+Node: E:dbsnp_hg38_allele_drug_response_associations->E2
+typeOf: dcs:Allele
+dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid_allele
+arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID: C:dbsnp_hg38_allele_drug_response_associations->arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID
+geneticTestingRegistryID: C:dbsnp_hg38_allele_drug_response_associations->geneticTestingRegistryID
+geneticVariantID: E:dbsnp_hg38_allele_drug_response_associations->E1
+humanGeneMutationDatabaseID: C:dbsnp_hg38_allele_drug_response_associations->humanGeneMutationDatabaseID
+omimID: C:dbsnp_hg38_allele_drug_response_associations->omimID
+pharmGKBID: C:dbsnp_hg38_allele_drug_response_associations->pharmGKBID
+uniProtID: C:dbsnp_hg38_allele_drug_response_associations->uniProtID
+
+# Drug response node, with its compound and ontology/database identifiers.
+# NOTE(review): `geneticReviewsID` is filled from the `geneReviewsID` column;
+# confirm the property name against the schema.
+Node: E:dbsnp_hg38_allele_drug_response_associations->E3
+typeOf: dcs:DrugResponse
+dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid_disease
+name: C:dbsnp_hg38_allele_drug_response_associations->name_disease
+compoundID: C:dbsnp_hg38_allele_drug_response_associations->compound_dcid
+experimentalFactorOntologyID: C:dbsnp_hg38_allele_drug_response_associations->experimentalFactorOntologyID
+geneticReviewsID: C:dbsnp_hg38_allele_drug_response_associations->geneReviewsID
+humanPhenotypeOntologyID: C:dbsnp_hg38_allele_drug_response_associations->humanPhenotypeOntologyID
+medicalGeneticsSummariesID: C:dbsnp_hg38_allele_drug_response_associations->medicalGeneticsSummariesID
+medicalSubjectHeadingID: C:dbsnp_hg38_allele_drug_response_associations->medicalSubjectHeadingID
+officeOfRareDiseasesId: C:dbsnp_hg38_allele_drug_response_associations->officeOfRareDiseasesId
+omimID: C:dbsnp_hg38_allele_drug_response_associations->omimID
+orphaNumber: C:dbsnp_hg38_allele_drug_response_associations->orphaNumber
+snomedCT: C:dbsnp_hg38_allele_drug_response_associations->snomedCT
+umlsConceptUniqueIdentifier: C:dbsnp_hg38_allele_drug_response_associations->medGenID
+
+# Association record referencing the allele (E2), drug response (E3) and variant (E1).
+Node: E:dbsnp_hg38_allele_drug_response_associations->E4
+typeOf: dcs:DiseaseAlleleAssociation
+dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid_disease_allele_association
+name: C:dbsnp_hg38_allele_drug_response_associations->name_disease_allele_association
+alleleID: E:dbsnp_hg38_allele_drug_response_associations->E2
+alleleOrigin: C:dbsnp_hg38_allele_drug_response_associations->CLNORIGIN
+clinicalSignificance: C:dbsnp_hg38_allele_drug_response_associations->CLNSIG
+clinVarReviewStatus: C:dbsnp_hg38_allele_drug_response_associations->CLNREVSTAT
+compoundID: C:dbsnp_hg38_allele_drug_response_associations->compound_dcid
+diseaseID: E:dbsnp_hg38_allele_drug_response_associations->E3
+geneID: C:dbsnp_hg38_allele_drug_response_associations->geneID
+geneticVariantID: E:dbsnp_hg38_allele_drug_response_associations->E1
+geneticTestingRegistryID: C:dbsnp_hg38_allele_drug_response_associations->geneticTestingRegistryID
+pharmGKBID: C:dbsnp_hg38_allele_drug_response_associations->pharmGKBID
+referenceClinVarRecord: C:dbsnp_hg38_allele_drug_response_associations->CLNACC
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_freq.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_freq.tmcf
new file mode 100644
index 000000000..79db8fe78
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_freq.tmcf
@@ -0,0 +1,16 @@
+# Maps columns of the dbsnp_hg38_freq table to a GeneticVariant node (E1)
+# and a GeneticVariantPopulationFrequency node (E2) that references it.
+Node: E:dbsnp_hg38_freq->E1
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_hg38_freq->dcid
+
+# Frequency record for the variant in E1 (alleleFrequency / measuredPopulation).
+Node: E:dbsnp_hg38_freq->E2
+typeOf: dcs:GeneticVariantPopulationFrequency
+dcid: C:dbsnp_hg38_freq->dcid_freq
+name: C:dbsnp_hg38_freq->name_freq
+alleleFrequency: C:dbsnp_hg38_freq->alleleFrequency
+geneticVariantID: E:dbsnp_hg38_freq->E1
+measuredPopulation: C:dbsnp_hg38_freq->measuredPopulation
+measuredProperty: schema:frequency
+rsID: C:dbsnp_hg38_freq->rsID
diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/hg19_positions.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/hg19_positions.tmcf
new file mode 100644
index 000000000..2820ff731
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/hg19_positions.tmcf
@@ -0,0 +1,18 @@
+# Maps columns of the dbsnp_hg19_positions table to a GenomicPosition node
+# (E1) and the GeneticVariant node (E2) that points at it.
+# E1 is tied to the fixed genome assembly bio/GCA_000001405.14.
+Node: E:dbsnp_hg19_positions->E1
+typeOf: dcs:GenomicPosition
+dcid: C:dbsnp_hg19_positions->dcid_pos
+name: C:dbsnp_hg19_positions->name_pos
+inChromosome: C:dbsnp_hg19_positions->inChromosome
+inGenomeAssembly: dcs:bio/GCA_000001405.14
+position: C:dbsnp_hg19_positions->position
+
+# Variant whose hg19GenomicPosition is E1.
+Node: E:dbsnp_hg19_positions->E2
+typeOf: dcs:GeneticVariant
+dcid: C:dbsnp_hg19_positions->dcid
+name: C:dbsnp_hg19_positions->name
+hg19GenomicPosition: E:dbsnp_hg19_positions->E1
+rsID: C:dbsnp_hg19_positions->rsID
diff --git a/scripts/biomedical/NCBI_dbSNP/tests.sh b/scripts/biomedical/NCBI_dbSNP/tests.sh
new file mode 100644
index 000000000..b98df873b
--- /dev/null
+++ b/scripts/biomedical/NCBI_dbSNP/tests.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: Pradeep Kumar Krishnaswamy
+# Date: 23/08/2024
+# Name: tests
+# Description: Runs the Data Commons Java import tool (genmcf) as a standard
+# check on the tMCF + CSV pairs of the NCBI dbSNP data import.
+# Usage: run from the directory that contains tMCFs/ and output/.
+# Exit status: 1 if the tool download or any genmcf run fails, otherwise 0.
+
+readonly TOOL_RELEASE="0.1-alpha.1k"
+readonly TOOL_JAR="datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar"
+readonly JAR_PATH="tmp/${TOOL_JAR}"
+
+# Download a fresh copy of the import tool into tmp/ and create the lint
+# output directory; abort if either step fails, since no check could run.
+rm -rf tmp
+mkdir -p tmp lint || exit 1
+wget -P tmp "https://github.com/datacommonsorg/import/releases/download/${TOOL_RELEASE}/${TOOL_JAR}" || exit 1
+
+# Run every check even when an earlier one exits non-zero, remembering any failure.
+status=0
+
+java -jar "${JAR_PATH}" genmcf tMCFs/GRCh37.tmcf output/ncbi_GRCh37_genome_assembly_report.csv -n 20 -o lint/GRCh37 || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/GRCh38.tmcf output/ncbi_GRCh38_genome_assembly_report.csv -n 20 -o lint/GRCh38 || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/MedGen.tmcf output/medgen.csv -n 20 -o lint/medgen || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/clinvar_diesease_gene.tmcf output/clinvar_diesease_gene.csv -n 20 -o lint/clinvar_diesease_gene || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/hg19_positions.tmcf output/GCF25/*.csv -n 20 -o lint/GCF25 || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/dbsnp_freq.tmcf output/freq/*.csv -n 20 -o lint/freq || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/dbsnp_hg38.tmcf output/GCF40/hg38/*.csv -n 20 -o lint/GCF40/hg38 || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/dbsnp_hg38_alleles.tmcf output/GCF40/hg38alleles/*.csv -n 20 -o lint/GCF40/hg38alleles || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/dbsnp_hg38_allele_disease_associations.tmcf output/GCF40/hg38alleledisease/*.csv -n 20 -o lint/GCF40/hg38alleledisease || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf output/GCF40/hg38alleledrug/*.csv -n 20 -o lint/GCF40/hg38alleledrug || status=1
+
+java -jar "${JAR_PATH}" genmcf tMCFs/dbsnp_hg38_freq.tmcf output/GCF40/hg38freq/*.csv -n 20 -o lint/GCF40/hg38freq || status=1
+
+exit "${status}"