mongoexport -o ${TARGET}/incidents.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=incidents --fields=incident_id,date,reports,Alleged deployer of AI system,Alleged developer of AI system,Alleged harmed or nearly harmed parties,description,title
mongoexport -o ${TARGET}/duplicates.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=duplicates --fields=duplicate_incident_number,true_incident_number
mongoexport -o ${TARGET}/quickadd.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=quickadd --fields=incident_id,url,date_submitted,source_domain
mongoexport -o ${TARGET}/submissions.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=submissions --fields=authors,date_downloaded,date_modified,date_published,date_submitted,image_url,incident_date,incident_id,language,mongodb_id,source_domain,submitters,text,title,url

###### Begin Reports CSV Export ######
# Export `reports` to JSON first, then convert to CSV with a helper script.
# (mongoexport's native CSV output mangles embedded newlines/quotes in text fields.)

JSON_FILE="${TARGET}/reports.json"
CSV_FILE="${TARGET}/reports.csv"
# NOTE(review): restored incident_id — the mongoexport CSV line this replaces
# exported it, and dropping it silently would break downstream consumers.
FIELDS="_id,incident_id,authors,date_downloaded,date_modified,date_published,date_submitted,description,epoch_date_downloaded,epoch_date_modified,epoch_date_published,epoch_date_submitted,image_url,language,ref_number,report_number,source_domain,submitters,text,title,url,tags"
mongoexport --uri="${MONGODB_URI}/${MONGODB_DBNAME}" --collection=reports --out="${JSON_FILE}" --jsonArray --jsonFormat=relaxed --fields="${FIELDS}"
# NOTE(review): assumes the script is run from site/db-backup/bin — confirm,
# or use "$(dirname "$0")/convert_json_to_csv.py" to be CWD-independent.
python3 convert_json_to_csv.py "${JSON_FILE}" "${CSV_FILE}" "${FIELDS}"
# Quoted to be safe if TARGET ever contains spaces or glob characters.
rm -f "${JSON_FILE}"

###### End Reports CSV Export ######
"""Convert a mongoexport --jsonArray dump into a CSV file.

Used by backup.sh to produce reports.csv from reports.json, because
mongoexport's native CSV output does not handle long text fields well.
"""
import json
import csv
import sys


def json_to_csv(json_file, csv_file, fields=None):
    """Write the JSON array in *json_file* to *csv_file* as CSV.

    Args:
        json_file: Path to a file containing a JSON array of objects
            (mongoexport --jsonArray output, relaxed format).
        csv_file: Destination CSV path; overwritten if it exists.
        fields: Optional comma-separated column list. When None, the
            columns are the union of all keys seen in the data, in
            first-seen order.

    Exits the process with status 1 when the JSON array is empty.
    """
    with open(json_file, 'r', encoding='utf-8') as f_json:
        data = json.load(f_json)

    if not data:
        print(f"The JSON file '{json_file}' is empty.")
        sys.exit(1)

    if fields is None:
        # Union of all keys, preserving first-seen order so repeated runs
        # produce the same column layout (a plain set would be unordered).
        seen = {}
        for entry in data:
            seen.update(dict.fromkeys(entry.keys()))
        headers = list(seen)
    else:
        headers = fields.split(',')

    with open(csv_file, 'w', newline='', encoding='utf-8') as f_csv:
        writer = csv.writer(f_csv, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL, escapechar='\\',
                            lineterminator='\n')

        writer.writerow(headers)

        for entry in data:
            row = []
            for header in headers:
                value = entry.get(header, '')
                if isinstance(value, list):
                    # Flatten arrays (authors, submitters, tags) into a
                    # comma-separated string.
                    value = ', '.join(map(str, value))
                elif isinstance(value, dict) and '$date' in value:
                    # Extended-JSON date wrapper -> plain timestamp string.
                    value = value['$date']
                elif isinstance(value, dict) and '$oid' in value:
                    # Extended-JSON ObjectId wrapper -> plain hex string.
                    value = value['$oid']
                row.append(value)
            writer.writerow(row)

    print(f"CSV file '{csv_file}' generated successfully.")


if __name__ == "__main__":
    if len(sys.argv) < 3:
        # Fixed: the original usage string omitted the two required arguments.
        print("Usage: python convert_json_to_csv.py <json_file> <csv_file> [fields]")
        sys.exit(1)

    json_file = sys.argv[1]
    csv_file = sys.argv[2]
    fields = sys.argv[3] if len(sys.argv) > 3 else None

    json_to_csv(json_file, csv_file, fields)