From 3946c2cca4016a93a84d073ac77ef1f064151c70 Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 8 Jan 2024 15:49:59 -0600 Subject: [PATCH 1/8] first attempt at script --- .../ultracool_sheet/generate_simple_links.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 scripts/ultracool_sheet/generate_simple_links.py diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py new file mode 100644 index 000000000..e08cba692 --- /dev/null +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -0,0 +1,81 @@ +# from scripts.ingests.ingest_utils import * +from scripts.ingests.utils import load_simpledb, find_source_in_db +from astropy.io import ascii +import urllib.request +from astropy.table import Table + + +RECREATE_DB = False +db = load_simpledb("SIMPLE.sqlite", recreatedb=RECREATE_DB) + +# Load Ultracool sheet +sheet_id = "1i98ft8g5mzPp2DNno0kcz4B9nzMxdpyz5UquAVhz-U8" +link = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" +link = "scripts/ultracool_sheet/UltracoolSheet - Main_010824.csv" + +# read the csv data into an astropy table +uc_sheet_table = ascii.read( + link, + format="csv", + data_start=1, + header_start=0, + guess=False, + fast_reader=False, + delimiter=",", +) + +# Match sources in Ultracool sheet to sources in SIMPLE +uc_names = [] +simple_urls = [] +simple_sources = [] +for source in uc_sheet_table[0:100]: + uc_sheet_name = source["name"] + match = find_source_in_db( + db, + uc_sheet_name, + ra=source["ra_j2000_formula"], + dec=source["dec_j2000_formula"], + ) + + # convert SIMPLE source name to URL + if len(match) == 0: + print("No match found for ", uc_sheet_name) + raise ValueError + elif len(match) > 1: + print("Multiple matches found for ", uc_sheet_name) + raise ValueError + elif len(match) == 1: + simple_source = match[0] + print(f"Match found for {uc_sheet_name}: {simple_source}") + else: + raise ValueError("Unexpected state") + + # URLify source name + source_url = simple_source.strip().replace(" ", "%21") + url = "https://simple-bd-archive.org/load_solo/" + source_url + # u rl = "https://simple-bd-archive.org/solo_result/" + source_url + + # TODO: THIS DOESN'T WORK!!! Even bad URLs return 200 + # test the URL + url_status = 200 + # url_status = urllib.request.urlopen(url).getcode() + if url_status != 200: + raise ValueError("URL not valid for ", uc_sheet_name, simple_source, url) + else: + print("URL valid for ", uc_sheet_name, simple_source, url) + + uc_names.append(uc_sheet_name) + simple_sources.append(simple_source) + simple_urls.append(url) + +# write the results to a file +results_table = Table( + [uc_names, simple_sources, simple_urls], + names=["Ultracool Sheet Name", "SIMPLE Source Name", "SIMPLE URL"], +) +results_table.write( + "scripts/ultracool_sheet/uc_sheet_simple_urls.csv", + delimiter=",", + overwrite=True, + format="ascii.ecsv", +) From d241d0dba6ad9c93c323f849b046f9004558e0eb Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 8 Jan 2024 16:03:39 -0600 Subject: [PATCH 2/8] code cleanup --- scripts/ultracool_sheet/generate_simple_links.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index e08cba692..76e42bbea 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -10,7 +10,7 @@ # Load Ultracool sheet sheet_id = "1i98ft8g5mzPp2DNno0kcz4B9nzMxdpyz5UquAVhz-U8" -link = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" +# link = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" link = "scripts/ultracool_sheet/UltracoolSheet - Main_010824.csv" # read the csv data into an astropy table @@ -39,11 +39,11 @@ # convert SIMPLE source name to URL if len(match) == 0: - print("No match found for ", uc_sheet_name) - raise ValueError + msg = f"No match found for {uc_sheet_name}" + raise ValueError(msg) elif len(match) > 1: - print("Multiple matches found for ", uc_sheet_name) - raise ValueError + msg = f"Multiple matches found for {uc_sheet_name}" + raise ValueError(msg) elif len(match) == 1: simple_source = match[0] print(f"Match found for {uc_sheet_name}: {simple_source}") From 36de01d9d7c1a5a19beaf54197cbdecd74d63978 Mon Sep 17 00:00:00 2001 From: kelle Date: Tue, 9 Jan 2024 15:00:55 -0600 Subject: [PATCH 3/8] changes based on Wills comments --- scripts/ultracool_sheet/generate_simple_links.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index 76e42bbea..1193a0b97 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -1,7 +1,8 @@ # from scripts.ingests.ingest_utils import * from scripts.ingests.utils import load_simpledb, find_source_in_db from astropy.io import ascii -import urllib.request +from urllib.parse import quote +import requests from astropy.table import Table @@ -51,14 +52,13 @@ raise ValueError("Unexpected state") # URLify source name - source_url = simple_source.strip().replace(" ", "%21") - url = "https://simple-bd-archive.org/load_solo/" + source_url - # u rl = "https://simple-bd-archive.org/solo_result/" + source_url + source_url = quote(simple_source) + url = "https://simple-bd-archive.org/solo_result/" + source_url # TODO: THIS DOESN'T WORK!!! Even bad URLs return 200 # test the URL - url_status = 200 - # url_status = urllib.request.urlopen(url).getcode() + # url_status = 200 + url_status = requests.get(url).status_code if url_status != 200: raise ValueError("URL not valid for ", uc_sheet_name, simple_source, url) else: From 05a9a746a214a98e79b7e7c41ad95c098acbb41b Mon Sep 17 00:00:00 2001 From: kelle Date: Wed, 10 Jan 2024 00:27:26 -0600 Subject: [PATCH 4/8] url checking is real now --- scripts/ultracool_sheet/generate_simple_links.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index 1193a0b97..1c1c160a6 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -1,4 +1,3 @@ -# from scripts.ingests.ingest_utils import * from scripts.ingests.utils import load_simpledb, find_source_in_db from astropy.io import ascii from urllib.parse import quote @@ -29,7 +28,7 @@ uc_names = [] simple_urls = [] simple_sources = [] -for source in uc_sheet_table[0:100]: +for source in uc_sheet_table: uc_sheet_name = source["name"] match = find_source_in_db( db, @@ -55,9 +54,7 @@ source_url = quote(simple_source) url = "https://simple-bd-archive.org/solo_result/" + source_url - # TODO: THIS DOESN'T WORK!!! Even bad URLs return 200 - # test the URL - # url_status = 200 + # test the URL to make sure it is valid url_status = requests.get(url).status_code if url_status != 200: raise ValueError("URL not valid for ", uc_sheet_name, simple_source, url) From c26620f2665156aca4983f193b66ffeaa716b429 Mon Sep 17 00:00:00 2001 From: kelle Date: Wed, 10 Jan 2024 12:43:37 -0600 Subject: [PATCH 5/8] Using `head` instead of `get` to check URLs. FASTER. --- scripts/ultracool_sheet/generate_simple_links.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index 1c1c160a6..53ba50311 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -28,7 +28,7 @@ uc_names = [] simple_urls = [] simple_sources = [] -for source in uc_sheet_table: +for source in uc_sheet_table[1647:]: uc_sheet_name = source["name"] match = find_source_in_db( db, @@ -55,12 +55,14 @@ url = "https://simple-bd-archive.org/solo_result/" + source_url # test the URL to make sure it is valid - url_status = requests.get(url).status_code + url_status = requests.head(url).status_code if url_status != 200: raise ValueError("URL not valid for ", uc_sheet_name, simple_source, url) else: print("URL valid for ", uc_sheet_name, simple_source, url) + # ('URL not valid for ', 'AB Pic b', 'HD 44627B', 'https://simple-bd-archive.org/solo_result/HD%2044627B') + uc_names.append(uc_sheet_name) simple_sources.append(simple_source) simple_urls.append(url) From c2d51bc63f05c7fad27a49be7225593dccb3a27a Mon Sep 17 00:00:00 2001 From: kelle Date: Wed, 10 Jan 2024 16:32:49 -0600 Subject: [PATCH 6/8] better URL testing --- .../ultracool_sheet/generate_simple_links.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index 53ba50311..92e865fcf 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -1,4 +1,4 @@ -from scripts.ingests.utils import load_simpledb, find_source_in_db +from scripts.ingests.utils import load_simpledb, find_source_in_db, logger, SimpleError from astropy.io import ascii from urllib.parse import quote import requests @@ -28,7 +28,7 @@ uc_names = [] simple_urls = [] simple_sources = [] -for source in uc_sheet_table[1647:]: +for source in uc_sheet_table[2368:]: uc_sheet_name = source["name"] match = find_source_in_db( db, @@ -40,15 +40,19 @@ # convert SIMPLE source name to URL if len(match) == 0: msg = f"No match found for {uc_sheet_name}" - raise ValueError(msg) + logger.error(msg) + raise SimpleError(msg) elif len(match) > 1: msg = f"Multiple matches found for {uc_sheet_name}" - raise ValueError(msg) + logger.error(msg) + raise SimpleError(msg) elif len(match) == 1: simple_source = match[0] - print(f"Match found for {uc_sheet_name}: {simple_source}") + logger.info(f"Match found for {uc_sheet_name}: {simple_source}") else: - raise ValueError("Unexpected state") + msg = f"Unexpected state for {uc_sheet_name}" + logger.error(msg) + raise SimpleError(msg) # URLify source name source_url = quote(simple_source) @@ -56,12 +60,18 @@ # test the URL to make sure it is valid url_status = requests.head(url).status_code - if url_status != 200: - raise ValueError("URL not valid for ", uc_sheet_name, simple_source, url) + if url_status == 404: + msg = f"URL not valid for {uc_sheet_name} {simple_source} at {url}" + logger.error(msg) + raise SimpleError(msg) + elif url_status != 200: + logger.warning(f"URL not valid for {uc_sheet_name} {simple_source} at {url} + but with HTTP status {url_status}") else: - print("URL valid for ", uc_sheet_name, simple_source, url) + logger.info(f"URL valid for {uc_sheet_name} {simple_source} at {url}") # ('URL not valid for ', 'AB Pic b', 'HD 44627B', 'https://simple-bd-archive.org/solo_result/HD%2044627B') + # ('URL not valid for ', '2MASSI J1707333+430130', '2MASS J17073334+4301304', 'https://simple-bd-archive.org/solo_result/2MASS%20J17073334%2B4301304') uc_names.append(uc_sheet_name) simple_sources.append(simple_source) From e1af602c6f42eeb476661b76ff25e84bf21a5972 Mon Sep 17 00:00:00 2001 From: kelle Date: Wed, 10 Jan 2024 17:53:27 -0600 Subject: [PATCH 7/8] match new UCS --- scripts/ultracool_sheet/generate_simple_links.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index 92e865fcf..0e3747dad 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -12,6 +12,7 @@ sheet_id = "1i98ft8g5mzPp2DNno0kcz4B9nzMxdpyz5UquAVhz-U8" # link = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" link = "scripts/ultracool_sheet/UltracoolSheet - Main_010824.csv" +link = "UltracoolSheet - PRIVATE - Sheet26" # read the csv data into an astropy table uc_sheet_table = ascii.read( @@ -28,7 +29,7 @@ uc_names = [] simple_urls = [] simple_sources = [] -for source in uc_sheet_table[2368:]: +for source in uc_sheet_table[2390:]: uc_sheet_name = source["name"] match = find_source_in_db( db, @@ -62,17 +63,16 @@ url_status = requests.head(url).status_code if url_status == 404: msg = f"URL not valid for {uc_sheet_name} {simple_source} at {url}" - logger.error(msg) - raise SimpleError(msg) + logger.error(msg) + raise SimpleError(msg) elif url_status != 200: - logger.warning(f"URL not valid for {uc_sheet_name} {simple_source} at {url} - but with HTTP status {url_status}") + logger.warning( + f"URL not valid for {uc_sheet_name} {simple_source} at {url} \ + but with HTTP status {url_status}" + ) else: logger.info(f"URL valid for {uc_sheet_name} {simple_source} at {url}") - # ('URL not valid for ', 'AB Pic b', 'HD 44627B', 'https://simple-bd-archive.org/solo_result/HD%2044627B') - # ('URL not valid for ', '2MASSI J1707333+430130', '2MASS J17073334+4301304', 'https://simple-bd-archive.org/solo_result/2MASS%20J17073334%2B4301304') - uc_names.append(uc_sheet_name) simple_sources.append(simple_source) simple_urls.append(url) From c97adaa8fcc67fb67ca0a5ae741cc7ff21ef7eb3 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 11 Jan 2024 11:43:40 -0600 Subject: [PATCH 8/8] keep track of multiple and no matches --- .../ultracool_sheet/generate_simple_links.py | 84 +++++++++++++------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/scripts/ultracool_sheet/generate_simple_links.py b/scripts/ultracool_sheet/generate_simple_links.py index 0e3747dad..b7e388ca1 100644 --- a/scripts/ultracool_sheet/generate_simple_links.py +++ b/scripts/ultracool_sheet/generate_simple_links.py @@ -5,14 +5,31 @@ from astropy.table import Table +def check_simple_url(url, uc_sheet_name, simple_source): + # test the URL to make sure it is valid + url_status = requests.head(url).status_code + if url_status == 404: + msg = f"URL not valid for {uc_sheet_name} {simple_source} at {url}" + logger.error(msg) + raise SimpleError(msg) + elif url_status != 200: + logger.warning( + f"URL not valid for {uc_sheet_name} {simple_source} at {url} \ + but with HTTP status {url_status}" + ) + else: + logger.info(f"URL valid for {uc_sheet_name} {simple_source} at {url}") + return url + + RECREATE_DB = False db = load_simpledb("SIMPLE.sqlite", recreatedb=RECREATE_DB) # Load Ultracool sheet sheet_id = "1i98ft8g5mzPp2DNno0kcz4B9nzMxdpyz5UquAVhz-U8" # link = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" -link = "scripts/ultracool_sheet/UltracoolSheet - Main_010824.csv" -link = "UltracoolSheet - PRIVATE - Sheet26" +# link = "scripts/ultracool_sheet/UltracoolSheet - Main_010824.csv" +link = "scripts/ultracool_sheet/UltracoolSheet - PRIVATE - Sheet26.csv" # read the csv data into an astropy table uc_sheet_table = ascii.read( @@ -29,7 +46,9 @@ uc_names = [] simple_urls = [] simple_sources = [] -for source in uc_sheet_table[2390:]: +no_match = [] +multiple_matches = [] +for source in uc_sheet_table: uc_sheet_name = source["name"] match = find_source_in_db( db, @@ -42,40 +61,34 @@ if len(match) == 0: msg = f"No match found for {uc_sheet_name}" logger.error(msg) - raise SimpleError(msg) + no_match.append(uc_sheet_name) + # raise SimpleError(msg) + continue elif len(match) > 1: msg = f"Multiple matches found for {uc_sheet_name}" logger.error(msg) - raise SimpleError(msg) + multiple_matches.append(uc_sheet_name) + # raise SimpleError(msg) + continue elif len(match) == 1: simple_source = match[0] logger.info(f"Match found for {uc_sheet_name}: {simple_source}") - else: - msg = f"Unexpected state for {uc_sheet_name}" - logger.error(msg) - raise SimpleError(msg) + uc_names.append(uc_sheet_name) + simple_sources.append(simple_source) - # URLify source name - source_url = quote(simple_source) - url = "https://simple-bd-archive.org/solo_result/" + source_url + # URLify source name + source_url = quote(simple_source) + url = "https://simple-bd-archive.org/solo_result/" + source_url - # test the URL to make sure it is valid - url_status = requests.head(url).status_code - if url_status == 404: - msg = f"URL not valid for {uc_sheet_name} {simple_source} at {url}" + # check the URL + good_url = check_simple_url(url, uc_sheet_name, simple_source) + simple_urls.append(good_url) + + else: + msg = f"Unexpected state for {uc_sheet_name}" logger.error(msg) raise SimpleError(msg) - elif url_status != 200: - logger.warning( - f"URL not valid for {uc_sheet_name} {simple_source} at {url} \ - but with HTTP status {url_status}" - ) - else: - logger.info(f"URL valid for {uc_sheet_name} {simple_source} at {url}") - uc_names.append(uc_sheet_name) - simple_sources.append(simple_source) - simple_urls.append(url) # write the results to a file results_table = Table( @@ -88,3 +101,22 @@ overwrite=True, format="ascii.ecsv", ) +no_match_table = Table([no_match], names=["No Match"]) +no_match_table.write( + "scripts/ultracool_sheet/uc_sheet_no_match.csv", + delimiter=",", + overwrite=True, + format="ascii.ecsv", +) +multiple_matches_table = Table([multiple_matches], names=["Multiple Matches"]) +multiple_matches_table.write( + "scripts/ultracool_sheet/uc_sheet_multiple_matches.csv", + delimiter=",", + overwrite=True, + format="ascii.ecsv", +) +print(f"Number of no matches {len(no_match_table)}") +print(f"Number of multiple matches {len(multiple_matches_table)}") +# Number of no matches 860 +# Number of multiple matches 2 +# DENIS J220002.0-303832A = DENIS-P J220002.05-303832.9