diff --git a/README.md b/README.md
index 21fee24..3ce8af6 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,13 @@
 # Linkscraper
+
+
+
+Requirements:
+
+* Python >= 3.6 ([Download](https://www.python.org/downloads/))
+
+
 
 Clone this repository.
 
 ```shell
@@ -226,7 +234,11 @@ python linkscraper -u https://example.com -a get-plugins -p screenshot -f screen
 
 ## Changelog
 
-> Current version: ``2.0.1``
+> Current version: ``2.0.2``
+
+Minor changes
+
+* Code refactoring
 
 Fixes
 
@@ -279,6 +291,11 @@ Plugins added
 * rich
 * python-decouple
 
+## Roadmap
+
+* [ ] Implement a key-value micro database ([TinyDB](https://tinydb.readthedocs.io/en/latest/)-like)
+* [ ] List possible PDF files found at a URL
+
 ## License
 
 Code licensed under [MIT License](https://github.com/kremilly/linkscraper/blob/main/LICENSE)
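The roadmap's key-value store could start as small as the sketch below, purely illustrative; the `KVStore` class is hypothetical and not part of the codebase, shown only to pin down what "TinyDB-like" might mean here.

```python
import json
import os

class KVStore:
    """Minimal TinyDB-style JSON key-value store (illustrative sketch only)."""

    def __init__(self, path="linkscraper_db.json"):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            with open(path, encoding="utf-8") as f:
                self.data = json.load(f)

    def set(self, key, value):
        # Persist on every write; fine for a micro database of this size.
        self.data[key] = value
        with open(self.path, "w", encoding="utf-8") as f:
            json.dump(self.data, f, indent=2)

    def get(self, key, default=None):
        return self.data.get(key, default)

db = KVStore()
db.set("last_scan", "https://example.com")
print(db.get("last_scan"))
```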
diff --git a/__main__.py b/__main__.py
index 35e6ea5..005c289 100644
--- a/__main__.py
+++ b/__main__.py
@@ -8,7 +8,7 @@
 
 console = Console(record=True)
 
-version = "2.0.1"
+VERSION = "2.0.2"
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-u", "--url", help="URL to scan", required=True)
@@ -21,7 +21,7 @@
 parser.add_argument("-b", "--browser", help="Set browser to take screenshot", required=False)
 parser.add_argument("-t", "--title", help="Set title the screenshot on Imgur", required=False)
 parser.add_argument("-ssc", "--show-status-code", help="Show status code", required=False, default="false")
-parser.add_argument("-version", "--version", help="Show current version", action="version", version=version)
+parser.add_argument("-version", "--version", help="Show current version", action="version", version=VERSION)
 parser.add_argument("-k", "--key", help="Set the API key to use an plugin that is needs this", required=False)
 parser.add_argument("-smf", "--show-minify-files", help="Show only minify files", required=False, default="false")
 parser.add_argument("-oel", "--only-external-links", help="Show only external links", required=False, default="false")
@@ -31,15 +31,15 @@
 if __name__ == "__main__":
     BASE_URL = args.url
 
-    if isURL(BASE_URL) != True:
+    if not is_url(BASE_URL):
         console.print("[bold red]Error: URL is missing[/bold red]")
         sys.exit(1)
 
-    if check_connection(BASE_URL) != True:
+    if not check_connection(BASE_URL):
         console.print("[bold red]Error: connection is not established")
         sys.exit(1)
 
-    run_home(BASE_URL, version)
+    run_home(BASE_URL, VERSION)
 
     if not args.action or args.action == "get-core" or args.action == "core":
         run_core(BASE_URL)
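A quick sanity check of the renamed `is_url` validator used above. This is a hypothetical snippet, not part of the patch; it assumes `is_url` applies the two regex patterns shown later in utils/utils.py, with `check_protocol` selecting between them.

```python
# Hypothetical smoke test; assumes utils.utils.is_url keeps the
# regex-based behaviour shown in the utils/utils.py hunk below.
from utils.utils import is_url

assert is_url("https://example.com/page")            # full URL with protocol
assert not is_url("not a url")                       # spaces never match
assert is_url("example.com", check_protocol=False)   # protocol made optional
```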
diff --git a/core/download_files.py b/core/download_files.py
index 4d92c0c..a1c5000 100644
--- a/core/download_files.py
+++ b/core/download_files.py
@@ -32,7 +32,7 @@ def download_js(url, minify_files, filter_data):
     table.add_column("Size", style="blue")
     table.add_column("Status")
 
-    createFolder(path)
+    create_folder(path)
 
     for script in soup.find_all("script"):
         if script.attrs.get("src"):
@@ -50,7 +50,7 @@ def download_js(url, minify_files, filter_data):
     for script_url in list(set(links)):
         text = requests.get(script_url).text
 
-        file_name = path + getRemoteFileName(script_url)
+        file_name = path + get_remote_file_name(script_url)
 
         with open(file_name, 'w', encoding="utf-8") as f:
             f.write(text)
@@ -58,9 +58,9 @@ def download_js(url, minify_files, filter_data):
         total_files += 1
 
         if os.path.exists(file_name):
-            table.add_row(getRemoteFileName(script_url), script_url, localFileSize(file_name), "[bold green]Download completed[/bold green]")
+            table.add_row(get_remote_file_name(script_url), script_url, local_file_size(file_name), "[bold green]Download completed[/bold green]")
         else:
-            table.add_row(getRemoteFileName(script_url), script_url, localFileSize(file_name), "[bold red]Download failed[/bold red]")
+            table.add_row(get_remote_file_name(script_url), script_url, local_file_size(file_name), "[bold red]Download failed[/bold red]")
 
     path = os.path.realpath(path)
     os.startfile(path)
@@ -87,7 +87,7 @@ def download_css(url, minify_files, filter_data):
     table.add_column("Size", style="blue")
     table.add_column("Status")
 
-    createFolder(path)
+    create_folder(path)
 
     for css in soup.find_all("link"):
         if css.attrs.get("href"):
@@ -106,7 +106,7 @@ def download_css(url, minify_files, filter_data):
     for css_url in list(set(links)):
         text = requests.get(css_url).text
 
-        file_name = path + getRemoteFileName(css_url)
+        file_name = path + get_remote_file_name(css_url)
 
         with open(file_name, 'w', encoding="utf-8") as f:
             f.write(text)
@@ -114,9 +114,9 @@ def download_css(url, minify_files, filter_data):
         total_files += 1
 
         if os.path.exists(file_name):
-            table.add_row(getRemoteFileName(css_url), css_url, localFileSize(file_name), "[bold green]Download completed[/bold green]")
+            table.add_row(get_remote_file_name(css_url), css_url, local_file_size(file_name), "[bold green]Download completed[/bold green]")
         else:
-            table.add_row(getRemoteFileName(css_url), css_url, localFileSize(file_name), "[bold red]Download failed[/bold red]")
+            table.add_row(get_remote_file_name(css_url), css_url, local_file_size(file_name), "[bold red]Download failed[/bold red]")
 
     path = os.path.realpath(path)
     os.startfile(path)
@@ -143,7 +143,7 @@ def download_images(url, filter_data):
     table.add_column("Size", style="blue")
     table.add_column("Status", style="bold green")
 
-    createFolder(path)
+    create_folder(path)
 
     for img in soup.find_all("img"):
         img_url = urljoin(url, img.attrs.get("src"))
@@ -156,7 +156,7 @@ def download_images(url, filter_data):
     for img_url in list(set(links)):
         img_data = requests.get(img_url).content
 
-        file_name = path + getRemoteFileName(img_url)
+        file_name = path + get_remote_file_name(img_url)
 
         with open(file_name, 'wb') as handler:
             handler.write(img_data)
@@ -164,9 +164,9 @@ def download_images(url, filter_data):
         total_files += 1
 
         if os.path.exists(file_name):
-            table.add_row(getRemoteFileName(img_url), img_url, localFileSize(file_name), "[bold green]Download completed[/bold green]")
+            table.add_row(get_remote_file_name(img_url), img_url, local_file_size(file_name), "[bold green]Download completed[/bold green]")
         else:
-            table.add_row(getRemoteFileName(img_url), img_url, localFileSize(file_name), "[bold red]Download failed[/bold red]")
+            table.add_row(get_remote_file_name(img_url), img_url, local_file_size(file_name), "[bold red]Download failed[/bold red]")
 
     path = os.path.realpath(path)
     os.startfile(path)
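One caveat these hunks leave untouched: `os.startfile` in the context lines above only exists on Windows, so the download commands would raise `AttributeError` on Linux and macOS when opening the output folder. A portable replacement could look like the sketch below; it is not part of this patch, and the `open_folder` helper name is hypothetical.

```python
import os
import subprocess
import sys

def open_folder(path):
    """Open a folder in the OS file manager, portably."""
    path = os.path.realpath(path)
    if sys.platform == "win32":
        os.startfile(path)                                # Windows only
    elif sys.platform == "darwin":
        subprocess.run(["open", path], check=False)       # macOS Finder
    else:
        subprocess.run(["xdg-open", path], check=False)   # Linux desktops
```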
diff --git a/core/scraper.py b/core/scraper.py
index 193c58f..438c7a4 100644
--- a/core/scraper.py
+++ b/core/scraper.py
@@ -31,14 +31,14 @@ def get_links(url, external_links, status_code, filter_data):
     for link in soup.find_all('a'):
         if link.get('href') != None:
             if filter_data:
-                if isURL(link.get('href')) and find(link.get('href'), filter_data):
+                if is_url(link.get('href')) and find(link.get('href'), filter_data):
                     links.append(link.get('href'))
             else:
                 if not external_links or external_links != "true":
-                    if isURL(link.get('href')):
+                    if is_url(link.get('href')):
                         links.append(link.get('href'))
                 else:
-                    if isURL(link.get('href')) and find(get_hostname(link.get('href')), get_hostname(url)) != True:
+                    if is_url(link.get('href')) and not find(get_hostname(link.get('href')), get_hostname(url)):
                         links.append(link.get('href'))
 
     for link in list(set(links)):
diff --git a/core/static_files.py b/core/static_files.py
index 73cfcd0..c5b34e7 100644
--- a/core/static_files.py
+++ b/core/static_files.py
@@ -47,7 +47,7 @@ def js_files(url, minify_files, filter_data, download):
             links.append(script_url)
 
     for script_url in list(set(links)):
-        table.add_row(getRemoteFileName(script_url), script_url)
+        table.add_row(get_remote_file_name(script_url), script_url)
         total_files += 1
 
     end_time = "{:.2f}".format(time.time() - start_time)
@@ -86,7 +86,7 @@ def css_files(url, minify_files, filter_data, download):
             links.append(css_url)
 
     for css_url in list(set(links)):
-        table.add_row(getRemoteFileName(css_url), css_url)
+        table.add_row(get_remote_file_name(css_url), css_url)
         total_files += 1
 
     end_time = "{:.2f}".format(time.time() - start_time)
@@ -119,7 +119,7 @@ def images_files(url, filter_data, download):
             links.append(img_url)
 
     for img_url in list(set(links)):
-        table.add_row(getRemoteFileName(img_url), img_url)
+        table.add_row(get_remote_file_name(img_url), img_url)
         total_files += 1
 
     end_time = "{:.2f}".format(time.time() - start_time)
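The external-link branch in get_links still compares hostnames with the substring-based `find`, so a lookalike domain such as `evil-example.com` would be classified as internal to `example.com`. A stricter comparison could look like this sketch; `is_external` is a hypothetical helper, not part of the patch.

```python
from urllib.parse import urlparse

def is_external(link, base_url):
    """Exact-hostname comparison instead of substring matching."""
    link_host = urlparse(link).hostname or ""
    base_host = urlparse(base_url).hostname or ""
    # Treat exact matches and subdomains of the base host as internal.
    return not (link_host == base_host or link_host.endswith("." + base_host))

print(is_external("https://cdn.example.com/a.js", "https://example.com"))  # False
print(is_external("https://evil-example.com/x", "https://example.com"))    # True
```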
diff --git a/plugins/imgur.py b/plugins/imgur.py
index 17b478c..c2de3ff 100644
--- a/plugins/imgur.py
+++ b/plugins/imgur.py
@@ -8,20 +8,20 @@
 
 console = Console(record=True)
 
-def getTitle(title):
+def get_title(title):
     if not title:
         return 'Screenshot made by Linkscraper'
     else:
         return title
 
-def embedCode(imgur_code_img, direct_link, imgur_page, title):
+def embed_code(imgur_code_img, direct_link, imgur_page, title):
     console.print("-" * 60)
     console.print("Embed codes")
     console.print("-" * 60)
 
-    console.print(f'[italic yellow]Imgur Post[/italic yellow]: {getTitle(title)}')
-    console.print(f"[italic yellow]HTML[/italic yellow]: {getTitle(title)}")
-    console.print(f"[italic yellow]Markdown[/italic yellow]: ![{getTitle(title)}]({direct_link})")
+    console.print(f'[italic yellow]Imgur Post[/italic yellow]: {get_title(title)}')
+    console.print(f"[italic yellow]HTML[/italic yellow]: {get_title(title)}")
+    console.print(f"[italic yellow]Markdown[/italic yellow]: ![{get_title(title)}]({direct_link})")
     console.print(f"[italic yellow]BBCode[/italic yellow]: [img]{direct_link}[/img]")
 
 def plugin_imgur(file, key, title):
@@ -37,20 +37,20 @@
     response = requests.request("POST", "https://api.imgur.com/3/image", headers = {
         'Authorization': f"Client-ID {key}"
     }, data = {
-        'image': toBase64(file),
-        'title': getTitle(title)
+        'image': to_base64(file),
+        'title': get_title(title)
     })
 
     callback = response.json()
 
     if callback["success"] == True:
         direct_link = callback['data']['link']
         imgur_page = direct_link.replace("i.", "")
-        imgur_code_img = removeExtension(imgur_page).replace("https://imgur.com/", "")
+        imgur_code_img = remove_extension(imgur_page).replace("https://imgur.com/", "")
 
-        console.print(f"Imgur page: [bold green]{removeExtension(imgur_page)}[/bold green]")
+        console.print(f"Imgur page: [bold green]{remove_extension(imgur_page)}[/bold green]")
         console.print(f"Link Direct: [bold green]{direct_link}[/bold green]")
 
-        embedCode(imgur_code_img, direct_link, imgur_page, title)
+        embed_code(imgur_code_img, direct_link, imgur_page, title)
 
         console.print("-" * 60)
         pyperclip.copy(direct_link)
diff --git a/utils/utils.py b/utils/utils.py
index cd783ce..ce07db6 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -62,7 +62,7 @@ def isJSON(string):
     except ValueError as e:
         return False
 
-def isURL(string, check_protocol = True):
+def is_url(string, check_protocol = True):
     url_pattern_check_protocol = "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"
     url_pattern = "^[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"
 
diff --git a/utils/utils_files.py b/utils/utils_files.py
index f5eadc1..2325783 100644
--- a/utils/utils_files.py
+++ b/utils/utils_files.py
@@ -5,11 +5,11 @@
 
 from utils.utils import *
 
-def localFileSize(file):
+def local_file_size(file):
     file_size = os.stat(file)
     return humanSize(file_size.st_size)
 
-def remoteFileSize(url):
+def remote_file_size(url):
     try:
         req_headers = requests.get(url)
         return humanSize(
@@ -18,16 +18,16 @@ def remoteFileSize(url):
     except:
         return None
 
-def toBase64(file):
+def to_base64(file):
     with open(file, "rb") as f:
         output = base64.b64encode(f.read())
 
     return output
 
-def removeExtension(file):
+def remove_extension(file):
     return file.rsplit(".", 1)[0]
 
-def getExtension(file):
+def get_extension(file):
     ext = os.path.splitext(file)
 
     if ext != "" or ext != ".":
@@ -35,7 +35,7 @@ def getExtension(file):
     else:
         return None
 
-def getFileName(string):
+def get_file_name(string):
     name = os.path.split(string)[1]
 
     if find(string, "?"):
@@ -43,7 +43,7 @@
     else:
         return name
 
-def getRemoteFileName(url):
+def get_remote_file_name(url):
     a = urlparse(url)
     basename = os.path.basename(a.path)
 
@@ -54,6 +54,6 @@
     if find(file, ".") and len(file) > 1:
         return file
 
-def createFolder(folder):
+def create_folder(folder):
     if os.path.isdir(folder) != True:
         os.makedirs(folder)
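Since `getRemoteFileName` returns a file name rather than a size, its snake_case counterpart is `get_remote_file_name`, keeping it distinct from `remote_file_size`. A short illustration of the two helpers; this is hypothetical usage, assuming the urlparse-based body shown above, and the size output format depends on `humanSize`.

```python
# Illustrative usage of the renamed helpers from utils/utils_files.py.
from utils.utils_files import get_remote_file_name, remote_file_size

# Query strings are ignored because only the URL path is parsed.
print(get_remote_file_name("https://example.com/js/app.min.js?v=2"))  # app.min.js
print(remote_file_size("https://example.com/js/app.min.js"))          # e.g. "12.4 KB"
```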