diff --git a/examples/html_2_xlsx.ipynb b/examples/html_2_xlsx.ipynb new file mode 100644 index 0000000..e212178 --- /dev/null +++ b/examples/html_2_xlsx.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HTML 2 XLSX\n", + "Notebook to convert the HTML tables in the output of `AnyParser` to an XLSX file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install BeautifulSoup4\n", + "# !pip3 install lxml\n", + "# !pip3 install openpyxl\n", + "# !pip3 install pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Input Data\n", + "First, we load the `input_html` string from the `AnyParser` output, located in the `html_2_xlsx/input` folder." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from html_2_xlsx.input.html_input import input_html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Convert HTML to XLSX\n", + "Next, we define the `html_to_excel` function, which reads the HTML string and converts it to an XLSX file using `BeautifulSoup` and `pandas`." 
import os
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO


def html_to_excel(html_string, output_folder, output_filename):
    """Export every ``<table>`` in an HTML string to one XLSX workbook.

    Each table found in ``html_string`` is parsed into a DataFrame and
    written to its own sheet, named ``Table_1``, ``Table_2``, ... in
    document order. The DataFrame index is not written.

    Args:
        html_string: HTML markup containing one or more ``<table>`` elements.
        output_folder: Directory for the workbook; created if missing.
            An empty string means the current working directory.
        output_filename: File name of the workbook, e.g. ``"out.xlsx"``.

    Raises:
        ValueError: If ``html_string`` contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    tables = soup.find_all('table')

    # Fail fast with a clear message instead of attempting to save a
    # workbook that has no sheets, and before touching the filesystem.
    if not tables:
        raise ValueError("No <table> elements found in the HTML input")

    # read_html returns a list of DataFrames; each snippet holds exactly
    # one top-level table, so the first entry is the one we want.
    dfs = {
        f"Table_{i}": pd.read_html(StringIO(str(table)))[0]
        for i, table in enumerate(tables, start=1)
    }

    # exist_ok avoids the check-then-create race; an empty folder name
    # refers to the current directory, which needs no creation.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_file = os.path.join(output_folder, output_filename)
    with pd.ExcelWriter(output_file) as writer:
        for name, df in dfs.items():
            df.to_excel(writer, sheet_name=name, index=False)

    print(f"Excel file saved to {output_file}")


# Finally, we run `html_to_excel` to convert the HTML to XLSX.
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Excel file saved to html_2_xlsx/output/html_2_excel_output.xlsx\n" + ] + } + ], + "source": [ + "output_file = 'html_2_excel_output.xlsx'\n", + "output_folder = 'html_2_xlsx/output'\n", + "html_to_excel(input_html, output_folder, output_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "open-parser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/html_2_xlsx/input/html_input.py b/examples/html_2_xlsx/input/html_input.py new file mode 100644 index 0000000..e7478ad --- /dev/null +++ b/examples/html_2_xlsx/input/html_input.py @@ -0,0 +1,46 @@ +input_html = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Summary Current Month- March-2024 Three Months - 1/1/2024 to 3/31/2024 Twelve Months - 4/1/2023 to 3/31/2024
Number of Reporting Licensees - 444 Number of ReportingLicensees - 445 Number of ReportingLicensees 453
Unit Description # Of Loc*# Of UnitsWin* Amount % Chg Win Percent# Of Loc*Avg UnitsWin** Amount% Chg Win Percent# Of Loc*Avg UnitsWin** Amount% Chg Win Percent
Table, Counter and Card Games
Twenty One 114 2,065 108,802 (18.17) 13.59 114 2,063 339,520 (7.64) 13.85 115 2,017 1,297,555 (3.75) 14.19
Craps 101 336 41,567 2.11 16.48 102 324 118,285 1.03 16.70 105 319 468,742 2.60 16.82
Roulette 100 435 37,678 8.00 20.41 100 441 96,295 (20.68) 14.96 101 434 435,203 (7.09) 18.01
Ultim Texas Hold'em 70 194 17,256 (0.22) 23.08 71 194 51,882 0.45 24.06 72 182 188,983 0.36 21.77
3-Card Poker 90 165 9,997 (23.54) 27.97 90 167 29,993 (15.13) 31.02 93 169 119,934 (8.19) 31.53
Baccarat 34 447 114,694 75.51 18.81 34 446 392,829 51.81 17.45 36 405 1,622,476 39.79 17.85
Mini-Baccarat 37 115 4,980 (24.61) 7.65 38 119 21,183 (8.28) 10.14 41 115 88,698 (9.65) 11.85
Keno 31 41 1,487 (6.03) 30.51 31 41 4,148 (8.07) 29.02 33 42 17,648 (5.14) 29.57
Bingo 31 31 6,890 1.51 30.58 31 31 12,486 (5.93) 23.22 34 31 44,769 (3.15) 21.65
Let It Ride 30 35 1,487 (0.12) 24.21 31 35 4,184 (16.04) 25.66 33 37 16,970 (17.15) 25.37
Pai Gow 13 25 2,164 368.92 46.16 15 26 2,731 19.56 19.10 16 25 11,112 19.01 22.00
Pai Gow Poker 76 208 10,875 (6.82) 20.99 76 209 32,533 22.28 21.69 78 208 130,222 (5.34) 21.84
Race Book (1) 165 163 2,815 (2.94) 16.52 167 164 7,076 (6.26) 16.41 169 160 29,962 (9.65) 15.64
Sports Pool (2) 181 181 29,762 (32.13) 3.79 181 180 142,411 5.06 6.25 182 177 488,194 5.12 6.02
Card Games 35 454 14,664 (4.42) 35 452 39,840 (6.00) 38 481 226,354 1.71
Other 153 10,773 1.63 23.76 164 28,620 (14.25) 18.40 158 129,395 1.44 20.27
Total 263 5,048 415,892 2.54 13.98 263 5,056 1,324,017 6.31 14.17 266 4,960 5,316,219 7.81 14.94
Slot Machines
1 Cent 247 40,371 220,252 (23.43) 8,97 248 41,784 659,530 (21.36) 9.66 256 43,757 2,973,572 (15.68) 9.58
5 Cent 90 847 3,307 (8.54) 6.56 94 815 9,880 (5.73) 6.61 104 834 41,975 (1.27) 5.69
25 Cent 163 2,692 15,903 (15.81) 7.33 167 2,761 46,711 (17.96) 7.99 174 2,958 209,647 (12.20) 7.93
1 Dollar 169 5,679 43,643 (26.86) 6.23 171 5,873 138,351 (20.30) 6.93 179 6,157 602,837 (16.78) 6.43
5 Dollars 87 574 4,988 (20.77) 5.15 88 599 15,291 (23.43) 5.11 92 621 65,712 (22.31) 5.06
25 Dollars 46 109 1,329 (39.77) 5.90 47 113 4,796 (22.22) 6.39 52 121 21,451 (27.94) 5.21
100 Dollars 32 86 2,196 (36.11) 6.35 32 87 7,722 (8.40) 7.08 35 86 31,216 (14.11) 6.24
Multi Denomination 309 76,718 573,675 12.37 6.41 310 75,315 1,677,377 17.60 6.73 317 70,289 6,224,784 15.94 6.50
Other 917 9,220 (34.95) 1,009 28,173 (21.78) 1,037 124,892 (7.27)
Total 316 127,993 874,515 (3.53) 6.88 317 128,356 2,587,831 0.44 7.29 325 125,860 10,296,085 1.08 7.15
Total Gaming 1,290,407 (1.65) 3,911,849 2.35 15,612,304 3.27
(1) Race Book Parimutuel60 60 2,786 (3.60) 16.51 62 62 7,017 (6.54) 16.44 66 66 29,587 (8.81) 15.91
Sports Mobile 56 26,010 1.66 5.13 58 79,867 33.25 5.55 68 260,585 18.54 4.86
(2) Sports Football 120 120 (12,881) (3.82) (4,049.07) 181 181 37,565 51.81 6.90 182 182 170,755 9.25 6.42
Sports Basketball 180 180 32,464 (27.64) 5.31 180 180 75,596 (7.81) 5.74 182 182 127,091 (4.29) 5.24
Sports Baseball 179 179 1,974 5.92 179 179 16 (101.44) 0.05 181 181 94,320 17.15 5.82
Sports Parlay Cards 26 26 (155) (496.78) (90.37) 153 153 77 (96.21) 2.91 163 163 6,510 (43.39) 23.22
Sports Hockey 178 178 730 (85.23) 1.60 178 178 8,117 (34.42) 6.35 181 181 14,706 (41.12) 3.76
Other 181 7,630 4.05 8.14 180 21,039 35.67 8.33 177 74,812 28.23 7.56
+""" diff --git a/examples/html_2_xlsx/output/html_2_excel_output.xlsx b/examples/html_2_xlsx/output/html_2_excel_output.xlsx new file mode 100644 index 0000000..a421a58 Binary files /dev/null and b/examples/html_2_xlsx/output/html_2_excel_output.xlsx differ