Skip to content

Commit

Permalink
Added convert_to_html (#71)
Browse files Browse the repository at this point in the history
* added convert_to_html

Signed-off-by: Peter Staar <[email protected]>

* added convert_to_html

Signed-off-by: Peter Staar <[email protected]>

* added convert_to_html (2)

Signed-off-by: Peter Staar <[email protected]>

* added convert_to_html (3)

Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored Jun 6, 2024
1 parent dcbbf00 commit e4ce593
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 17 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ for **Document conversion** as well as **Data and Knowledge exploration**.
- [Deep Search Toolkit](https://github.com/ds4sd/deepsearch-toolkit)
- [Documentation](https://ds4sd.github.io/deepsearch-toolkit/)


## Examples

### Setup and usage
Expand Down
63 changes: 53 additions & 10 deletions examples/document_conversion_quick_start/convert_documents.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,18 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "b01a4fd1",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Project key: 1234567890abcdefghijklmnopqrstvwyz123456\n"
]
}
],
"source": [
"from dsnotebooks.settings import ProjectNotebookSettings\n",
"\n",
Expand All @@ -56,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"id": "502cdef8",
"metadata": {
"ExecuteTime": {
Expand All @@ -72,7 +80,7 @@
"from pathlib import Path\n",
"from zipfile import ZipFile\n",
"\n",
"from deepsearch.documents.core.export import export_to_markdown\n",
"from deepsearch.documents.core.export import export_to_markdown, export_to_html\n",
"from IPython.display import display, Markdown, HTML, display_html"
]
},
Expand All @@ -86,7 +94,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"id": "f44fbf08",
"metadata": {},
"outputs": [],
Expand All @@ -96,15 +104,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "ec83eb0b",
"metadata": {
"ExecuteTime": {
"end_time": "2022-08-02T12:14:49.216045Z",
"start_time": "2022-08-02T12:14:25.380757Z"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 39.65it/s]\u001b[38;2;15;98;254m \u001b[0m\n",
"Submitting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:04<00:00, 4.48s/it]\u001b[38;2;15;98;254m \u001b[0m\n",
"Converting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:36<00:00, 36.90s/it]\u001b[38;2;15;98;254m \u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Total documents': 1, 'Successfully converted documents': 1}\n"
]
}
],
"source": [
"output_dir = Path(\"./converted_docs\")\n",
"\n",
Expand All @@ -121,10 +146,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "382c4869-cca9-43fc-8052-c0ab7e9c175d",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"writing converted_docs/2206.01062.md\n",
"writing converted_docs/2206.01062.html\n"
]
}
],
"source": [
"# Iterate over output files and visualize the output\n",
"for output_file in output_dir.rglob(\"json*.zip\"):\n",
Expand All @@ -136,13 +170,22 @@
"\n",
" basename = name.rstrip(\".json\")\n",
" doc_json = json.loads(archive.read(f\"{basename}.json\"))\n",
"\n",
" doc_md = export_to_markdown(doc_json)\n",
"\n",
" ofile = output_dir / f\"{basename}.md\"\n",
" print(f\"writing {ofile}\")\n",
"\n",
" with ofile.open(\"w\") as fw:\n",
" fw.write(doc_md)\n",
"\n",
" print(f\"writing {ofile}\")"
" doc_html = export_to_html(doc_json)\n",
"\n",
" ofile = output_dir / f\"{basename}.html\"\n",
" print(f\"writing {ofile}\")\n",
"\n",
" with ofile.open(\"w\") as fw:\n",
" fw.write(doc_html)"
]
},
{
Expand Down
10 changes: 6 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ packages = [

[tool.poetry.dependencies]
python = ">= 3.8, <3.11"
deepsearch-toolkit = "^0.46.0"
deepsearch-toolkit = "0.47.0"
jupyter = "^1.0.0"
ipywidgets = "^7" # previous major release is needed because of mols2grid
numpy = "^1.23.4"
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ cymem==2.0.8 ; python_version >= "3.8" and python_version < "3.11"
debugpy==1.8.1 ; python_version >= "3.8" and python_version < "3.11"
decorator==5.1.1 ; python_version >= "3.8" and python_version < "3.11"
deepsearch-glm==0.16.2 ; python_version >= "3.8" and python_version < "3.11" and sys_platform != "win32"
deepsearch-toolkit==0.46.0 ; python_version >= "3.8" and python_version < "3.11"
deepsearch-toolkit==0.47.0 ; python_version >= "3.8" and python_version < "3.11"
defusedxml==0.7.1 ; python_version >= "3.8" and python_version < "3.11"
deprecated==1.2.14 ; python_version >= "3.8" and python_version < "3.11"
et-xmlfile==1.1.0 ; python_version >= "3.8" and python_version < "3.11"
Expand Down

0 comments on commit e4ce593

Please sign in to comment.