Skip to content

Commit

Permalink
first tries without extension specified to have textract get from filename
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom O'Hara committed Sep 23, 2023
1 parent aaefc2f commit f39f30b
Showing 1 changed file with 15 additions and 6 deletions.
21 changes: 15 additions & 6 deletions mezcla/extract_document_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,23 @@ def document_to_text(doc_filename):
debug.trace(4, f"document_to_text({doc_filename})")
debug.trace_expr(4, IN_EXT)
text = ""
OK = False
try:
## OLD: text = system.from_utf8(textract.process(doc_filename))
text = textract.process(doc_filename,
extension=IN_EXT
).decode("UTF-8")
except:
debug.trace_fmtd(3, "Warning: problem converting document file {f}: {e}",
text = textract.process(doc_filename).decode("UTF-8")
OK = True
except ImportError:
debug.trace_fmtd(3, "FYI: import error converting file {f}: {e}",
f=doc_filename, e=sys.exc_info())

if not OK:
try:
## OLD: text = system.from_utf8(textract.process(doc_filename))
text = textract.process(doc_filename,
extension=IN_EXT
).decode("UTF-8")
except:
debug.trace_fmtd(3, "Warning: problem converting document file {f}: {e}",
f=doc_filename, e=sys.exc_info())
debug.trace(4, f"document_to_text() => {gh.elide(text)!r}")
return text

Expand Down

0 comments on commit f39f30b

Please sign in to comment.