Skip to content

Commit

Permalink
Merge pull request #3 from bertsky/ignore-unknown-blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
rue-a authored May 4, 2023
2 parents 3857f71 + 7cc351e commit b19ba11
Showing 1 changed file with 11 additions and 6 deletions.
17 changes: 11 additions & 6 deletions textract2page/convert_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def convert_file(json_path: str, img_path: str, out_path: str) -> None:

for block in aws_json["Blocks"]:
if block["BlockType"] == "PAGE":
+ assert not page_block, "page must not have more than 1 PAGE block"
page_block = block
if block["BlockType"] == "LINE":
line_blocks[block["Id"]] = block
Expand Down Expand Up @@ -190,9 +191,11 @@ def convert_file(json_path: str, img_path: str, out_path: str) -> None:

# TextLine from LINE blocks that are listed in the PAGE-block's
# child relationships
- for line_block_id in [rel["Ids"]
- for rel in page_block.get("Relationships", [])
- if rel["Type"] == "CHILD"][0]:
+ for line_block_id in next((rel.get("Ids", [])
+ for rel in page_block.get("Relationships", [])
+ if rel["Type"] == "CHILD"), []):
+ if line_block_id not in line_blocks:
+ continue
line_block = line_blocks[line_block_id]
if "Polygon" in line_block["Geometry"]:
awsgeometry = TextractPolygon(line_block["Geometry"]["Polygon"])
Expand All @@ -212,9 +215,11 @@ def convert_file(json_path: str, img_path: str, out_path: str) -> None:

# Word from WORD blocks that are listed in the LINE-block's
# child relationships
- for word_block_id in [rel["Ids"]
- for rel in line_block.get("Relationships", [])
- if rel["Type"] == "CHILD"][0]:
+ for word_block_id in next((rel.get("Ids", [])
+ for rel in line_block.get("Relationships", [])
+ if rel["Type"] == "CHILD"), []):
+ if word_block_id not in word_blocks:
+ continue
word_block = word_blocks[word_block_id]
if "Polygon" in word_block["Geometry"]:
awsgeometry = TextractPolygon(word_block["Geometry"]["Polygon"])
Expand Down

0 comments on commit b19ba11

Please sign in to comment.