Skip to content

Commit

Permalink
fix: Switch to spine-based parsing when 25% or more chapters have emp…
Browse files Browse the repository at this point in the history
…ty bodies (#202)

Signed-off-by: starry-shivam <[email protected]>
  • Loading branch information
starry-shivam authored Aug 17, 2024
1 parent 45ab9be commit 9b8be6d
Showing 1 changed file with 20 additions and 3 deletions.
23 changes: 20 additions & 3 deletions app/src/main/java/com/starry/myne/epub/EpubParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ class EpubParser {
// is true, use the ToC file for parsing. Otherwise, parse using the spine.
val chapters = if (!tocNavPoints.isNullOrEmpty() && shouldUseToc) {
Log.d(TAG, "Parsing based on ToC file")
parseUsingTocFile(tocNavPoints, files, hrefRootPath)
parseUsingTocFile(tocNavPoints, files, hrefRootPath, document, manifestItems)
} else {
Log.d(TAG, "Parsing based on spine; shouldUseToc: $shouldUseToc")
parseUsingSpine(document.spine, manifestItems, files)
Expand Down Expand Up @@ -256,10 +256,14 @@ class EpubParser {

// Parse chapters based on the table of contents (ToC) file.
private fun parseUsingTocFile(
tocNavPoints: List<Element>, files: Map<String, EpubFile>, hrefRootPath: File
tocNavPoints: List<Element>,
files: Map<String, EpubFile>,
hrefRootPath: File,
document: EpubDocument,
manifestItems: Map<String, EpubManifestItem>
): List<EpubChapter> {
// Parse each chapter entry.
return tocNavPoints.flatMapIndexed { index, navPoint ->
val chapters = tocNavPoints.flatMapIndexed { index, navPoint ->
val title =
navPoint.selectFirstChildTag("navLabel")?.selectFirstChildTag("text")?.textContent
val chapterSrc = navPoint.selectFirstChildTag("content")?.getAttributeValue("src")
Expand Down Expand Up @@ -307,6 +311,19 @@ class EpubParser {
emptyList()
}
}.filter { it.body.isNotBlank() }.toList()

// If 25% or more chapters have empty bodies, that means the ToC file is not
// reliable and has incorrect references. In such cases, switch to spine-based parsing.
val emptyChapterThreshold = 0.25
val totalChapters = tocNavPoints.size
val emptyChapters = totalChapters - chapters.size

if (emptyChapters.toDouble() / totalChapters >= emptyChapterThreshold) {
Log.w(TAG, "25% or more of chapters have empty bodies. Switching to spine-based parsing.")
return parseUsingSpine(document.spine, manifestItems, files)
}

return chapters // Return the parsed chapters from the ToC file.
}

// Parse chapters based on the spine of the epub document.
Expand Down

0 comments on commit 9b8be6d

Please sign in to comment.