-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
increase accuracy and test coverage for PlaintextRenderer #250
Changes from all commits
0133ccf
e85729f
7729f8e
d547ee1
9dd9979
975e0b7
52299fc
15f7992
82624a6
f0a063a
a152caa
fb34009
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,40 +16,123 @@ | |
|
||
package pink.cozydev.protosearch.analysis | ||
|
||
import laika.ast._ | ||
import laika.ast.* | ||
import laika.api.format.{Formatter, RenderFormat} | ||
import laika.ast.html.{HTMLBlock, HTMLSpan} | ||
|
||
object PlaintextRenderer extends ((Formatter, Element) => String) { | ||
|
||
private case class Content(content: Seq[Element], options: Options = Options.empty) | ||
extends Element | ||
with ElementContainer[Element] { | ||
type Self = Content | ||
def withOptions(options: Options): Content = copy(options = options) | ||
} | ||
|
||
def apply(fmt: Formatter, element: Element): String = { | ||
|
||
def renderElement(e: Element): String = { | ||
val (elements, _) = e.productIterator.partition(_.isInstanceOf[Element]) | ||
e.productPrefix + fmt.indentedChildren( | ||
elements.toList.asInstanceOf[Seq[Element]] | ||
) | ||
def renderElement(e: Element): String = e match { | ||
|
||
/* search engines tend to index alt and title attributes of images */ | ||
case img: Image => (img.alt.toList ++ img.title.toList).mkString(" ") | ||
|
||
/* only pick up nodes targeting HTML output */ | ||
case TargetFormat(formats, element, _) if formats.contains("html") => fmt.child(element) | ||
|
||
/* tabbed content in HTML, including all tabs */ | ||
case sel: Selection => renderBlocks(sel.choices.flatMap(_.content)) | ||
|
||
/* traverse to extract text nodes in verbatim HTML */ | ||
case html: HTMLBlock => fmt.child(html.root) | ||
|
||
/* 3rd party nodes can implement Fallback to provide an alternative representation | ||
* of the same element based on more common node types. */ | ||
case f: Fallback => fmt.child(f.fallback) | ||
|
||
/* ignore unknown nodes as we cannot know if they represent visual, textual information */ | ||
case _ => "" | ||
} | ||
|
||
def lists(lists: Seq[Element]*): String = | ||
fmt.childPerLine(lists.map { case elems => Content(elems) }) | ||
def renderListContainer(con: ListContainer): String = con match { | ||
/* Excluded as they would either produce unwanted entries (e.g. the headline of a different page) | ||
* or duplicate entries (e.g. a section title on the current page) | ||
*/ | ||
case _: NavigationList | _: NavigationItem => "" | ||
case _ => fmt.children(con.content) | ||
} | ||
|
||
element match { | ||
case s: Section => | ||
fmt.children(s.header.content) + "\n" + fmt.childPerLine(s.content) + "\n" | ||
def renderBlockContainer(con: BlockContainer): String = con match { | ||
/* Some special handling for the few containers which hold child nodes | ||
in more properties than just the container's `content` property. | ||
*/ | ||
case Section(header, content, _) => renderBlock(header.content) + renderBlocks(content) | ||
case QuotedBlock(content, attr, _) => renderBlocks(content) + renderBlock(attr) | ||
case TitledBlock(title, content, _) => renderBlock(title) + renderBlocks(content) | ||
case Figure(_, caption, content, _) => renderBlock(caption) + renderBlocks(content) | ||
case DefinitionListItem(term, defn, _) => renderBlock(term) + renderBlocks(defn) | ||
case _ => renderBlocks(con.content) | ||
} | ||
|
||
def renderElementContainer(con: ElementContainer[? <: Element]): String = con match { | ||
/* SectionInfo is solely used in navigation structures and represents duplicate info. | ||
*/ | ||
case _: SectionInfo => "" | ||
/* All other core AST types implement one of the sub-traits of ElementContainer - | ||
if we end up here it's an unknown 3rd party node | ||
*/ | ||
case _ => fmt.children(con.content) | ||
} | ||
|
||
def renderTextContainer(con: TextContainer): String = con match { | ||
/* match on most common container type first for performance */ | ||
case Text(content, _) => content | ||
/* could be any unknown markup format */ | ||
case _: RawContent => "" | ||
/* comments are usually ignored by search engines */ | ||
case _: Comment => "" | ||
/* embedded debug info node */ | ||
case _: RuntimeMessage => "" | ||
/* this does not represent text nodes in verbatim HTML */ | ||
case _: HTMLSpan => "" | ||
case _: SectionNumber => "" | ||
case QuotedBlock(content, attr, _) => lists(content, attr) | ||
case DefinitionListItem(term, defn, _) => lists(term, defn) + "\n" | ||
case bc: BlockContainer => fmt.childPerLine(bc.content) + "\n" | ||
case tc: TextContainer => tc.content | ||
case Content(content, _) => fmt.childPerLine(content) | ||
case ec: ElementContainer[_] => fmt.children(ec.content) | ||
case _ => con.content | ||
} | ||
|
||
def renderTable(table: Table): String = { | ||
val cells = (table.head.content ++ table.body.content).flatMap(_.content) | ||
renderBlocks(cells.flatMap(_.content)) + renderBlock(table.caption.content) | ||
Comment on lines
+95
to
+96
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a lot of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, just looking at those lines you might think the naming is unfortunate. But as much as I'd love to be more explicit, the |
||
} | ||
|
||
def renderTemplateSpan(ts: TemplateSpan): String = ts match { | ||
/* The first two types represent nodes originating in markup. | ||
* Applying a template happens by merging its AST with the markup AST, | ||
* meaning its node types will be interspersed. | ||
* It is unlikely anyone will use a template for the index renderer, | ||
* but the use case should be covered - it could lead to an empty index | ||
* for pages with templates otherwise. */ | ||
case EmbeddedRoot(content, _, _) => renderBlocks(content) | ||
case TemplateElement(element, _, _) => renderElement(element) | ||
|
||
case tsc: TemplateSpanContainer => fmt.children(tsc.content) | ||
|
||
/* The rest is HTML markup or unknown content */ | ||
case _ => "" | ||
} | ||
|
||
def renderBlocks(blocks: Seq[Block]): String = | ||
if (blocks.nonEmpty) fmt.childPerLine(blocks) + fmt.newLine | ||
else "" | ||
|
||
def renderBlock(spans: Seq[Span]): String = | ||
if (spans.nonEmpty) fmt.children(spans) + fmt.newLine | ||
else "" | ||
|
||
element match { | ||
/* These are marker traits for nodes we should ignore. | ||
* They usually also implement some of the other traits we match on, | ||
* so this always needs to come first. */ | ||
case _: Hidden | _: Unresolved | _: Invalid => "" | ||
case lc: ListContainer => renderListContainer(lc) | ||
case bc: BlockContainer => renderBlockContainer(bc) | ||
case sc: SpanContainer => fmt.children(sc.content) | ||
case t: Table => renderTable(t) | ||
case tsc: TemplateSpanContainer => fmt.children(tsc.content) | ||
case ts: TemplateSpan => renderTemplateSpan(ts) | ||
case tc: TextContainer => renderTextContainer(tc) | ||
case ec: ElementContainer[?] => renderElementContainer(ec) | ||
case e => renderElement(e) | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(not a change request, just me thinking outloud)
This line of code made me realize a cheap and cheerful way to identify "fragments" within the "plain text" in order to divide the text up into logical pieces during highlighting (#205). Which is to say, delimiting "fragments" by newlines. Effectively making every line in the plain text format a "fragment" and then, during highlighting we figure out the fragment that best matches the query.
So in some future effort I might change this line to use
.mkString("\n")
, making the caption and the title to different fragments to potentially highlight. Or maybe not, maybe it's better to have them together in one fragment. It's just nice that there is a pretty easy way to control this in this renderer, if we go that route.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Haven't thought about that aspect yet, but yes, makes a lot of sense. Another aspect I'm wondering about is scoring - in theory the AST model is ideal for that, you could have headlines score more than matches in body text, but I'm not sure how you'd ever carry that information in a plain text renderer without using an interim format that needs to be parsed again?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I think you're right about needing to reparse.
Because rendering in Laika requires us to output
String
, we're a bit limited in how we can preserve structure without requiring an additional parsing step.But that's not the end of the world or anything. We have to tokenize the text anyways, so ideally we can handle the secondary parsing there.