diff --git a/laikaIO/src/main/scala/pink/cozydev/protosearch/analysis/PlaintextRenderer.scala b/laikaIO/src/main/scala/pink/cozydev/protosearch/analysis/PlaintextRenderer.scala index f4cf071..d2ce405 100644 --- a/laikaIO/src/main/scala/pink/cozydev/protosearch/analysis/PlaintextRenderer.scala +++ b/laikaIO/src/main/scala/pink/cozydev/protosearch/analysis/PlaintextRenderer.scala @@ -16,40 +16,123 @@ package pink.cozydev.protosearch.analysis -import laika.ast._ +import laika.ast.* import laika.api.format.{Formatter, RenderFormat} +import laika.ast.html.{HTMLBlock, HTMLSpan} object PlaintextRenderer extends ((Formatter, Element) => String) { - private case class Content(content: Seq[Element], options: Options = Options.empty) - extends Element - with ElementContainer[Element] { - type Self = Content - def withOptions(options: Options): Content = copy(options = options) - } - def apply(fmt: Formatter, element: Element): String = { - def renderElement(e: Element): String = { - val (elements, _) = e.productIterator.partition(_.isInstanceOf[Element]) - e.productPrefix + fmt.indentedChildren( - elements.toList.asInstanceOf[Seq[Element]] - ) + def renderElement(e: Element): String = e match { + + /* search engines tend to index alt and title attributes of images */ + case img: Image => (img.alt.toList ++ img.title.toList).mkString(" ") + + /* only pick up nodes targeting HTML output */ + case TargetFormat(formats, element, _) if formats.contains("html") => fmt.child(element) + + /* tabbed content in HTML, including all tabs */ + case sel: Selection => renderBlocks(sel.choices.flatMap(_.content)) + + /* traverse to extract text nodes in verbatim HTML */ + case html: HTMLBlock => fmt.child(html.root) + + /* 3rd party nodes can implement Fallback to provide an alternative representation + * of the same element based on more common node types. */ + case f: Fallback => fmt.child(f.fallback) + + /* ignore unknown nodes as we cannot know if they represent visual, textual information */ + case _ => "" } - def lists(lists: Seq[Element]*): String = - fmt.childPerLine(lists.map { case elems => Content(elems) }) + def renderListContainer(con: ListContainer): String = con match { + /* Excluded as they would either produce unwanted entries (e.g. the headline of a different page) + * or duplicate entries (e.g. a section title on the current page) + */ + case _: NavigationList | _: NavigationItem => "" + case _ => fmt.children(con.content) + } - element match { - case s: Section => - fmt.children(s.header.content) + "\n" + fmt.childPerLine(s.content) + "\n" + def renderBlockContainer(con: BlockContainer): String = con match { + /* Some special handling for the few containers which hold child nodes + in more properties than just the container's `content` property. + */ + case Section(header, content, _) => renderBlock(header.content) + renderBlocks(content) + case QuotedBlock(content, attr, _) => renderBlocks(content) + renderBlock(attr) + case TitledBlock(title, content, _) => renderBlock(title) + renderBlocks(content) + case Figure(_, caption, content, _) => renderBlock(caption) + renderBlocks(content) + case DefinitionListItem(term, defn, _) => renderBlock(term) + renderBlocks(defn) + case _ => renderBlocks(con.content) + } + + def renderElementContainer(con: ElementContainer[? <: Element]): String = con match { + /* SectionInfo is solely used in navigation structures and represents duplicate info. + */ + case _: SectionInfo => "" + /* All other core AST types implement one of the sub-traits of ElementContainer - + if we end up here it's an unknown 3rd party node + */ + case _ => fmt.children(con.content) + } + + def renderTextContainer(con: TextContainer): String = con match { + /* match on most common container type first for performance */ + case Text(content, _) => content + /* could be any unknown markup format */ + case _: RawContent => "" + /* comments are usually ignored by search engines */ + case _: Comment => "" + /* embedded debug info node */ + case _: RuntimeMessage => "" + /* this does not represent text nodes in verbatim HTML */ + case _: HTMLSpan => "" case _: SectionNumber => "" - case QuotedBlock(content, attr, _) => lists(content, attr) - case DefinitionListItem(term, defn, _) => lists(term, defn) + "\n" - case bc: BlockContainer => fmt.childPerLine(bc.content) + "\n" - case tc: TextContainer => tc.content - case Content(content, _) => fmt.childPerLine(content) - case ec: ElementContainer[_] => fmt.children(ec.content) + case _ => con.content + } + + def renderTable(table: Table): String = { + val cells = (table.head.content ++ table.body.content).flatMap(_.content) + renderBlocks(cells.flatMap(_.content)) + renderBlock(table.caption.content) + } + + def renderTemplateSpan(ts: TemplateSpan): String = ts match { + /* The first two types represent nodes originating in markup. + * Applying a template happens by merging its AST with the markup AST, + * meaning its node types will be interspersed. + * It is unlikely anyone will use a template for the index renderer, + * but the use case should be covered - it could lead to an empty index + * for pages with templates otherwise. */ + case EmbeddedRoot(content, _, _) => renderBlocks(content) + case TemplateElement(element, _, _) => renderElement(element) + + case tsc: TemplateSpanContainer => fmt.children(tsc.content) + + /* The rest is HTML markup or unknown content */ + case _ => "" + } + + def renderBlocks(blocks: Seq[Block]): String = + if (blocks.nonEmpty) fmt.childPerLine(blocks) + fmt.newLine + else "" + + def renderBlock(spans: Seq[Span]): String = + if (spans.nonEmpty) fmt.children(spans) + fmt.newLine + else "" + + element match { + /* These are marker traits for nodes we should ignore. + * They usually also implement some of the other traits we match on, + * so this always needs to come first. */ + case _: Hidden | _: Unresolved | _: Invalid => "" + case lc: ListContainer => renderListContainer(lc) + case bc: BlockContainer => renderBlockContainer(bc) + case sc: SpanContainer => fmt.children(sc.content) + case t: Table => renderTable(t) + case tsc: TemplateSpanContainer => fmt.children(tsc.content) + case ts: TemplateSpan => renderTemplateSpan(ts) + case tc: TextContainer => renderTextContainer(tc) + case ec: ElementContainer[?] => renderElementContainer(ec) case e => renderElement(e) } } diff --git a/laikaIO/src/test/scala/pink/cozydev/protosearch/analysis/PlaintextRendererSuite.scala b/laikaIO/src/test/scala/pink/cozydev/protosearch/analysis/PlaintextRendererSuite.scala index 4e0609e..c240eb8 100644 --- a/laikaIO/src/test/scala/pink/cozydev/protosearch/analysis/PlaintextRendererSuite.scala +++ b/laikaIO/src/test/scala/pink/cozydev/protosearch/analysis/PlaintextRendererSuite.scala @@ -16,19 +16,75 @@ package pink.cozydev.protosearch.analysis -import laika.api.MarkupParser -import laika.format.Markdown -import laika.config.SyntaxHighlighting -import laika.api.Renderer +import cats.effect.{IO, Resource} +import laika.api.{MarkupParser, Renderer, Transformer} +import laika.format.{Markdown, ReStructuredText} +import laika.config.{ChoiceConfig, SelectionConfig, Selections, SyntaxHighlighting} import laika.api.errors.TransformationError +import laika.ast.Path.Root +import laika.io.api.TreeTransformer +import laika.io.model.InputTree +import laika.io.syntax.* +import munit.CatsEffectSuite -class PlaintextRendererSuite extends munit.FunSuite { +import scala.annotation.nowarn - val parser = MarkupParser.of(Markdown).using(Markdown.GitHubFlavor, SyntaxHighlighting).build - val plaintextRenderer = Renderer.of(Plaintext).build +@nowarn("msg=possible missing interpolator") +class PlaintextRendererSuite extends CatsEffectSuite { - def render(input: String): Either[TransformationError, String] = - parser.parse(input).flatMap(d => plaintextRenderer.render(d)) + val selections: Selections = Selections( + SelectionConfig( + "name", + ChoiceConfig("aaa", "AAA Content"), + ChoiceConfig("bbb", "BBB Content"), + ) + ) + + val markdownParser: MarkupParser = + MarkupParser + .of(Markdown) + .using(Markdown.GitHubFlavor, SyntaxHighlighting) + .withConfigValue(selections) + .withRawContent + .build + + val rstParser: MarkupParser = + MarkupParser.of(ReStructuredText).withRawContent.build + val plaintextRenderer: Renderer = Renderer.of(Plaintext).build + + val ioTransformer: Resource[IO, TreeTransformer[IO]] = Transformer + .from(Markdown) + .to(Plaintext) + .using(Markdown.GitHubFlavor, SyntaxHighlighting) + .parallel[IO] + .build + + def transformMarkdown(input: String): Either[TransformationError, String] = + markdownParser.parse(input).flatMap(d => plaintextRenderer.render(d)) + def transformRST(input: String): Either[TransformationError, String] = + rstParser.parse(input).flatMap(d => plaintextRenderer.render(d)) + + def transformWithTemplate(input: String, template: String): IO[String] = { + val firstDoc = + s"""|{% + |laika.template = custom.template.txt + |%} + | + |$input""".stripMargin + val secondDoc = + """|Second Doc + |========== + | + |Text + |""".stripMargin + val inputTree = InputTree[IO] + .addString(firstDoc, Root / "doc-1.md") + .addString(secondDoc, Root / "doc-2.md") + .addString(template, Root / "custom.template.txt") + ioTransformer.use { + _.fromInput(inputTree).toMemory.transform.map(_.allDocuments.head.content) + } + } test("title, words") { val doc = @@ -40,7 +96,7 @@ class PlaintextRendererSuite extends munit.FunSuite { |normal bold italics code | |""".stripMargin - assertEquals(render(doc), Right(expected)) + assertEquals(transformMarkdown(doc), Right(expected)) } test("title, empty line, words") { @@ -54,7 +110,7 @@ class PlaintextRendererSuite extends munit.FunSuite { |normal bold italics code | |""".stripMargin - assertEquals(render(doc), Right(expected)) + assertEquals(transformMarkdown(doc), Right(expected)) } test("title, empty line, words, code block") { @@ -73,7 +129,307 @@ class PlaintextRendererSuite extends munit.FunSuite { |val x = 2 | |""".stripMargin - assertEquals(render(doc), Right(expected)) + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("table head and body - Markdown with GitHub Flavor") { + val doc = + """|| AAA | BBB | + || --- | --- | + || CCC | DDD | + || EEE | FFF | + | + |Some more text + """.stripMargin + val expected = + """|AAA + |BBB + |CCC + |DDD + |EEE + |FFF + | + |Some more text + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("images - index alt and title attributes") { + val doc = + """|AAA BBB @:image(logo.png) { + | alt = Some Explanation + | title = Tooltip Text + |} CCC + |""".stripMargin + val expected = + """|AAA BBB Some Explanation Tooltip Text CCC + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("content from select directive") { + val doc = + """|@:select(name) + | + |@:choice(aaa) + |AAA + | + |@:choice(bbb) + |BBB + | + |@:@ + | + |Other Text""".stripMargin + val expected = + """|AAA + |BBB + | + |Other Text + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("only index target-specific content for HTML") { + val doc = + """|Normal Text + | + |@:format(html) + |HTML *Content* + |@:@ + | + |@:format(epub) + |EPUB *Content* + |@:@ + |""".stripMargin + val expected = + """|Normal Text + |HTML Content + | + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + /** BlockContainers **************************************************** */ + + test("nested blockquotes - Markdown") { + val doc = + """|>aaa + |> + |>>bbb + |> + |>ccc""".stripMargin + val expected = + """|aaa + |bbb + | + |ccc + | + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("block quote with an attribution - reStructuredText") { + val doc = + """| Paragraph 1 + | + | -- an attribution""".stripMargin + val expected = + """|Paragraph 1 + |an attribution + | + |""".stripMargin + assertEquals(transformRST(doc), Right(expected)) + } + + test("titled block - reStructuredText") { + val doc = + """|.. caution:: + | + | Line 1 + | + | Line 2""".stripMargin + val expected = + """|Caution! + |Line 1 + |Line 2 + | + |""".stripMargin + assertEquals(transformRST(doc), Right(expected)) + } + + test("figure with a caption and a legend - reStructuredText") { + val doc = + """|.. figure:: picture.jpg + | + | This is the *caption* + | + | And this is the legend""".stripMargin + val expected = + """|This is the caption + |And this is the legend + | + |""".stripMargin + assertEquals(transformRST(doc), Right(expected)) + } + + /** lists ************************************************************** */ + + test("nested bullet lists") { + val doc = + """|* Bullet 1 - Line 1 + | + | Bullet 1 - Line 2 + | + | * Nested - Line 1 + | + | Nested - Line 2 + | + |* Bullet 2 - Line 1 + | Bullet 2 - Line 2 + |""".stripMargin + val expected = + """|Bullet 1 - Line 1 + |Bullet 1 - Line 2 + | + |Nested - Line 1 + |Nested - Line 2 + |Bullet 2 - Line 1 + |Bullet 2 - Line 2 + | + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("enum lists") { + val doc = + """|1. Item 1 + |2. Item *em* 2 + |3. Item 3 + |""".stripMargin + val expected = + """|Item 1 + |Item em 2 + |Item 3 + | + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("definition list - reStructuredText") { + val doc = + """|term 1 + | aaa + | aaa + | + | bbb *ccc* ddd + | + |term 2 + | ccc""".stripMargin + val expected = + """|term 1 + |aaa + |aaa + | + |bbb ccc ddd + |term 2 + |ccc + | + |""".stripMargin + assertEquals(transformRST(doc), Right(expected)) + } + + test("exclude navigation lists") { + val doc = + """|First Doc + |========= + |""".stripMargin + val template = + """|@:navigationTree { + | entries = [{ target = "/", excludeRoot = true }] + |} + | + |${cursor.currentDocument.content} + |""".stripMargin + val expected = + """|First Doc + | + |""".stripMargin + transformWithTemplate(doc, template).map { res => + assertEquals(res, expected) + } + } + + /** templates and raw content ****************************************************** */ + + test("exclude template content except the nodes merged from the associated markup files") { + val doc = + """|First Doc + |========= + |""".stripMargin + val template = + """| + | + |${cursor.currentDocument.content} + | + | + |""".stripMargin + val expected = + """|First Doc + | + |""".stripMargin + transformWithTemplate(doc, template).map { res => + assertEquals(res, expected) + } + } + + test("extract text nodes in verbatim HTML") { + val doc = + """|
Text Node
+ | + |Included Text + |""".stripMargin + val expected = + """|Text Node + |Included Text + |""".stripMargin + assertEquals(transformMarkdown(doc), Right(expected)) + } + + test("ignore comments - reStructuredText") { + val doc = + """|The Title + |========= + |The text + | + |.. This is a comment + |""".stripMargin + val expected = + """|The Title + |The text + | + | + |""".stripMargin + assertEquals(transformRST(doc), Right(expected)) + } + + test("ignore raw content - reStructuredText") { + val doc = + """|The Title + |========= + |The text + | + |.. raw:: format + | + | some input + | + | some more""".stripMargin + val expected = + """|The Title + |The text + | + | + |""".stripMargin + assertEquals(transformRST(doc), Right(expected)) } }