diff options
Diffstat (limited to 'dokka-subprojects/analysis-markdown-jb')
8 files changed, 888 insertions, 0 deletions
diff --git a/dokka-subprojects/analysis-markdown-jb/README.md b/dokka-subprojects/analysis-markdown-jb/README.md new file mode 100644 index 00000000..2922abc8 --- /dev/null +++ b/dokka-subprojects/analysis-markdown-jb/README.md @@ -0,0 +1,7 @@ +# Analysis: Markdown (JetBrains) + +An internal module that encapsulates Markdown file and format parsing by using `org.jetbrains:markdown` +as the primary implementation dependency. + +Used by other Dokka modules, but it must not be used by external users directly until a stable public API +is provided. diff --git a/dokka-subprojects/analysis-markdown-jb/api/analysis-markdown-jb.api b/dokka-subprojects/analysis-markdown-jb/api/analysis-markdown-jb.api new file mode 100644 index 00000000..3a8c37c5 --- /dev/null +++ b/dokka-subprojects/analysis-markdown-jb/api/analysis-markdown-jb.api @@ -0,0 +1,28 @@ +public final class org/jetbrains/dokka/analysis/markdown/jb/MarkdownApiKt { + public static final fun getMARKDOWN_ELEMENT_FILE_NAME ()Ljava/lang/String; +} + +public class org/jetbrains/dokka/analysis/markdown/jb/MarkdownParser : org/jetbrains/dokka/analysis/markdown/jb/Parser { + public static final field Companion Lorg/jetbrains/dokka/analysis/markdown/jb/MarkdownParser$Companion; + public fun <init> (Lkotlin/jvm/functions/Function1;Ljava/lang/String;)V + public fun parseStringToDocNode (Ljava/lang/String;)Lorg/jetbrains/dokka/model/doc/DocTag; + protected fun parseTagWithBody (Ljava/lang/String;Ljava/lang/String;)Lorg/jetbrains/dokka/model/doc/TagWrapper; + protected fun preparse (Ljava/lang/String;)Ljava/lang/String; +} + +public final class org/jetbrains/dokka/analysis/markdown/jb/MarkdownParser$Companion { + public final fun fqDeclarationName (Lorg/jetbrains/dokka/links/DRI;)Ljava/lang/String; +} + +public final class org/jetbrains/dokka/analysis/markdown/jb/ParseUtilsKt { + public static final fun parseHtmlEncodedWithNormalisedSpaces (Ljava/lang/String;Z)Ljava/util/List; +} + +public abstract class 
org/jetbrains/dokka/analysis/markdown/jb/Parser { + public fun <init> ()V + public fun parse (Ljava/lang/String;)Lorg/jetbrains/dokka/model/doc/DocumentationNode; + public abstract fun parseStringToDocNode (Ljava/lang/String;)Lorg/jetbrains/dokka/model/doc/DocTag; + protected fun parseTagWithBody (Ljava/lang/String;Ljava/lang/String;)Lorg/jetbrains/dokka/model/doc/TagWrapper; + protected abstract fun preparse (Ljava/lang/String;)Ljava/lang/String; +} + diff --git a/dokka-subprojects/analysis-markdown-jb/build.gradle.kts b/dokka-subprojects/analysis-markdown-jb/build.gradle.kts new file mode 100644 index 00000000..bb051bf2 --- /dev/null +++ b/dokka-subprojects/analysis-markdown-jb/build.gradle.kts @@ -0,0 +1,19 @@ +/* + * Copyright 2014-2023 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license. + */ + +import dokkabuild.overridePublicationArtifactId + +plugins { + id("dokkabuild.kotlin-jvm") + id("dokkabuild.publish-jvm") +} + +overridePublicationArtifactId("analysis-markdown") + +dependencies { + compileOnly(projects.dokkaSubprojects.dokkaCore) + + implementation(libs.jsoup) + implementation(libs.jetbrains.markdown) +} diff --git a/dokka-subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownApi.kt b/dokka-subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownApi.kt new file mode 100644 index 00000000..bc56b596 --- /dev/null +++ b/dokka-subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownApi.kt @@ -0,0 +1,12 @@ +/* + * Copyright 2014-2023 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license. + */ + +package org.jetbrains.dokka.analysis.markdown.jb + +import org.intellij.markdown.MarkdownElementTypes +import org.jetbrains.dokka.InternalDokkaApi + +// TODO [beresnev] move/rename if it's only used for CustomDocTag. 
// (MarkdownApi.kt) Left as is for compatibility, see the TODO above about moving/renaming it.
/**
 * Name of the [MarkdownElementTypes.MARKDOWN_FILE] element type, exposed as a plain string.
 */
@InternalDokkaApi
public val MARKDOWN_ELEMENT_FILE_NAME: String = MarkdownElementTypes.MARKDOWN_FILE.name

// (MarkdownParser.kt)
/**
 * Converts Markdown text into Dokka [DocTag]s.
 *
 * The text is parsed into an AST with the GFM flavour of `org.jetbrains:markdown`, and every AST node
 * is then mapped to doc tags by a dedicated handler (see [visitNode]).
 *
 * The parser keeps the text and the link definitions of the string that is currently being parsed
 * in mutable fields; both are overwritten on every [parseStringToDocNode] call.
 *
 * @param externalDri resolves a link/reference string to a [DRI], or returns `null` if it cannot be resolved
 * @param kdocLocation description of the place the documentation comes from; only used in error messages
 */
@InternalDokkaApi
public open class MarkdownParser(
    private val externalDri: (String) -> DRI?,
    private val kdocLocation: String?,
) : Parser() {

    /** Lower-cased link label -> link destination, collected from the link definitions of the current [text]. */
    private lateinit var destinationLinksMap: Map<String, String>

    /** The string that is currently being parsed; AST node offsets point into it. */
    private lateinit var text: String

    /**
     * Parses [extractedString] as GFM Markdown.
     *
     * @return the single resulting tag if the root produced exactly one,
     * otherwise a [CustomDocTag] with an empty name that wraps all of the resulting tags
     */
    override fun parseStringToDocNode(extractedString: String): DocTag {
        val gfmFlavourDescriptor = GFMFlavourDescriptor()
        val markdownAstRoot = IntellijMarkdownParser(gfmFlavourDescriptor).buildMarkdownTreeFromString(extractedString)
        destinationLinksMap = getAllDestinationLinks(extractedString, markdownAstRoot).toMap()
        text = extractedString

        val parsed = visitNode(markdownAstRoot)
        if (parsed.size == 1) {
            return parsed.first()
        }
        return CustomDocTag(children = parsed, params = emptyMap(), name = "")
    }

    /** Normalises line endings: `\r\n` and lone `\r` both become `\n`. */
    override fun preparse(text: String): String = text.replace("\r\n", "\n").replace("\r", "\n")

    /**
     * Same as [Parser.parseTagWithBody], but for `see`, `throws` and `exception` the referenced name
     * (everything before the first space) is additionally resolved through [externalDri].
     * If it resolves, the fully qualified declaration name is used as the tag's name when available.
     */
    override fun parseTagWithBody(tagName: String, content: String): TagWrapper =
        when (tagName) {
            "see" -> {
                val referencedName = content.substringBefore(' ')
                val dri = externalDri(referencedName)
                See(
                    parseStringToDocNode(content.substringAfter(' ')),
                    dri?.fqDeclarationName() ?: referencedName,
                    dri
                )
            }
            "throws", "exception" -> {
                val referencedName = content.substringBefore(' ')
                val dri = externalDri(referencedName)
                Throws(
                    parseStringToDocNode(content.substringAfter(' ')),
                    dri?.fqDeclarationName() ?: referencedName,
                    dri
                )
            }
            else -> super.parseTagWithBody(tagName, content)
        }

    /**
     * Handler for the `ATX_1`..`ATX_6` header elements: evaluates the header's
     * [MarkdownTokenTypes.ATX_CONTENT] child and uses the children of the resulting tags as header children.
     *
     * @throws IllegalStateException if the header node has no `ATX_CONTENT` child
     */
    private fun headersHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            visitNode(node.children.find { it.type == MarkdownTokenTypes.ATX_CONTENT }
                ?: throw detailedException("Wrong AST Tree. Header does not contain expected content", node)
            ).flatMap { it.children }
        )

    /**
     * Handler for [MarkdownTokenTypes.ATX_CONTENT], which is the content of the header
     * elements like [MarkdownElementTypes.ATX_1], [MarkdownElementTypes.ATX_2] and so on.
     *
     * For example, a header line like `# Header text` is expected to be parsed into:
     * - One [MarkdownTokenTypes.ATX_HEADER] with startOffset = 0, endOffset = 1 (only the `#` symbol)
     * - Composite [MarkdownTokenTypes.ATX_CONTENT] with four children: WHITE_SPACE, TEXT, WHITE_SPACE, TEXT.
     */
    private fun headerContentHandler(node: ASTNode): List<DocTag> {
        // ATX_CONTENT contains everything after the `#` symbol, so if there's a space
        // in-between the `#` symbol and the text (like `# header`), it will be present here too.
        // However, we don't need the leading space between the `#` symbol and the text, nor do we need trailing spaces,
        // so we just skip it (otherwise the header text will be parsed as `<whitespace>header` instead of `header`).
        // If there's more space between `#` and text, it will still be a single WHITE_SPACE
        // element, but it will be wider, so the solution below should still hold. The same applies to trailing spaces.
        val trimmedChildren = node.children.trimWhitespaceToken()

        val children = trimmedChildren.evaluateChildren()
        return DocTagsFromIElementFactory.getInstance(
            MarkdownElementTypes.PARAGRAPH, // PARAGRAPH instead of TEXT to preserve compatibility with prev. versions
            children = children
        )
    }

    /**
     * @return a sublist of [this] list that does not contain
     * leading and trailing [MarkdownTokenTypes.WHITE_SPACE] elements.
     * If the list has no non-whitespace elements at all, it is returned unchanged.
     */
    private fun List<ASTNode>.trimWhitespaceToken(): List<ASTNode> {
        val firstNonWhitespaceIndex = this.indexOfFirst { it.type != MarkdownTokenTypes.WHITE_SPACE }
        if (firstNonWhitespaceIndex == -1) {
            return this
        }
        val lastNonWhitespaceIndex = this.indexOfLast { it.type != MarkdownTokenTypes.WHITE_SPACE }

        return this.subList(firstNonWhitespaceIndex, lastNonWhitespaceIndex + 1)
    }

    private fun horizontalRulesHandler() =
        DocTagsFromIElementFactory.getInstance(MarkdownTokenTypes.HORIZONTAL_RULE)

    /** Emphasis: the first and the last child (the enclosing marker tokens) are dropped. */
    private fun emphasisHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            children = node.children.evaluateChildrenWithDroppedEnclosingTokens(1)
        )

    /** Strong emphasis: the first two and the last two children (the enclosing marker tokens) are dropped. */
    private fun strongHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            children = node.children.evaluateChildrenWithDroppedEnclosingTokens(2)
        )

    /** Evaluates the children after dropping [count] elements from both ends of the list. */
    private fun List<ASTNode>.evaluateChildrenWithDroppedEnclosingTokens(count: Int) =
        drop(count).dropLast(count).evaluateChildren()

    /** Block quote: only composite children are evaluated, leaf tokens are skipped. */
    private fun blockquotesHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type, children = node.children
                .filterIsInstance<CompositeASTNode>()
                .evaluateChildren()
        )

    /** Element types that denote a list; used to detect a nested list at the end of a list item. */
    private val listElementTypes = listOf(
        MarkdownElementTypes.ORDERED_LIST,
        MarkdownElementTypes.UNORDERED_LIST
    )

    /**
     * Handler for ordered and unordered lists.
     *
     * If the last child of a list item is itself a list, it is detached from the item and emitted as
     * a sibling right after it. For ordered lists the `start` parameter is taken from the text of the
     * very first token of the first child, trimmed and without its last character.
     */
    private fun listsHandler(node: ASTNode): List<DocTag> {

        val children = node.children.filterIsInstance<ListItemCompositeNode>().flatMap {
            if (it.children.last().type in listElementTypes) {
                val nestedList = it.children.last()
                // NOTE: this mutates the parsed AST in place, the nested list is removed from its list item
                (it.children as MutableList).removeAt(it.children.lastIndex)
                listOf(it, nestedList)
            } else
                listOf(it)
        }

        return DocTagsFromIElementFactory.getInstance(
            node.type,
            children =
            children
                .flatMap {
                    if (it.type == MarkdownElementTypes.LIST_ITEM)
                        DocTagsFromIElementFactory.getInstance(
                            it.type,
                            children = it
                                .children
                                .filterIsInstance<CompositeASTNode>()
                                .evaluateChildren()
                        )
                    else
                        visitNode(it)
                },
            params =
            if (node.type == MarkdownElementTypes.ORDERED_LIST) {
                val listNumberNode = node.children.first().children.first()
                mapOf(
                    "start" to text.substring(
                        listNumberNode.startOffset,
                        listNumberNode.endOffset
                    ).trim().dropLast(1)
                )
            } else
                emptyMap()
        )
    }

    /**
     * Resolves a Markdown link target to a [DRI].
     *
     * @return `null` if [mdLink] (without the surrounding square brackets) is a well-formed URL,
     * otherwise whatever [externalDri] returns for it
     */
    private fun resolveDRI(mdLink: String): DRI? =
        mdLink
            .removePrefix("[")
            .removeSuffix("]")
            .let { link ->
                try {
                    URL(link)
                    null
                } catch (e: MalformedURLException) {
                    externalDri(link)
                }
            }

    /**
     * Recursively collects all link definitions under [node] as pairs of
     * lower-cased text of the definition's first child to the text of its third child.
     */
    private fun getAllDestinationLinks(text: String, node: ASTNode): List<Pair<String, String>> =
        node.children
            .filter { it.type == MarkdownElementTypes.LINK_DEFINITION }
            .map {
                text.substring(it.children[0].startOffset, it.children[0].endOffset).toLowerCase() to
                        text.substring(it.children[2].startOffset, it.children[2].endOffset)
            } +
                node.children.filterIsInstance<CompositeASTNode>().flatMap { getAllDestinationLinks(text, it) }


    /**
     * Handler for full and short reference links. The destination is looked up in [destinationLinksMap]
     * by the lower-cased label; if there is no such definition, the label itself is used as the link.
     *
     * @throws IllegalStateException if the node has no link label
     */
    private fun referenceLinksHandler(node: ASTNode): List<DocTag> {
        val linkLabel = node.children.find { it.type == MarkdownElementTypes.LINK_LABEL }
            ?: throw detailedException("Wrong AST Tree. Reference link does not contain link label", node)
        val linkText = node.children.findLast { it.type == MarkdownElementTypes.LINK_TEXT } ?: linkLabel

        val linkKey = text.substring(linkLabel.startOffset, linkLabel.endOffset)

        val link = destinationLinksMap[linkKey.toLowerCase()] ?: linkKey

        return linksHandler(linkText, link)
    }

    /**
     * Handler for inline links.
     *
     * @throws IllegalStateException if the node has no link text
     */
    private fun inlineLinksHandler(node: ASTNode): List<DocTag> {
        val linkText = node.children.find { it.type == MarkdownElementTypes.LINK_TEXT }
            ?: throw detailedException("Wrong AST Tree. Inline link does not contain link text", node)
        val linkDestination = node.children.find { it.type == MarkdownElementTypes.LINK_DESTINATION }
        val linkTitle = node.children.find { it.type == MarkdownElementTypes.LINK_TITLE }

        // Link destination may be omitted: https://github.github.com/gfm/#example-495
        val link = linkDestination?.let { text.substring(it.startOffset, it.endOffset) }

        return linksHandler(linkText, link, linkTitle)
    }

    /** Root of the document: whitespace and end-of-line tokens are skipped, the rest is evaluated. */
    private fun markdownFileHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            children = node.children
                .filterSpacesAndEOL()
                .evaluateChildren()
        )

    /** Autolink: the link is the node's text without its first and last character. */
    private fun autoLinksHandler(node: ASTNode): List<DocTag> {
        val link = text.substring(node.startOffset + 1, node.endOffset - 1)

        return linksHandler(node, link)
    }

    /**
     * Common part of all link handlers.
     *
     * A link that is present, cannot be resolved to a [DRI] and is not a well-formed URL is passed to
     * the factory as a text element; everything else is passed as an inline link (with the [DRI], if resolved).
     *
     * @param linkText node with the visible text; its first and last child are dropped before evaluation
     * @param link link destination, `null` if it is omitted
     * @param linkTitle optional title node; its text without the first and last character becomes the `title` param
     */
    private fun linksHandler(linkText: ASTNode, link: String?, linkTitle: ASTNode? = null): List<DocTag> {
        val dri: DRI? = link?.let { resolveDRI(it) }
        val linkOrEmpty = link ?: ""
        val linkTextString =
            if (linkTitle == null) linkOrEmpty else text.substring(linkTitle.startOffset + 1, linkTitle.endOffset - 1)

        val params = if (linkTitle == null)
            mapOf("href" to linkOrEmpty)
        else
            mapOf("href" to linkOrEmpty, "title" to linkTextString)

        return if (link != null && dri == null && !linkOrEmpty.isRemoteLink()) {
            DocTagsFromIElementFactory.getInstance(
                MarkdownTokenTypes.TEXT,
                params = params,
                children = linkText.children.drop(1).dropLast(1).evaluateChildren(),
                body = linkTextString.removeSurrounding("[", "]")
            )
        } else {
            DocTagsFromIElementFactory.getInstance(
                MarkdownElementTypes.INLINE_LINK,
                params = params,
                children = linkText.children.drop(1).dropLast(1).evaluateChildren(),
                dri = dri
            )
        }
    }

    private fun codeLineHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
        MarkdownElementTypes.CODE_BLOCK,
        body = text.substring(node.startOffset, node.endOffset)
    )

    /** Plain text: the node's text goes through [transform] before being handed to the factory. */
    private fun textHandler(node: ASTNode, keepAllFormatting: Boolean) = DocTagsFromIElementFactory.getInstance(
        MarkdownTokenTypes.TEXT,
        body = text.substring(node.startOffset, node.endOffset).transform(),
        keepFormatting = keepAllFormatting
    )

    /** Strikethrough: the first two and the last two children (the enclosing marker tokens) are dropped. */
    private fun strikeThroughHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
        node.type,
        children = node.children.evaluateChildrenWithDroppedEnclosingTokens(2)
    )

    /** Table: only header and row children are evaluated. */
    private fun tableHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
        GFMElementTypes.TABLE,
        children = node.children
            .filter { it.type == GFMElementTypes.ROW || it.type == GFMElementTypes.HEADER }
            .evaluateChildren()
    )

    /** Table header: only cell children are evaluated. */
    private fun headerHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
        GFMElementTypes.HEADER,
        children = node.children
            .filter { it.type == GFMTokenTypes.CELL }
            .evaluateChildren()
    )

    /** Table row: only cell children are evaluated. */
    private fun rowHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
        GFMElementTypes.ROW,
        children = node.children
            .filter { it.type == GFMTokenTypes.CELL }
            .evaluateChildren()
    )

    /** Table cell: table separators are skipped and surrounding whitespace of the cell text is trimmed. */
    private fun cellHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
        GFMTokenTypes.CELL,
        children = node.children.filterTabSeparators().evaluateChildren().trimSurroundingTokensIfText()
    )

    /** @return `true` if this string can be parsed by [URL], `false` otherwise */
    private fun String.isRemoteLink() = try {
        URL(this)
        true
    } catch (e: MalformedURLException) {
        false
    }

    /**
     * Image: `href` and `alt` params are taken from the link destination and the link text
     * (without its first and last character) found among the children of the node's last child.
     */
    private fun imagesHandler(node: ASTNode): List<DocTag> =
        with(node.children.last().children) {
            val destination = find { it.type == MarkdownElementTypes.LINK_DESTINATION }
            val description = find { it.type == MarkdownElementTypes.LINK_TEXT }

            val src = destination?.let {
                mapOf("href" to text.substring(it.startOffset, it.endOffset))
            } ?: emptyMap()

            val alt = description?.let {
                mapOf("alt" to text.substring(it.startOffset + 1, it.endOffset - 1))
            } ?: emptyMap()

            return DocTagsFromIElementFactory.getInstance(
                node.type,
                params = src + alt
            )
        }


    /** Raw HTML: the node's text is passed to the factory as is. */
    private fun rawHtmlHandler(node: ASTNode): List<DocTag> =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            body = text.substring(node.startOffset, node.endOffset)
        )

    /**
     * Code span: the node's text without its first and last character, with line breaks replaced by
     * spaces, wrapped into a single text child that keeps its formatting.
     */
    private fun codeSpansHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            children = DocTagsFromIElementFactory.getInstance(
                MarkdownTokenTypes.TEXT,
                body = text.substring(node.startOffset + 1, node.endOffset - 1).replace('\n', ' ').trimIndent(),
                keepFormatting = true
            )
        )

    /**
     * Fenced code block: everything before the first and after the last `CODE_FENCE_CONTENT` token is
     * dropped, whitespace tokens are skipped and end-of-line tokens are turned into hard line breaks.
     * The fence language, if present, goes into the `lang` param.
     */
    private fun codeFencesHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            node.type,
            children = node
                .children
                .dropWhile { it.type != MarkdownTokenTypes.CODE_FENCE_CONTENT }
                .dropLastWhile { it.type != MarkdownTokenTypes.CODE_FENCE_CONTENT }
                .filter { it.type != MarkdownTokenTypes.WHITE_SPACE }
                .map {
                    if (it.type == MarkdownTokenTypes.EOL)
                        LeafASTNode(MarkdownTokenTypes.HARD_LINE_BREAK, 0, 0)
                    else
                        it
                }.evaluateChildren(keepAllFormatting = true),
            params = node
                .children
                .find { it.type == MarkdownTokenTypes.FENCE_LANG }
                ?.let { mapOf("lang" to text.substring(it.startOffset, it.endOffset)) }
                ?: emptyMap()
        )

    /** Indented code block: adjacent leaves are merged and their text is passed through `HtmlGenerator.trimIndents(…, 4)`. */
    private fun codeBlocksHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(node.type, children = node.children.mergeLeafASTNodes().flatMap {
            DocTagsFromIElementFactory.getInstance(
                MarkdownTokenTypes.TEXT,
                body = HtmlGenerator.trimIndents(text.substring(it.startOffset, it.endOffset), 4).toString()
            )
        })

    /** Fallback for node types without a dedicated handler: the children are wrapped into a paragraph. */
    private fun defaultHandler(node: ASTNode) =
        DocTagsFromIElementFactory.getInstance(
            MarkdownElementTypes.PARAGRAPH,
            children = node.children.evaluateChildren()
        )

    /**
     * Dispatches [node] to the handler that corresponds to its type.
     *
     * @param keepAllFormatting forwarded to [textHandler]; only affects plain text nodes
     */
    private fun visitNode(node: ASTNode, keepAllFormatting: Boolean = false): List<DocTag> =
        when (node.type) {
            MarkdownElementTypes.ATX_1,
            MarkdownElementTypes.ATX_2,
            MarkdownElementTypes.ATX_3,
            MarkdownElementTypes.ATX_4,
            MarkdownElementTypes.ATX_5,
            MarkdownElementTypes.ATX_6,
            -> headersHandler(node)
            MarkdownTokenTypes.ATX_CONTENT -> headerContentHandler(node)
            MarkdownTokenTypes.HORIZONTAL_RULE -> horizontalRulesHandler()
            MarkdownElementTypes.STRONG -> strongHandler(node)
            MarkdownElementTypes.EMPH -> emphasisHandler(node)
            MarkdownElementTypes.FULL_REFERENCE_LINK,
            MarkdownElementTypes.SHORT_REFERENCE_LINK,
            -> referenceLinksHandler(node)
            MarkdownElementTypes.INLINE_LINK -> inlineLinksHandler(node)
            MarkdownElementTypes.AUTOLINK -> autoLinksHandler(node)
            MarkdownElementTypes.BLOCK_QUOTE -> blockquotesHandler(node)
            MarkdownElementTypes.UNORDERED_LIST,
            MarkdownElementTypes.ORDERED_LIST,
            -> listsHandler(node)
            MarkdownElementTypes.CODE_BLOCK -> codeBlocksHandler(node)
            MarkdownElementTypes.CODE_FENCE -> codeFencesHandler(node)
            MarkdownElementTypes.CODE_SPAN -> codeSpansHandler(node)
            MarkdownElementTypes.IMAGE -> imagesHandler(node)
            MarkdownElementTypes.HTML_BLOCK,
            MarkdownTokenTypes.HTML_TAG,
            MarkdownTokenTypes.HTML_BLOCK_CONTENT,
            -> rawHtmlHandler(node)
            MarkdownTokenTypes.HARD_LINE_BREAK -> DocTagsFromIElementFactory.getInstance(node.type)
            MarkdownTokenTypes.CODE_FENCE_CONTENT,
            MarkdownTokenTypes.CODE_LINE,
            -> codeLineHandler(node)
            MarkdownTokenTypes.TEXT -> textHandler(node, keepAllFormatting)
            MarkdownElementTypes.MARKDOWN_FILE -> markdownFileHandler(node)
            GFMElementTypes.STRIKETHROUGH -> strikeThroughHandler(node)
            GFMElementTypes.TABLE -> tableHandler(node)
            GFMElementTypes.HEADER -> headerHandler(node)
            GFMElementTypes.ROW -> rowHandler(node)
            GFMTokenTypes.CELL -> cellHandler(node)
            else -> defaultHandler(node)
        }

    private fun List<ASTNode>.filterTabSeparators() =
        this.filterNot { it.type == GFMTokenTypes.TABLE_SEPARATOR }

    private fun List<ASTNode>.filterSpacesAndEOL() =
        this.filterNot { it.type == MarkdownTokenTypes.WHITE_SPACE || it.type == MarkdownTokenTypes.EOL }

    /**
     * Converts a list of sibling nodes into doc tags: useless tokens are removed, images that are
     * actually links are unwrapped, adjacent leaves are merged, and every remaining node is visited.
     */
    private fun List<ASTNode>.evaluateChildren(keepAllFormatting: Boolean = false): List<DocTag> =
        this.removeUselessTokens()
            .swapImagesThatShouldBeLinks(keepAllFormatting)
            .mergeLeafASTNodes()
            .flatMap { visitNode(it, keepAllFormatting) }

    /**
     * Replaces an image node whose first child is a `!` leaf and whose last child is a short reference
     * link with its children, so that they are processed as separate nodes.
     * Nothing is replaced when [keepAllFormatting] is set.
     */
    private fun List<ASTNode>.swapImagesThatShouldBeLinks(keepAllFormatting: Boolean): List<ASTNode> =
        if (keepAllFormatting) {
            this
        } else {
            flatMap { node ->
                if (node.type == MarkdownElementTypes.IMAGE
                    && node.children.firstOrNull()?.let { it is LeafASTNode && it.type.name == "!" } == true
                    && node.children.lastOrNull()?.type == MarkdownElementTypes.SHORT_REFERENCE_LINK
                ) {
                    node.children
                } else {
                    listOf(node)
                }
            }
        }

    /** Removes link definitions and end-of-line tokens that directly follow a hard line break. */
    private fun List<ASTNode>.removeUselessTokens(): List<ASTNode> =
        this.filterIndexed { index, node ->
            !(node.type == MarkdownElementTypes.LINK_DEFINITION || (
                    node.type == MarkdownTokenTypes.EOL &&
                            this.getOrNull(index - 1)?.type == MarkdownTokenTypes.HARD_LINE_BREAK
                    ))
        }

    /** Trims the start of the first element and the end of the last element, if they are [Text]. */
    private fun List<DocTag>.trimSurroundingTokensIfText() = mapIndexed { index, elem ->
        val elemTransformed = if (index == 0 && elem is Text) elem.copy(elem.body.trimStart()) else elem
        if (index == lastIndex && elemTransformed is Text) elemTransformed.copy(elemTransformed.body.trimEnd()) else elemTransformed
    }

    /** Token types that must never be merged with neighbouring leaves, even though they are leaves themselves. */
    private val notLeafNodes = listOf(
        MarkdownTokenTypes.HORIZONTAL_RULE,
        MarkdownTokenTypes.HARD_LINE_BREAK,
        MarkdownTokenTypes.HTML_TAG,
        MarkdownTokenTypes.HTML_BLOCK_CONTENT
    )

    private fun ASTNode.isNotLeaf() = this is CompositeASTNode || this.type in notLeafNodes

    /** @return `false` for an index outside of the list, otherwise [isNotLeaf] of the element at [index] */
    private fun List<ASTNode>.isNotLeaf(index: Int): Boolean =
        if (index in 0..this.lastIndex)
            this[index].isNotLeaf()
        else
            false

    /**
     * Merges every run of adjacent mergeable leaves into a single leaf node that spans the whole run.
     * A run ends at a node for which [isNotLeaf] is `true`, or where the next node does not start
     * exactly at the end offset of the current one. Non-mergeable nodes are kept as is.
     */
    private fun List<ASTNode>.mergeLeafASTNodes(): List<ASTNode> {
        val children: MutableList<ASTNode> = mutableListOf()
        var index = 0
        while (index <= this.lastIndex) {
            if (this.isNotLeaf(index)) {
                children += this[index]
            } else {
                val startOffset = this[index].startOffset
                val sIndex = index
                while (index < this.lastIndex) {
                    if (this.isNotLeaf(index + 1) || this[index + 1].startOffset != this[index].endOffset) {
                        children += mergedLeafNode(this, index, startOffset, sIndex)
                        break
                    }
                    index++
                }
                // the run reached the end of the list without being closed by the inner loop
                if (index == this.lastIndex) {
                    children += mergedLeafNode(this, index, startOffset, sIndex)
                }
            }
            index++
        }
        return children
    }

    /**
     * Creates the leaf that replaces the run of nodes from [sIndex] to [index]; it starts at
     * [startOffset] and ends where the node at [index] ends.
     */
    private fun mergedLeafNode(nodes: List<ASTNode>, index: Int, startOffset: Int, sIndex: Int): LeafASTNode {
        val endOffset = nodes[index].endOffset
        // NOTE(review): subList() is end-exclusive, so the node at `index` (the last node of the run, and
        // the only one when sIndex == index) does not take part in the CODE_LINE check - confirm it's intended
        val type = if (nodes.subList(sIndex, index)
                .any { it.type == MarkdownTokenTypes.CODE_LINE }
        ) MarkdownTokenTypes.CODE_LINE else MarkdownTokenTypes.TEXT
        return LeafASTNode(type, startOffset, endOffset)
    }

    /**
     * Normalises line breaks of a plain text node. The regular expressions are compiled once
     * in the companion object instead of on every call, since this runs for every text node.
     */
    private fun String.transform() = this
        .replace(MULTIPLE_NEW_LINES_REGEX, "") // Squashing new lines between paragraphs
        .replace("\n", " ")
        .replace(BLOCKQUOTE_MARKERS_REGEX, " ") // Replacement used in blockquotes, get rid of garbage

    /**
     * Creates an exception whose message contains [baseMessage], [kdocLocation] (if known),
     * the offsets of [node] and the text it covers.
     */
    private fun detailedException(baseMessage: String, node: ASTNode) =
        IllegalStateException(
            baseMessage + " in ${kdocLocation ?: "unspecified location"}, element starts from offset ${node.startOffset} and ends ${node.endOffset}: ${
                text.substring(
                    node.startOffset,
                    node.endOffset
                )
            }"
        )


    public companion object {
        /** Two or more consecutive line breaks. */
        private val MULTIPLE_NEW_LINES_REGEX = Regex("\n\n+")

        /** A space, one or more `>` symbols and one or more spaces. */
        private val BLOCKQUOTE_MARKERS_REGEX = Regex(" >+ +")

        /**
         * @return package name, class names and callable name of this [DRI] joined with dots;
         * `null` if the [DRI] does not point to a declaration or if the resulting name is blank
         */
        public fun DRI.fqDeclarationName(): String? {
            if (this.target !is PointingToDeclaration) {
                return null
            }
            return listOfNotNull(this.packageName, this.classNames, this.callable?.name)
                .joinToString(separator = ".")
                .takeIf { it.isNotBlank() }
        }
    }
}
/**
 * Splits this string into [Text] doc tags, code point by code point.
 *
 * - When [renderWhiteCharactersAsSpaces] is set, a whitespace code point is appended as a single
 *   space, unless the previously appended character was such a space already.
 * - A code point that `Entities.escape` would change is emitted as a separate [Text] with the
 *   `html` content type; the text accumulated before it is emitted first, unless it is blank.
 * - Any other code point is accumulated, unless jsoup considers it an invisible character.
 *
 * Accumulated text that is blank is never emitted.
 */
@InternalDokkaApi
public fun String.parseHtmlEncodedWithNormalisedSpaces(
    renderWhiteCharactersAsSpaces: Boolean
): List<DocTag> {
    val result = mutableListOf<DocTag>()
    val pending = StringBuilder()
    var previousWasWhitespace = false

    // emits the accumulated plain text (if it's not blank) and resets the buffer
    fun flushPending() {
        val plainText = pending.toString()
        if (plainText.isNotBlank()) {
            result.add(Text(plainText))
        }
        pending.setLength(0)
    }

    forEachCodePoint { codePoint ->
        when {
            renderWhiteCharactersAsSpaces && StringUtil.isWhitespace(codePoint) -> {
                if (!previousWasWhitespace) {
                    pending.append(' ')
                    previousWasWhitespace = true
                }
            }

            Compat.codePointToString(codePoint).let { it != Entities.escape(it) } -> {
                flushPending()

                // the buffer is empty here, so it's reused to build the single-code-point string
                pending.appendCodePoint(codePoint)
                result.add(Text(pending.toString(), params = DocTag.contentTypeParam("html")))
                pending.setLength(0)
            }

            !StringUtil.isInvisibleChar(codePoint) -> {
                pending.appendCodePoint(codePoint)
                previousWasWhitespace = false
            }
        }
    }

    flushPending()
    return result
}
// (Parser.kt)
/**
 * Base class for documentation parsers: splits a documentation text into tag sections
 * and converts every section into a [TagWrapper].
 */
@InternalDokkaApi
public abstract class Parser {

    /** Parses the body of a single tag (or any other piece of documentation text) into a [DocTag]. */
    public abstract fun parseStringToDocNode(extractedString: String): DocTag

    /** Pre-processes the whole documentation text before it is split into tag sections. */
    protected abstract fun preparse(text: String): String

    /**
     * Parses the whole documentation [text]. Everything before the first tag becomes the `description`
     * section; every section is converted with [parseTagWithBody].
     */
    public open fun parse(text: String): DocumentationNode =
        DocumentationNode(extractTagsToListOfPairs(preparse(text)).map { (tag, content) -> parseTagWithBody(tag, content) })

    /**
     * Converts a single section into a [TagWrapper].
     *
     * For `see`, `param`, `property`, `throws`, `exception` and `sample` everything before the first
     * space of [content] is the name of the referenced element and the rest is the tag's body.
     * Unknown tags are wrapped into a [CustomTagWrapper].
     *
     * @param tagName name of the tag without the leading `@`
     * @param content everything that follows the tag name
     */
    protected open fun parseTagWithBody(tagName: String, content: String): TagWrapper =
        when (tagName) {
            "description" -> Description(parseStringToDocNode(content))
            "author" -> Author(parseStringToDocNode(content))
            "version" -> Version(parseStringToDocNode(content))
            "since" -> Since(parseStringToDocNode(content))
            "see" -> See(
                parseStringToDocNode(content.substringAfter(' ')),
                content.substringBefore(' '),
                null
            )
            "param" -> Param(
                parseStringToDocNode(content.substringAfter(' ')),
                content.substringBefore(' ')
            )
            "property" -> Property(
                parseStringToDocNode(content.substringAfter(' ')),
                content.substringBefore(' ')
            )
            "return" -> Return(parseStringToDocNode(content))
            "constructor" -> Constructor(parseStringToDocNode(content))
            "receiver" -> Receiver(parseStringToDocNode(content))
            "throws", "exception" -> Throws(
                parseStringToDocNode(content.substringAfter(' ')),
                content.substringBefore(' '),
                null
            )
            "deprecated" -> Deprecated(parseStringToDocNode(content))
            "sample" -> Sample(
                parseStringToDocNode(content.substringAfter(' ')),
                content.substringBefore(' ')
            )
            "suppress" -> Suppress(parseStringToDocNode(content))
            else -> CustomTagWrapper(parseStringToDocNode(content), tagName)
        }

    /**
     * KDoc parser from Kotlin compiler relies on a comment asterisk
     * So there is a mini parser here
     * TODO: at least to adapt [org.jetbrains.kotlin.kdoc.lexer.KDocLexer] to analyze KDoc without the asterisks and use it here
     *
     * @return pairs of tag name to tag content; the tag name is everything before the first space
     * of a section, the content is everything after it (an empty string if there is no space)
     */
    private fun extractTagsToListOfPairs(text: String): List<Pair<String, String>> =
        "description $text"
            .extractKDocSections()
            .map { content ->
                val contentWithUnescapedAts = content.replace("\\@", "@")
                // A section can be a bare tag name without any space after it (e.g. a trailing `@suppress`).
                // split() returns a single element then, so the body must default to an empty string
                // instead of being destructured, which would throw IndexOutOfBoundsException.
                val tagAndBody = contentWithUnescapedAts.split(" ", limit = 2)
                tagAndBody[0] to tagAndBody.getOrElse(1) { "" }
            }

    /**
     * Splits the text into sections by [delimiter], ignoring a doc tag inside code spans and blocks
     * @see org.jetbrains.kotlin.kdoc.psi.impl.KDocSection
     */
    private fun CharSequence.extractKDocSections(delimiter: String = "\n@"): List<String> {
        // lengths of the runs of backticks/tildes that end right before the current character
        var countOfBackticks = 0
        var countOfTildes = 0
        // lengths of the fence that opened the current code; Int.MAX_VALUE for the kind that did not open it
        var countOfBackticksInOpeningFence = 0
        var countOfTildesInOpeningFence = 0

        var isInCode = false
        val result = mutableListOf<String>()
        var rangeStart = 0
        var rangeEnd = 0
        var currentOffset = 0
        while (currentOffset < length) {

            when (get(currentOffset)) {
                '`' -> {
                    countOfBackticks++
                    countOfTildes = 0
                }
                '~' -> {
                    countOfTildes++
                    countOfBackticks = 0
                }
                else -> {
                    if (isInCode) {
                        // The closing code fence must be at least as long as the opening fence
                        if(countOfBackticks >= countOfBackticksInOpeningFence
                            || countOfTildes >= countOfTildesInOpeningFence)
                            isInCode = false
                    } else {
                        // as per CommonMark spec, there can be any number of backticks for a code span, not only one or three
                        if (countOfBackticks > 0) {
                            isInCode = true
                            countOfBackticksInOpeningFence = countOfBackticks
                            countOfTildesInOpeningFence = Int.MAX_VALUE
                        }
                        // tildes are only for a code block, not code span
                        if (countOfTildes >= 3) {
                            isInCode = true
                            countOfTildesInOpeningFence = countOfTildes
                            countOfBackticksInOpeningFence = Int.MAX_VALUE
                        }
                    }
                    countOfTildes = 0
                    countOfBackticks = 0
                }
            }
            if (!isInCode && startsWith(delimiter, currentOffset)) {
                // the delimiter itself belongs to neither of the two sections it separates
                result.add(substring(rangeStart, rangeEnd))
                currentOffset += delimiter.length
                rangeStart = currentOffset
                rangeEnd = currentOffset
                continue
            }

            ++rangeEnd
            ++currentOffset
        }
        result.add(substring(rangeStart, rangeEnd))
        return result
    }

}

// (factories/DocTagsFromIElementFactory.kt)
/**
 * Creates Dokka [DocTag]s that correspond to element types of the Markdown AST.
 */
internal object DocTagsFromIElementFactory {

    /**
     * Creates doc tags for the given Markdown element [type].
     *
     * @param children already converted children of the element
     * @param params parameters of the resulting tag
     * @param body text of the element; only used for text and raw HTML elements
     * @param dri resolved link target; only used for link elements, which become
     * a [DocumentationLink] if it is not `null` and a plain [A] otherwise
     * @param keepFormatting if set, the [body] of a text element is used as is
     * @return a list of tags: text elements can produce any number of tags, all the others produce exactly one
     */
    @Suppress("IMPLICIT_CAST_TO_ANY")
    fun getInstance(type: IElementType, children: List<DocTag> = emptyList(), params: Map<String, String> = emptyMap(), body: String? = null, dri: DRI? = null, keepFormatting: Boolean = false) =
        when(type) {
            MarkdownElementTypes.SHORT_REFERENCE_LINK,
            MarkdownElementTypes.FULL_REFERENCE_LINK,
            MarkdownElementTypes.INLINE_LINK            -> if(dri == null) A(children, params) else DocumentationLink(dri, children, params)
            MarkdownElementTypes.STRONG                 -> B(children, params)
            MarkdownElementTypes.BLOCK_QUOTE            -> BlockQuote(children, params)
            MarkdownElementTypes.CODE_SPAN              -> CodeInline(children, params)
            MarkdownElementTypes.CODE_BLOCK,
            MarkdownElementTypes.CODE_FENCE             -> CodeBlock(children, params)
            MarkdownElementTypes.ATX_1                  -> H1(children, params)
            MarkdownElementTypes.ATX_2                  -> H2(children, params)
            MarkdownElementTypes.ATX_3                  -> H3(children, params)
            MarkdownElementTypes.ATX_4                  -> H4(children, params)
            MarkdownElementTypes.ATX_5                  -> H5(children, params)
            MarkdownElementTypes.ATX_6                  -> H6(children, params)
            MarkdownElementTypes.EMPH                   -> I(children, params)
            MarkdownElementTypes.IMAGE                  -> Img(children, params)
            MarkdownElementTypes.LIST_ITEM              -> Li(children, params)
            MarkdownElementTypes.ORDERED_LIST           -> Ol(children, params)
            MarkdownElementTypes.UNORDERED_LIST         -> Ul(children, params)
            MarkdownElementTypes.PARAGRAPH              -> P(children, params)
            MarkdownTokenTypes.TEXT                     -> if (keepFormatting) Text(
                body.orEmpty(),
                children,
                params
            ) else {
                // corner case: there are only spaces between two Markdown nodes
                val containsOnlySpaces = body?.isNotEmpty() == true && body.all { it.isWhitespace() }
                if (containsOnlySpaces) Text(" ", children, params)
                else body?.parseWithNormalisedSpaces(renderWhiteCharactersAsSpaces = false).orEmpty()
            }
            MarkdownTokenTypes.HORIZONTAL_RULE          -> HorizontalRule
            MarkdownTokenTypes.HARD_LINE_BREAK          -> Br
            GFMElementTypes.STRIKETHROUGH               -> Strikethrough(children, params)
            GFMElementTypes.TABLE                       -> Table(children, params)
            GFMElementTypes.HEADER                      -> Th(children, params)
            GFMElementTypes.ROW                         -> Tr(children, params)
            GFMTokenTypes.CELL                          -> Td(children, params)
            MarkdownElementTypes.MARKDOWN_FILE          -> CustomDocTag(children, params, MARKDOWN_ELEMENT_FILE_NAME)
            MarkdownElementTypes.HTML_BLOCK,
            MarkdownTokenTypes.HTML_TAG,
            MarkdownTokenTypes.HTML_BLOCK_CONTENT       -> Text(body.orEmpty(), params = params + contentTypeParam("html"))
            else                                        -> CustomDocTag(children, params, type.name)
        }.let {
            // the text branch yields a list of tags, all the other branches yield a single tag
            @Suppress("UNCHECKED_CAST")
            when (it) {
                is List<*> -> it as List<DocTag>
                else -> listOf(it as DocTag)
            }
        }

    /**
     * Parses string into [Text] doc tags that can have either value of the string or html-encoded value with content-type=html parameter.
     * Content type is added when dealing with html entries like `&nbsp;`
     */
    private fun String.parseWithNormalisedSpaces(
        renderWhiteCharactersAsSpaces: Boolean
    ): List<DocTag> {
        if (!requiresHtmlEncoding()) {
            return parseHtmlEncodedWithNormalisedSpaces(renderWhiteCharactersAsSpaces)
        }
        // parsing it using jsoup is required to get codePoints, otherwise they are interpreted separately, as chars
        // But we don't need to do it for java as it is already parsed with jsoup
        return Jsoup.parseBodyFragment(this).body().wholeText().parseHtmlEncodedWithNormalisedSpaces(renderWhiteCharactersAsSpaces)
    }

    /** @return `true` if the string contains an `&`, i.e. it may contain an HTML entity */
    private fun String.requiresHtmlEncoding(): Boolean = indexOf('&') != -1
}