Decompose Kotlin/Java analysis (#3034)

* Extract analysis into separate modules
author: Ignat Beresnev <ignat.beresnev@jetbrains.com> 2023-07-05 10:04:55 +0200
committer: GitHub <noreply@github.com> 2023-07-05 10:04:55 +0200
commit: 9559158bfeeb274e9ccf1b4563f1b23b42afc493 (patch)
tree: 3ece0887623cfe2b7148af23001867a1dd5e6597 /subprojects/analysis-markdown-jb/src
parent: cbd9733d3dd2f52992e98e7cebd072091a572529 (diff)
download: dokka-9559158bfeeb274e9ccf1b4563f1b23b42afc493.tar.gz
dokka-9559158bfeeb274e9ccf1b4563f1b23b42afc493.tar.bz2
dokka-9559158bfeeb274e9ccf1b4563f1b23b42afc493.zip
5 files changed, 775 insertions, 0 deletions
diff --git a/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownApi.kt b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownApi.kt
new file mode 100644
index 00000000..e8738190
--- /dev/null
+++ b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownApi.kt
@@ -0,0 +1,8 @@
+package org.jetbrains.dokka.analysis.markdown.jb
+
+import org.intellij.markdown.MarkdownElementTypes
+import org.jetbrains.dokka.InternalDokkaApi
+
+// TODO [beresnev] move/rename if it's only used for CustomDocTag. for now left as is for compatibility
+@InternalDokkaApi
+val MARKDOWN_ELEMENT_FILE_NAME = MarkdownElementTypes.MARKDOWN_FILE.name
diff --git a/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownParser.kt b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownParser.kt
new file mode 100644
index 00000000..165ca4c6
--- /dev/null
+++ b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/MarkdownParser.kt
@@ -0,0 +1,511 @@
+package org.jetbrains.dokka.analysis.markdown.jb
+
+import org.intellij.markdown.MarkdownElementTypes
+import org.intellij.markdown.MarkdownTokenTypes
+import org.intellij.markdown.ast.ASTNode
+import org.intellij.markdown.ast.CompositeASTNode
+import org.intellij.markdown.ast.LeafASTNode
+import org.intellij.markdown.ast.impl.ListItemCompositeNode
+import org.intellij.markdown.flavours.gfm.GFMElementTypes
+import org.intellij.markdown.flavours.gfm.GFMFlavourDescriptor
+import org.intellij.markdown.flavours.gfm.GFMTokenTypes
+import org.intellij.markdown.html.HtmlGenerator
+import org.jetbrains.dokka.InternalDokkaApi
+import org.jetbrains.dokka.analysis.markdown.jb.factories.DocTagsFromIElementFactory
+import org.jetbrains.dokka.links.DRI
+import org.jetbrains.dokka.links.PointingToDeclaration
+import org.jetbrains.dokka.model.doc.*
+import java.net.MalformedURLException
+import java.net.URL
+import org.intellij.markdown.parser.MarkdownParser as IntellijMarkdownParser
+
+@InternalDokkaApi
+open class MarkdownParser(
+    private val externalDri: (String) -> DRI?,
+    private val kdocLocation: String?,
+) : Parser() {
+
+    private lateinit var destinationLinksMap: Map<String, String>
+    private lateinit var text: String
+
+    override fun parseStringToDocNode(extractedString: String): DocTag {
+        val gfmFlavourDescriptor = GFMFlavourDescriptor()
+        val markdownAstRoot = IntellijMarkdownParser(gfmFlavourDescriptor).buildMarkdownTreeFromString(extractedString)
+        destinationLinksMap = getAllDestinationLinks(extractedString, markdownAstRoot).toMap()
+        text = extractedString
+
+        val parsed = visitNode(markdownAstRoot)
+        if (parsed.size == 1) {
+            return parsed.first()
+        }
+        return CustomDocTag(children = parsed, params = emptyMap(), name = "")
+    }
+
+    override fun preparse(text: String) = text.replace("\r\n", "\n").replace("\r", "\n")
+
+    override fun parseTagWithBody(tagName: String, content: String): TagWrapper =
+        when (tagName) {
+            "see" -> {
+                val referencedName = content.substringBefore(' ')
+                val dri = externalDri(referencedName)
+                See(
+                    parseStringToDocNode(content.substringAfter(' ')),
+                    dri?.fqDeclarationName() ?: referencedName,
+                    dri
+                )
+            }
+            "throws", "exception" -> {
+                val dri = externalDri(content.substringBefore(' '))
+                Throws(
+                    parseStringToDocNode(content.substringAfter(' ')),
+                    dri?.fqDeclarationName() ?: content.substringBefore(' '),
+                    dri
+                )
+            }
+            else -> super.parseTagWithBody(tagName, content)
+        }
+
+    private fun headersHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            visitNode(node.children.find { it.type == MarkdownTokenTypes.ATX_CONTENT }
+                ?: throw detailedException("Wrong AST Tree. Header does not contain expected content", node)
+            ).flatMap { it.children }
+        )
+
+    private fun horizontalRulesHandler() =
+        DocTagsFromIElementFactory.getInstance(MarkdownTokenTypes.HORIZONTAL_RULE)
+
+    private fun emphasisHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            children = node.children.evaluateChildrenWithDroppedEnclosingTokens(1)
+        )
+
+    private fun strongHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            children = node.children.evaluateChildrenWithDroppedEnclosingTokens(2)
+        )
+
+    private fun List<ASTNode>.evaluateChildrenWithDroppedEnclosingTokens(count: Int) =
+        drop(count).dropLast(count).evaluateChildren()
+
+    private fun blockquotesHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type, children = node.children
+                .filterIsInstance<CompositeASTNode>()
+                .evaluateChildren()
+        )
+
+    private fun listsHandler(node: ASTNode): List<DocTag> {
+
+        val children = node.children.filterIsInstance<ListItemCompositeNode>().flatMap {
+            if (it.children.last().type in listOf(
+                    MarkdownElementTypes.ORDERED_LIST,
+                    MarkdownElementTypes.UNORDERED_LIST
+                )
+            ) {
+                val nestedList = it.children.last()
+                (it.children as MutableList).removeAt(it.children.lastIndex)
+                listOf(it, nestedList)
+            } else
+                listOf(it)
+        }
+
+        return DocTagsFromIElementFactory.getInstance(
+            node.type,
+            children =
+            children
+                .flatMap {
+                    if (it.type == MarkdownElementTypes.LIST_ITEM)
+                        DocTagsFromIElementFactory.getInstance(
+                            it.type,
+                            children = it
+                                .children
+                                .filterIsInstance<CompositeASTNode>()
+                                .evaluateChildren()
+                        )
+                    else
+                        visitNode(it)
+                },
+            params =
+            if (node.type == MarkdownElementTypes.ORDERED_LIST) {
+                val listNumberNode = node.children.first().children.first()
+                mapOf(
+                    "start" to text.substring(
+                        listNumberNode.startOffset,
+                        listNumberNode.endOffset
+                    ).trim().dropLast(1)
+                )
+            } else
+                emptyMap()
+        )
+    }
+
+    private fun resolveDRI(mdLink: String): DRI? =
+        mdLink
+            .removePrefix("[")
+            .removeSuffix("]")
+            .let { link ->
+                try {
+                    URL(link)
+                    null
+                } catch (e: MalformedURLException) {
+                    externalDri(link)
+                }
+            }
+
+    private fun getAllDestinationLinks(text: String, node: ASTNode): List<Pair<String, String>> =
+        node.children
+            .filter { it.type == MarkdownElementTypes.LINK_DEFINITION }
+            .map {
+                text.substring(it.children[0].startOffset, it.children[0].endOffset).toLowerCase() to
+                        text.substring(it.children[2].startOffset, it.children[2].endOffset)
+            } +
+                node.children.filterIsInstance<CompositeASTNode>().flatMap { getAllDestinationLinks(text, it) }
+
+
+    private fun referenceLinksHandler(node: ASTNode): List<DocTag> {
+        val linkLabel = node.children.find { it.type == MarkdownElementTypes.LINK_LABEL }
+            ?: throw detailedException("Wrong AST Tree. Reference link does not contain link label", node)
+        val linkText = node.children.findLast { it.type == MarkdownElementTypes.LINK_TEXT } ?: linkLabel
+
+        val linkKey = text.substring(linkLabel.startOffset, linkLabel.endOffset)
+
+        val link = destinationLinksMap[linkKey.toLowerCase()] ?: linkKey
+
+        return linksHandler(linkText, link)
+    }
+
+    private fun inlineLinksHandler(node: ASTNode): List<DocTag> {
+        val linkText = node.children.find { it.type == MarkdownElementTypes.LINK_TEXT }
+            ?: throw detailedException("Wrong AST Tree. Inline link does not contain link text", node)
+        val linkDestination = node.children.find { it.type == MarkdownElementTypes.LINK_DESTINATION }
+        val linkTitle = node.children.find { it.type == MarkdownElementTypes.LINK_TITLE }
+
+        // Link destination may be ommited: https://github.github.com/gfm/#example-495
+        val link = linkDestination?.let { text.substring(it.startOffset, it.endOffset) }
+
+        return linksHandler(linkText, link, linkTitle)
+    }
+
+    private fun markdownFileHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            children = node.children
+                .filterSpacesAndEOL()
+                .evaluateChildren()
+        )
+
+    private fun autoLinksHandler(node: ASTNode): List<DocTag> {
+        val link = text.substring(node.startOffset + 1, node.endOffset - 1)
+
+        return linksHandler(node, link)
+    }
+
+    private fun linksHandler(linkText: ASTNode, link: String?, linkTitle: ASTNode? = null): List<DocTag> {
+        val dri: DRI? = link?.let { resolveDRI(it) }
+        val linkOrEmpty = link ?: ""
+        val linkTextString =
+            if (linkTitle == null) linkOrEmpty else text.substring(linkTitle.startOffset + 1, linkTitle.endOffset - 1)
+
+        val params = if (linkTitle == null)
+            mapOf("href" to linkOrEmpty)
+        else
+            mapOf("href" to linkOrEmpty, "title" to linkTextString)
+
+        return if (link != null && dri == null && !linkOrEmpty.isRemoteLink()) {
+            DocTagsFromIElementFactory.getInstance(
+                MarkdownTokenTypes.TEXT,
+                params = params,
+                children = linkText.children.drop(1).dropLast(1).evaluateChildren(),
+                body = linkTextString.removeSurrounding("[", "]")
+            )
+        } else {
+            DocTagsFromIElementFactory.getInstance(
+                MarkdownElementTypes.INLINE_LINK,
+                params = params,
+                children = linkText.children.drop(1).dropLast(1).evaluateChildren(),
+                dri = dri
+            )
+        }
+    }
+
+    private fun codeLineHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
+        MarkdownElementTypes.CODE_BLOCK,
+        body = text.substring(node.startOffset, node.endOffset)
+    )
+
+    private fun textHandler(node: ASTNode, keepAllFormatting: Boolean) = DocTagsFromIElementFactory.getInstance(
+        MarkdownTokenTypes.TEXT,
+        body = text.substring(node.startOffset, node.endOffset).transform(),
+        keepFormatting = keepAllFormatting
+    )
+
+    private fun strikeThroughHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
+        node.type,
+        children = node.children.evaluateChildrenWithDroppedEnclosingTokens(2)
+    )
+
+    private fun tableHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
+        GFMElementTypes.TABLE,
+        children = node.children
+            .filter { it.type == GFMElementTypes.ROW || it.type == GFMElementTypes.HEADER }
+            .evaluateChildren()
+    )
+
+    private fun headerHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
+        GFMElementTypes.HEADER,
+        children = node.children
+            .filter { it.type == GFMTokenTypes.CELL }
+            .evaluateChildren()
+    )
+
+    private fun rowHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
+        GFMElementTypes.ROW,
+        children = node.children
+            .filter { it.type == GFMTokenTypes.CELL }
+            .evaluateChildren()
+    )
+
+    private fun cellHandler(node: ASTNode) = DocTagsFromIElementFactory.getInstance(
+        GFMTokenTypes.CELL,
+        children = node.children.filterTabSeparators().evaluateChildren().trimSurroundingTokensIfText()
+    )
+
+    private fun String.isRemoteLink() = try {
+        URL(this)
+        true
+    } catch (e: MalformedURLException) {
+        false
+    }
+
+    private fun imagesHandler(node: ASTNode): List<DocTag> =
+        with(node.children.last().children) {
+            val destination = find { it.type == MarkdownElementTypes.LINK_DESTINATION }
+            val description = find { it.type == MarkdownElementTypes.LINK_TEXT }
+
+            val src = destination?.let {
+                mapOf("href" to text.substring(it.startOffset, it.endOffset))
+            } ?: emptyMap()
+
+            val alt = description?.let {
+                mapOf("alt" to text.substring(it.startOffset + 1, it.endOffset - 1))
+            } ?: emptyMap()
+
+            return DocTagsFromIElementFactory.getInstance(
+                node.type,
+                params = src + alt
+            )
+        }
+
+
+    private fun rawHtmlHandler(node: ASTNode): List<DocTag> =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            body = text.substring(node.startOffset, node.endOffset)
+        )
+
+    private fun codeSpansHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            children = DocTagsFromIElementFactory.getInstance(
+                MarkdownTokenTypes.TEXT,
+                body = text.substring(node.startOffset + 1, node.endOffset - 1).replace('\n', ' ').trimIndent(),
+                keepFormatting = true
+            )
+        )
+
+    private fun codeFencesHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            node.type,
+            children = node
+                .children
+                .dropWhile { it.type != MarkdownTokenTypes.CODE_FENCE_CONTENT }
+                .dropLastWhile { it.type != MarkdownTokenTypes.CODE_FENCE_CONTENT }
+                .filter { it.type != MarkdownTokenTypes.WHITE_SPACE }
+                .map {
+                    if (it.type == MarkdownTokenTypes.EOL)
+                        LeafASTNode(MarkdownTokenTypes.HARD_LINE_BREAK, 0, 0)
+                    else
+                        it
+                }.evaluateChildren(keepAllFormatting = true),
+            params = node
+                .children
+                .find { it.type == MarkdownTokenTypes.FENCE_LANG }
+                ?.let { mapOf("lang" to text.substring(it.startOffset, it.endOffset)) }
+                ?: emptyMap()
+        )
+
+    private fun codeBlocksHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(node.type, children = node.children.mergeLeafASTNodes().flatMap {
+            DocTagsFromIElementFactory.getInstance(
+                MarkdownTokenTypes.TEXT,
+                body = HtmlGenerator.trimIndents(text.substring(it.startOffset, it.endOffset), 4).toString()
+            )
+        })
+
+    private fun defaultHandler(node: ASTNode) =
+        DocTagsFromIElementFactory.getInstance(
+            MarkdownElementTypes.PARAGRAPH,
+            children = node.children.evaluateChildren()
+        )
+
+    private fun visitNode(node: ASTNode, keepAllFormatting: Boolean = false): List<DocTag> =
+        when (node.type) {
+            MarkdownElementTypes.ATX_1,
+            MarkdownElementTypes.ATX_2,
+            MarkdownElementTypes.ATX_3,
+            MarkdownElementTypes.ATX_4,
+            MarkdownElementTypes.ATX_5,
+            MarkdownElementTypes.ATX_6,
+            -> headersHandler(node)
+            MarkdownTokenTypes.HORIZONTAL_RULE -> horizontalRulesHandler()
+            MarkdownElementTypes.STRONG -> strongHandler(node)
+            MarkdownElementTypes.EMPH -> emphasisHandler(node)
+            MarkdownElementTypes.FULL_REFERENCE_LINK,
+            MarkdownElementTypes.SHORT_REFERENCE_LINK,
+            -> referenceLinksHandler(node)
+            MarkdownElementTypes.INLINE_LINK -> inlineLinksHandler(node)
+            MarkdownElementTypes.AUTOLINK -> autoLinksHandler(node)
+            MarkdownElementTypes.BLOCK_QUOTE -> blockquotesHandler(node)
+            MarkdownElementTypes.UNORDERED_LIST,
+            MarkdownElementTypes.ORDERED_LIST,
+            -> listsHandler(node)
+            MarkdownElementTypes.CODE_BLOCK -> codeBlocksHandler(node)
+            MarkdownElementTypes.CODE_FENCE -> codeFencesHandler(node)
+            MarkdownElementTypes.CODE_SPAN -> codeSpansHandler(node)
+            MarkdownElementTypes.IMAGE -> imagesHandler(node)
+            MarkdownElementTypes.HTML_BLOCK,
+            MarkdownTokenTypes.HTML_TAG,
+            MarkdownTokenTypes.HTML_BLOCK_CONTENT,
+            -> rawHtmlHandler(node)
+            MarkdownTokenTypes.HARD_LINE_BREAK -> DocTagsFromIElementFactory.getInstance(node.type)
+            MarkdownTokenTypes.CODE_FENCE_CONTENT,
+            MarkdownTokenTypes.CODE_LINE,
+            -> codeLineHandler(node)
+            MarkdownTokenTypes.TEXT -> textHandler(node, keepAllFormatting)
+            MarkdownElementTypes.MARKDOWN_FILE -> markdownFileHandler(node)
+            GFMElementTypes.STRIKETHROUGH -> strikeThroughHandler(node)
+            GFMElementTypes.TABLE -> tableHandler(node)
+            GFMElementTypes.HEADER -> headerHandler(node)
+            GFMElementTypes.ROW -> rowHandler(node)
+            GFMTokenTypes.CELL -> cellHandler(node)
+            else -> defaultHandler(node)
+        }
+
+    private fun List<ASTNode>.filterTabSeparators() =
+        this.filterNot { it.type == GFMTokenTypes.TABLE_SEPARATOR }
+
+    private fun List<ASTNode>.filterSpacesAndEOL() =
+        this.filterNot { it.type == MarkdownTokenTypes.WHITE_SPACE || it.type == MarkdownTokenTypes.EOL }
+
+    private fun List<ASTNode>.evaluateChildren(keepAllFormatting: Boolean = false): List<DocTag> =
+        this.removeUselessTokens().swapImagesThatShouldBeLinks(keepAllFormatting).mergeLeafASTNodes().flatMap { visitNode(it, keepAllFormatting) }
+
+    private fun List<ASTNode>.swapImagesThatShouldBeLinks(keepAllFormatting: Boolean): List<ASTNode> =
+        if (keepAllFormatting) {
+            this
+        } else {
+            flatMap { node ->
+                if (node.type == MarkdownElementTypes.IMAGE
+                    && node.children.firstOrNull()?.let { it is LeafASTNode && it.type.name == "!" } == true
+                    && node.children.lastOrNull()?.type == MarkdownElementTypes.SHORT_REFERENCE_LINK
+                ) {
+                    node.children
+                } else {
+                    listOf(node)
+                }
+            }
+        }
+
+    private fun List<ASTNode>.removeUselessTokens(): List<ASTNode> =
+        this.filterIndexed { index, node ->
+            !(node.type == MarkdownElementTypes.LINK_DEFINITION || (
+                    node.type == MarkdownTokenTypes.EOL &&
+                            this.getOrNull(index - 1)?.type == MarkdownTokenTypes.HARD_LINE_BREAK
+                    ))
+        }
+
+    private fun List<DocTag>.trimSurroundingTokensIfText() = mapIndexed { index, elem ->
+        val elemTransformed = if (index == 0 && elem is Text) elem.copy(elem.body.trimStart()) else elem
+        if (index == lastIndex && elemTransformed is Text) elemTransformed.copy(elemTransformed.body.trimEnd()) else elemTransformed
+    }
+
+    private val notLeafNodes = listOf(
+        MarkdownTokenTypes.HORIZONTAL_RULE,
+        MarkdownTokenTypes.HARD_LINE_BREAK,
+        MarkdownTokenTypes.HTML_TAG,
+        MarkdownTokenTypes.HTML_BLOCK_CONTENT
+    )
+
+    private fun ASTNode.isNotLeaf() = this is CompositeASTNode || this.type in notLeafNodes
+
+    private fun List<ASTNode>.isNotLeaf(index: Int): Boolean =
+        if (index in 0..this.lastIndex)
+            this[index].isNotLeaf()
+        else
+            false
+
+    private fun List<ASTNode>.mergeLeafASTNodes(): List<ASTNode> {
+        val children: MutableList<ASTNode> = mutableListOf()
+        var index = 0
+        while (index <= this.lastIndex) {
+            if (this.isNotLeaf(index)) {
+                children += this[index]
+            } else {
+                val startOffset = this[index].startOffset
+                val sIndex = index
+                while (index < this.lastIndex) {
+                    if (this.isNotLeaf(index + 1) || this[index + 1].startOffset != this[index].endOffset) {
+                        children += mergedLeafNode(this, index, startOffset, sIndex)
+                        break
+                    }
+                    index++
+                }
+                if (index == this.lastIndex) {
+                    children += mergedLeafNode(this, index, startOffset, sIndex)
+                }
+            }
+            index++
+        }
+        return children
+    }
+
+    private fun mergedLeafNode(nodes: List<ASTNode>, index: Int, startOffset: Int, sIndex: Int): LeafASTNode {
+        val endOffset = nodes[index].endOffset
+        val type = if (nodes.subList(sIndex, index)
+                .any { it.type == MarkdownTokenTypes.CODE_LINE }
+        ) MarkdownTokenTypes.CODE_LINE else MarkdownTokenTypes.TEXT
+        return LeafASTNode(type, startOffset, endOffset)
+    }
+
+    private fun String.transform() = this
+        .replace(Regex("\n\n+"), "")        // Squashing new lines between paragraphs
+        .replace(Regex("\n"), " ")
+        .replace(Regex(" >+ +"), " ")      // Replacement used in blockquotes, get rid of garbage
+
+    private fun detailedException(baseMessage: String, node: ASTNode) =
+        IllegalStateException(
+            baseMessage + " in ${kdocLocation ?: "unspecified location"}, element starts from offset ${node.startOffset} and ends ${node.endOffset}: ${
+                text.substring(
+                    node.startOffset,
+                    node.endOffset
+                )
+            }"
+        )
+
+
+    companion object {
+        fun DRI.fqDeclarationName(): String? {
+            if (this.target !is PointingToDeclaration) {
+                return null
+            }
+            return listOfNotNull(this.packageName, this.classNames, this.callable?.name)
+                .joinToString(separator = ".")
+                .takeIf { it.isNotBlank() }
+        }
+    }
+}
+
diff --git a/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/ParseUtils.kt b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/ParseUtils.kt
new file mode 100644
index 00000000..7f520591
--- /dev/null
+++ b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/ParseUtils.kt
@@ -0,0 +1,39 @@
+package org.jetbrains.dokka.analysis.markdown.jb
+
+import org.intellij.markdown.lexer.Compat
+import org.intellij.markdown.lexer.Compat.forEachCodePoint
+import org.jetbrains.dokka.InternalDokkaApi
+import org.jetbrains.dokka.model.doc.DocTag
+import org.jetbrains.dokka.model.doc.Text
+import org.jsoup.internal.StringUtil
+import org.jsoup.nodes.Entities
+
+@InternalDokkaApi
+fun String.parseHtmlEncodedWithNormalisedSpaces(
+    renderWhiteCharactersAsSpaces: Boolean
+): List<DocTag> {
+    val accum = StringBuilder()
+    val tags = mutableListOf<DocTag>()
+    var lastWasWhite = false
+
+    forEachCodePoint { c ->
+        if (renderWhiteCharactersAsSpaces && StringUtil.isWhitespace(c)) {
+            if (!lastWasWhite) {
+                accum.append(' ')
+                lastWasWhite = true
+            }
+        } else if (Compat.codePointToString(c).let { it != Entities.escape(it) }) {
+            accum.toString().takeIf { it.isNotBlank() }?.let { tags.add(Text(it)) }
+            accum.delete(0, accum.length)
+
+            accum.appendCodePoint(c)
+            tags.add(Text(accum.toString(), params = DocTag.contentTypeParam("html")))
+            accum.delete(0, accum.length)
+        } else if (!StringUtil.isInvisibleChar(c)) {
+            accum.appendCodePoint(c)
+            lastWasWhite = false
+        }
+    }
+    accum.toString().takeIf { it.isNotBlank() }?.let { tags.add(Text(it)) }
+    return tags
+}
diff --git a/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/Parser.kt b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/Parser.kt
new file mode 100644
index 00000000..09650b32
--- /dev/null
+++ b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/Parser.kt
@@ -0,0 +1,131 @@
+package org.jetbrains.dokka.analysis.markdown.jb
+
+import org.jetbrains.dokka.InternalDokkaApi
+import org.jetbrains.dokka.model.doc.*
+
+@InternalDokkaApi
+abstract class Parser {
+
+    abstract fun parseStringToDocNode(extractedString: String): DocTag
+
+    protected abstract fun preparse(text: String): String
+
+    open fun parse(text: String): DocumentationNode =
+        DocumentationNode(extractTagsToListOfPairs(preparse(text)).map { (tag, content) -> parseTagWithBody(tag, content) })
+
+    protected open fun parseTagWithBody(tagName: String, content: String): TagWrapper =
+        when (tagName) {
+            "description" -> Description(parseStringToDocNode(content))
+            "author" -> Author(parseStringToDocNode(content))
+            "version" -> Version(parseStringToDocNode(content))
+            "since" -> Since(parseStringToDocNode(content))
+            "see" -> See(
+                parseStringToDocNode(content.substringAfter(' ')),
+                content.substringBefore(' '),
+                null
+            )
+            "param" -> Param(
+                parseStringToDocNode(content.substringAfter(' ')),
+                content.substringBefore(' ')
+            )
+            "property" -> Property(
+                parseStringToDocNode(content.substringAfter(' ')),
+                content.substringBefore(' ')
+            )
+            "return" -> Return(parseStringToDocNode(content))
+            "constructor" -> Constructor(parseStringToDocNode(content))
+            "receiver" -> Receiver(parseStringToDocNode(content))
+            "throws", "exception" -> Throws(
+                parseStringToDocNode(content.substringAfter(' ')),
+                content.substringBefore(' '),
+                null
+            )
+            "deprecated" -> Deprecated(parseStringToDocNode(content))
+            "sample" -> Sample(
+                parseStringToDocNode(content.substringAfter(' ')),
+                content.substringBefore(' ')
+            )
+            "suppress" -> Suppress(parseStringToDocNode(content))
+            else -> CustomTagWrapper(parseStringToDocNode(content), tagName)
+        }
+
+    /**
+     * KDoc parser from Kotlin compiler relies on a comment asterisk
+     * So there is a mini parser here
+     * TODO: at least to adapt [org.jetbrains.kotlin.kdoc.lexer.KDocLexer] to analyze KDoc without the asterisks and use it here
+     */
+    private fun extractTagsToListOfPairs(text: String): List<Pair<String, String>> =
+        "description $text"
+            .extractKDocSections()
+            .map { content ->
+                val contentWithEscapedAts = content.replace("\\@", "@")
+                val (tag, body) = contentWithEscapedAts.split(" ", limit = 2)
+                tag to body
+            }
+
+    /**
+     * Ignore a doc tag inside code spans and blocks
+     * @see org.jetbrains.kotlin.kdoc.psi.impl.KDocSection
+     */
+    private fun CharSequence.extractKDocSections(delimiter: String = "\n@"): List<String> {
+        var countOfBackticks = 0
+        var countOfTildes = 0
+        var countOfBackticksInOpeningFence = 0
+        var countOfTildesInOpeningFence = 0
+
+        var isInCode = false
+        val result = mutableListOf<String>()
+        var rangeStart = 0
+        var rangeEnd = 0
+        var currentOffset = 0
+        while (currentOffset < length) {
+
+            when (get(currentOffset)) {
+                '`' -> {
+                    countOfBackticks++
+                    countOfTildes = 0
+                }
+                '~' -> {
+                    countOfTildes++
+                    countOfBackticks = 0
+                }
+                else -> {
+                    if (isInCode) {
+                        // The closing code fence must be at least as long as the opening fence
+                        if(countOfBackticks >= countOfBackticksInOpeningFence
+                                || countOfTildes >= countOfTildesInOpeningFence)
+                            isInCode = false
+                    } else {
+                        // as per CommonMark spec, there can be any number of backticks for a code span, not only one or three
+                        if (countOfBackticks > 0) {
+                            isInCode = true
+                            countOfBackticksInOpeningFence = countOfBackticks
+                            countOfTildesInOpeningFence = Int.MAX_VALUE
+                        }
+                        // tildes are only for a code block, not code span
+                        if (countOfTildes >= 3) {
+                            isInCode = true
+                            countOfTildesInOpeningFence = countOfTildes
+                            countOfBackticksInOpeningFence = Int.MAX_VALUE
+                        }
+                    }
+                    countOfTildes = 0
+                    countOfBackticks = 0
+                }
+            }
+            if (!isInCode && startsWith(delimiter, currentOffset)) {
+                result.add(substring(rangeStart, rangeEnd))
+                currentOffset += delimiter.length
+                rangeStart = currentOffset
+                rangeEnd = currentOffset
+                continue
+            }
+
+            ++rangeEnd
+            ++currentOffset
+        }
+        result.add(substring(rangeStart, rangeEnd))
+        return result
+    }
+
+}
diff --git a/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/factories/DocTagsFromIElementFactory.kt b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/factories/DocTagsFromIElementFactory.kt
new file mode 100644
index 00000000..86de8000
--- /dev/null
+++ b/subprojects/analysis-markdown-jb/src/main/kotlin/org/jetbrains/dokka/analysis/markdown/jb/factories/DocTagsFromIElementFactory.kt
@@ -0,0 +1,86 @@
+package org.jetbrains.dokka.analysis.markdown.jb.factories
+
+import org.intellij.markdown.IElementType
+import org.intellij.markdown.MarkdownElementTypes
+import org.intellij.markdown.MarkdownTokenTypes
+import org.intellij.markdown.flavours.gfm.GFMElementTypes
+import org.intellij.markdown.flavours.gfm.GFMTokenTypes
+import org.jetbrains.dokka.analysis.markdown.jb.MARKDOWN_ELEMENT_FILE_NAME
+import org.jetbrains.dokka.analysis.markdown.jb.parseHtmlEncodedWithNormalisedSpaces
+import org.jetbrains.dokka.links.DRI
+import org.jetbrains.dokka.model.doc.*
+import org.jetbrains.dokka.model.doc.DocTag.Companion.contentTypeParam
+import org.jsoup.Jsoup
+
+internal object DocTagsFromIElementFactory {
+
+    @Suppress("IMPLICIT_CAST_TO_ANY")
+    fun getInstance(type: IElementType, children: List<DocTag> = emptyList(), params: Map<String, String> = emptyMap(), body: String? = null, dri: DRI? = null, keepFormatting: Boolean = false) =
+        when(type) {
+            MarkdownElementTypes.SHORT_REFERENCE_LINK,
+            MarkdownElementTypes.FULL_REFERENCE_LINK,
+            MarkdownElementTypes.INLINE_LINK            -> if(dri == null) A(children, params) else DocumentationLink(dri, children, params)
+            MarkdownElementTypes.STRONG                 -> B(children, params)
+            MarkdownElementTypes.BLOCK_QUOTE            -> BlockQuote(children, params)
+            MarkdownElementTypes.CODE_SPAN              -> CodeInline(children, params)
+            MarkdownElementTypes.CODE_BLOCK,
+            MarkdownElementTypes.CODE_FENCE             -> CodeBlock(children, params)
+            MarkdownElementTypes.ATX_1                  -> H1(children, params)
+            MarkdownElementTypes.ATX_2                  -> H2(children, params)
+            MarkdownElementTypes.ATX_3                  -> H3(children, params)
+            MarkdownElementTypes.ATX_4                  -> H4(children, params)
+            MarkdownElementTypes.ATX_5                  -> H5(children, params)
+            MarkdownElementTypes.ATX_6                  -> H6(children, params)
+            MarkdownElementTypes.EMPH                   -> I(children, params)
+            MarkdownElementTypes.IMAGE                  -> Img(children, params)
+            MarkdownElementTypes.LIST_ITEM              -> Li(children, params)
+            MarkdownElementTypes.ORDERED_LIST           -> Ol(children, params)
+            MarkdownElementTypes.UNORDERED_LIST         -> Ul(children, params)
+            MarkdownElementTypes.PARAGRAPH              -> P(children, params)
+            MarkdownTokenTypes.TEXT -> if (keepFormatting) Text(
+                body.orEmpty(),
+                children,
+                params
+            ) else {
+                // corner case: there are only spaces between two Markdown nodes
+                val containsOnlySpaces = body?.isNotEmpty() == true && body.all { it.isWhitespace() }
+                if (containsOnlySpaces) Text(" ", children, params)
+                else body?.parseWithNormalisedSpaces(renderWhiteCharactersAsSpaces = false).orEmpty()
+            }
+            MarkdownTokenTypes.HORIZONTAL_RULE          -> HorizontalRule
+            MarkdownTokenTypes.HARD_LINE_BREAK          -> Br
+            GFMElementTypes.STRIKETHROUGH               -> Strikethrough(children, params)
+            GFMElementTypes.TABLE                       -> Table(children, params)
+            GFMElementTypes.HEADER                      -> Th(children, params)
+            GFMElementTypes.ROW                         -> Tr(children, params)
+            GFMTokenTypes.CELL                          -> Td(children, params)
+            MarkdownElementTypes.MARKDOWN_FILE          -> CustomDocTag(children, params, MARKDOWN_ELEMENT_FILE_NAME)
+            MarkdownElementTypes.HTML_BLOCK,
+            MarkdownTokenTypes.HTML_TAG,
+            MarkdownTokenTypes.HTML_BLOCK_CONTENT       -> Text(body.orEmpty(), params = params + contentTypeParam("html"))
+            else                                        -> CustomDocTag(children, params, type.name)
+        }.let {
+            @Suppress("UNCHECKED_CAST")
+            when (it) {
+                is List<*> -> it as List<DocTag>
+                else -> listOf(it as DocTag)
+            }
+        }
+
+    /**
+     * Parses string into [Text] doc tags that can have either value of the string or html-encoded value with content-type=html parameter.
+     * Content type is added when dealing with html entries like `&nbsp;`
+     */
+    private fun String.parseWithNormalisedSpaces(
+        renderWhiteCharactersAsSpaces: Boolean
+    ): List<DocTag> {
+        if (!requiresHtmlEncoding()) {
+            return parseHtmlEncodedWithNormalisedSpaces(renderWhiteCharactersAsSpaces)
+        }
+        // parsing it using jsoup is required to get codePoints, otherwise they are interpreted separately, as chars
+        // But we dont need to do it for java as it is already parsed with jsoup
+        return Jsoup.parseBodyFragment(this).body().wholeText().parseHtmlEncodedWithNormalisedSpaces(renderWhiteCharactersAsSpaces)
+    }
+
+    private fun String.requiresHtmlEncoding(): Boolean = indexOf('&') != -1
+}
author	Ignat Beresnev <ignat.beresnev@jetbrains.com>	2023-07-05 10:04:55 +0200
committer	GitHub <noreply@github.com>	2023-07-05 10:04:55 +0200
commit	9559158bfeeb274e9ccf1b4563f1b23b42afc493 (patch)
tree	3ece0887623cfe2b7148af23001867a1dd5e6597 /subprojects/analysis-markdown-jb/src
parent	cbd9733d3dd2f52992e98e7cebd072091a572529 (diff)
download	dokka-9559158bfeeb274e9ccf1b4563f1b23b42afc493.tar.gz dokka-9559158bfeeb274e9ccf1b4563f1b23b42afc493.tar.bz2 dokka-9559158bfeeb274e9ccf1b4563f1b23b42afc493.zip