diff options
-rw-r--r-- | dokka.iml | 1 | ||||
-rw-r--r-- | src/Formats/StructuredFormatService.kt | 6 | ||||
-rw-r--r-- | src/Formats/TextFormatService.kt | 4 | ||||
-rw-r--r-- | src/Markdown/GeneratedParserUtilBase.java | 1031 | ||||
-rw-r--r-- | src/Markdown/MarkdownLexer.java | 10 | ||||
-rw-r--r-- | src/Markdown/MarkdownProcessor.kt | 56 | ||||
-rw-r--r-- | src/Markdown/MarkdownTokenType.kt | 6 | ||||
-rw-r--r-- | src/Markdown/_MarkdownLexer.flex | 40 | ||||
-rw-r--r-- | src/Markdown/markdown.bnf | 83 | ||||
-rw-r--r-- | src/Markdown/markdown.leg | 781 | ||||
-rw-r--r-- | src/Model/DocumentationContent.kt | 74 | ||||
-rw-r--r-- | src/Model/DocumentationNodeBuilder.kt | 2 | ||||
-rw-r--r-- | src/Processing/CrossReferences.kt | 12 | ||||
-rw-r--r-- | src/RichContent/RichString.kt | 3 | ||||
-rw-r--r-- | src/main.kt | 15 | ||||
-rw-r--r-- | styles/style.css | 256 | ||||
-rw-r--r-- | test/data/markdown/spec.txt | 6150 | ||||
-rw-r--r-- | test/src/markdown/MarkdownTestRunner.kt | 130 | ||||
-rw-r--r-- | test/src/markdown/ParserTest.kt | 52 | ||||
-rw-r--r-- | test/src/markdown/Specification.kt | 10 | ||||
-rw-r--r-- | test/src/model/CommentTest.kt | 30 |
21 files changed, 8697 insertions, 55 deletions
@@ -5,6 +5,7 @@ <content url="file://$MODULE_DIR$"> <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" packagePrefix="org.jetbrains.dokka" /> <sourceFolder url="file://$MODULE_DIR$/test/src" isTestSource="true" packagePrefix="org.jetbrains.dokka.tests" /> + <sourceFolder url="file://$MODULE_DIR$/gen" isTestSource="false" generated="true" /> </content> <orderEntry type="inheritedJdk" /> <orderEntry type="sourceFolder" forTests="false" /> diff --git a/src/Formats/StructuredFormatService.kt b/src/Formats/StructuredFormatService.kt index 339ccf73..0c58f553 100644 --- a/src/Formats/StructuredFormatService.kt +++ b/src/Formats/StructuredFormatService.kt @@ -63,8 +63,10 @@ public abstract class StructuredFormatService(val locationService: LocationServi } appendLine(to, formatText(node.doc.description)) appendLine(to) - for (section in node.doc.sections) { - appendLine(to, formatBold(formatText(section.label))) + for ((label, section) in node.doc.sections) { + if (label.startsWith("$")) + continue + appendLine(to, formatBold(formatText(label))) appendLine(to, formatText(section.text)) appendLine(to) } diff --git a/src/Formats/TextFormatService.kt b/src/Formats/TextFormatService.kt index 29f01a74..77a0bb65 100644 --- a/src/Formats/TextFormatService.kt +++ b/src/Formats/TextFormatService.kt @@ -12,8 +12,8 @@ public class TextFormatService(val signatureGenerator: LanguageService) : Format for (n in 0..node.doc.summary.length()) append("=") - for (section in node.doc.sections) { - appendln(section.label) + for ((label,section) in node.doc.sections) { + appendln(label) appendln(section.text) } } diff --git a/src/Markdown/GeneratedParserUtilBase.java b/src/Markdown/GeneratedParserUtilBase.java new file mode 100644 index 00000000..9dd999b5 --- /dev/null +++ b/src/Markdown/GeneratedParserUtilBase.java @@ -0,0 +1,1031 @@ +/* + * Copyright 2011-2014 Gregory Shrago + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.jetbrains.dokka.Markdown; + +import com.intellij.lang.*; +import com.intellij.lang.impl.PsiBuilderAdapter; +import com.intellij.lang.impl.PsiBuilderImpl; +import com.intellij.lexer.Lexer; +import com.intellij.openapi.diagnostic.Logger; +import com.intellij.openapi.util.Comparing; +import com.intellij.openapi.util.Key; +import com.intellij.openapi.util.Pair; +import com.intellij.openapi.util.text.StringHash; +import com.intellij.openapi.util.text.StringUtil; +import com.intellij.psi.PsiFile; +import com.intellij.psi.PsiReference; +import com.intellij.psi.TokenType; +import com.intellij.psi.impl.source.resolve.FileContextUtil; +import com.intellij.psi.impl.source.tree.CompositePsiElement; +import com.intellij.psi.tree.ICompositeElementType; +import com.intellij.psi.tree.IElementType; +import com.intellij.psi.tree.TokenSet; +import com.intellij.util.Function; +import com.intellij.util.PairProcessor; +import com.intellij.util.containers.ContainerUtil; +import com.intellij.util.containers.LimitedPool; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedList; + +/** + * @author gregsh + */ +public class GeneratedParserUtilBase { + + private static final Logger LOG = Logger.getInstance("org.intellij.grammar.parser.GeneratedParserUtilBase"); + + private static final int MAX_RECURSION_LEVEL = 1000; + private static final int MAX_VARIANTS_SIZE = 10000; + 
private static final int MAX_VARIANTS_TO_DISPLAY = 50; + + private static final int INITIAL_VARIANTS_SIZE = 1000; + private static final int VARIANTS_POOL_SIZE = 10000; + private static final int FRAMES_POOL_SIZE = 500; + + public static final IElementType DUMMY_BLOCK = new DummyBlockElementType(); + + public interface Parser { + boolean parse(PsiBuilder builder, int level); + } + + public static final Parser TOKEN_ADVANCER = new Parser() { + @Override + public boolean parse(PsiBuilder builder, int level) { + if (builder.eof()) return false; + builder.advanceLexer(); + return true; + } + }; + + public static final Parser TRUE_CONDITION = new Parser() { + @Override + public boolean parse(PsiBuilder builder, int level) { + return true; + } + }; + + public static boolean eof(PsiBuilder builder_, int level_) { + return builder_.eof(); + } + + public static int current_position_(PsiBuilder builder_) { + return builder_.rawTokenIndex(); + } + + public static boolean recursion_guard_(PsiBuilder builder_, int level_, String funcName_) { + if (level_ > MAX_RECURSION_LEVEL) { + builder_.error("Maximum recursion level (" + MAX_RECURSION_LEVEL + ") reached in '" + funcName_ + "'"); + return false; + } + return true; + } + + public static boolean empty_element_parsed_guard_(PsiBuilder builder_, String funcName_, int prev_position_) { + if (prev_position_ == current_position_(builder_)) { + builder_.error("Empty element parsed in '" + funcName_ + "' at offset " + builder_.getCurrentOffset()); + return false; + } + return true; + } + + public static boolean invalid_left_marker_guard_(PsiBuilder builder_, PsiBuilder.Marker marker_, String funcName_) { + //builder_.error("Invalid left marker encountered in " + funcName_ +" at offset " + builder_.getCurrentOffset()); + boolean goodMarker = marker_ != null; // && ((LighterASTNode)marker_).getTokenType() != TokenType.ERROR_ELEMENT; + if (!goodMarker) return false; + ErrorState state = ErrorState.get(builder_); + + return 
!state.frameStack.isEmpty(); + } + + public static TokenSet create_token_set_(IElementType... tokenTypes_) { + return TokenSet.create(tokenTypes_); + } + + private static boolean consumeTokens(PsiBuilder builder_, boolean smart, int pin, IElementType... tokens) { + ErrorState state = ErrorState.get(builder_); + if (state.completionState != null && state.predicateCount == 0) { + addCompletionVariant(builder_, state.completionState, tokens); + } + // suppress single token completion + CompletionState completionState = state.completionState; + state.completionState = null; + boolean result_ = true; + boolean pinned_ = false; + for (int i = 0, tokensLength = tokens.length; i < tokensLength; i++) { + if (pin > 0 && i == pin) pinned_ = result_; + if (result_ || pinned_) { + boolean fast = smart && i == 0; + if (!(fast ? consumeTokenFast(builder_, tokens[i]) : consumeToken(builder_, tokens[i]))) { + result_ = false; + if (pin < 0 || pinned_) report_error_(builder_, state, false); + } + } + } + state.completionState = completionState; + return pinned_ || result_; + } + + public static boolean consumeTokens(PsiBuilder builder_, int pin_, IElementType... token) { + return consumeTokens(builder_, false, pin_, token); + } + + public static boolean consumeTokensSmart(PsiBuilder builder_, int pin_, IElementType... token) { + return consumeTokens(builder_, true, pin_, token); + } + + public static boolean parseTokens(PsiBuilder builder_, int pin_, IElementType... tokens) { + return parseTokens(builder_, false, pin_, tokens); + } + + public static boolean parseTokensSmart(PsiBuilder builder_, int pin_, IElementType... tokens) { + return parseTokens(builder_, true, pin_, tokens); + } + + public static boolean parseTokens(PsiBuilder builder_, boolean smart, int pin_, IElementType... 
tokens) { + PsiBuilder.Marker marker_ = builder_.mark(); + boolean result_ = consumeTokens(builder_, smart, pin_, tokens); + if (!result_) { + marker_.rollbackTo(); + } + else { + marker_.drop(); + } + return result_; + } + + public static boolean consumeTokenSmart(PsiBuilder builder_, IElementType token) { + addCompletionVariantSmart(builder_, token); + return consumeTokenFast(builder_, token); + } + + public static boolean consumeTokenSmart(PsiBuilder builder_, String token) { + addCompletionVariantSmart(builder_, token); + return consumeTokenFast(builder_, token); + } + + public static boolean consumeToken(PsiBuilder builder_, IElementType token) { + addVariantSmart(builder_, token, true); + if (nextTokenIsFast(builder_, token)) { + builder_.advanceLexer(); + return true; + } + return false; + } + + public static boolean consumeTokenFast(PsiBuilder builder_, IElementType token) { + if (nextTokenIsFast(builder_, token)) { + builder_.advanceLexer(); + return true; + } + return false; + } + + public static boolean consumeToken(PsiBuilder builder_, String text) { + return consumeToken(builder_, text, ErrorState.get(builder_).caseSensitive); + } + + public static boolean consumeToken(PsiBuilder builder_, String text, boolean caseSensitive) { + addVariantSmart(builder_, text, true); + int count = nextTokenIsFast(builder_, text, caseSensitive); + if (count > 0) { + while (count-- > 0) builder_.advanceLexer(); + return true; + } + return false; + } + + public static boolean consumeTokenFast(PsiBuilder builder_, String text) { + int count = nextTokenIsFast(builder_, text, ErrorState.get(builder_).caseSensitive); + if (count > 0) { + while (count-- > 0) builder_.advanceLexer(); + return true; + } + return false; + } + + public static boolean nextTokenIsFast(PsiBuilder builder_, IElementType token) { + return builder_.getTokenType() == token; + } + + public static boolean nextTokenIsFast(PsiBuilder builder_, IElementType... 
tokens) { + IElementType tokenType = builder_.getTokenType(); + for (IElementType token : tokens) { + if (token == tokenType) return true; + } + return false; + } + + public static boolean nextTokenIs(PsiBuilder builder_, String frameName, IElementType... tokens) { + ErrorState state = ErrorState.get(builder_); + if (state.completionState != null) return true; + boolean track = !state.suppressErrors && state.predicateCount < 2 && state.predicateSign; + if (!track) return nextTokenIsFast(builder_, tokens); + IElementType tokenType = builder_.getTokenType(); + if (StringUtil.isNotEmpty(frameName)) { + addVariantInner(state, builder_.rawTokenIndex(), frameName); + } + else { + for (IElementType token : tokens) { + addVariant(builder_, state, token); + } + } + if (tokenType == null) return false; + for (IElementType token : tokens) { + if (tokenType == token) return true; + } + return false; + } + + public static boolean nextTokenIs(PsiBuilder builder_, IElementType token) { + if (!addVariantSmart(builder_, token, false)) return true; + return nextTokenIsFast(builder_, token); + } + + public static boolean nextTokenIs(PsiBuilder builder_, String tokenText) { + if (!addVariantSmart(builder_, tokenText, false)) return true; + return nextTokenIsFast(builder_, tokenText, ErrorState.get(builder_).caseSensitive) > 0; + } + + public static boolean nextTokenIsFast(PsiBuilder builder_, String tokenText) { + return nextTokenIsFast(builder_, tokenText, ErrorState.get(builder_).caseSensitive) > 0; + } + + public static int nextTokenIsFast(PsiBuilder builder_, String tokenText, boolean caseSensitive) { + CharSequence sequence = builder_.getOriginalText(); + int offset = builder_.getCurrentOffset(); + int endOffset = offset + tokenText.length(); + CharSequence subSequence = sequence.subSequence(offset, Math.min(endOffset, sequence.length())); + + if (!Comparing.equal(subSequence, tokenText, caseSensitive)) return 0; + + int count = 0; + while (true) { + int nextOffset = 
builder_.rawTokenTypeStart(++count); + if (nextOffset > endOffset) { + return -count; + } + else if (nextOffset == endOffset) { + break; + } + } + return count; + } + + private static void addCompletionVariantSmart(PsiBuilder builder_, Object token) { + ErrorState state = ErrorState.get(builder_); + CompletionState completionState = state.completionState; + if (completionState != null && state.predicateCount == 0) { + addCompletionVariant(builder_, completionState, token); + } + } + + private static boolean addVariantSmart(PsiBuilder builder_, Object token, boolean force) { + ErrorState state = ErrorState.get(builder_); + // skip FIRST check in completion mode + if (state.completionState != null && !force) return false; + builder_.eof(); + if (!state.suppressErrors && state.predicateCount < 2) { + addVariant(builder_, state, token); + } + return true; + } + + public static void addVariant(PsiBuilder builder_, String text) { + addVariant(builder_, ErrorState.get(builder_), text); + } + + private static void addVariant(PsiBuilder builder_, ErrorState state, Object o) { + builder_.eof(); // skip whitespaces + addVariantInner(state, builder_.rawTokenIndex(), o); + + CompletionState completionState = state.completionState; + if (completionState != null && state.predicateSign) { + addCompletionVariant(builder_, completionState, o); + } + } + + private static void addVariantInner(ErrorState state, int pos, Object o) { + Variant variant = state.VARIANTS.alloc().init(pos, o); + if (state.predicateSign) { + state.variants.add(variant); + if (state.lastExpectedVariantPos < variant.position) { + state.lastExpectedVariantPos = variant.position; + } + } + else { + state.unexpected.add(variant); + } + } + + private static void addCompletionVariant(@NotNull PsiBuilder builder_, @NotNull CompletionState completionState, Object o) { + int offset = builder_.getCurrentOffset(); + if (!builder_.eof() && offset == builder_.rawTokenTypeStart(1)) return; // suppress for zero-length tokens 
+ + boolean add = false; + int diff = completionState.offset - offset; + String text = completionState.convertItem(o); + int length = text == null? 0 : text.length(); + if (length == 0) return; + if (diff == 0) { + add = true; + } + else if (diff > 0 && diff <= length) { + CharSequence fragment = builder_.getOriginalText().subSequence(offset, completionState.offset); + add = completionState.prefixMatches(fragment.toString(), text); + } + else if (diff < 0) { + for (int i=-1; ; i--) { + IElementType type = builder_.rawLookup(i); + int tokenStart = builder_.rawTokenTypeStart(i); + if (isWhitespaceOrComment(builder_, type)) { + diff = completionState.offset - tokenStart; + } + else if (type != null && tokenStart < completionState.offset) { + CharSequence fragment = builder_.getOriginalText().subSequence(tokenStart, completionState.offset); + if (completionState.prefixMatches(fragment.toString(), text)) { + diff = completionState.offset - tokenStart; + } + break; + } + else break; + } + add = diff >= 0 && diff < length; + } + add = add && length > 1 && !(text.charAt(0) == '<' && text.charAt(length - 1) == '>') && + !(text.charAt(0) == '\'' && text.charAt(length - 1) == '\'' && length < 5); + if (add) { + completionState.addItem(builder_, text); + } + } + + public static boolean isWhitespaceOrComment(@NotNull PsiBuilder builder_, @Nullable IElementType type) { + return ((PsiBuilderImpl)((Builder)builder_).getDelegate()).whitespaceOrComment(type); + } + + // here's the new section API for compact parsers & less IntelliJ platform API exposure + public static final int _NONE_ = 0x0; + public static final int _COLLAPSE_ = 0x1; + public static final int _LEFT_ = 0x2; + public static final int _LEFT_INNER_ = 0x4; + public static final int _AND_ = 0x8; + public static final int _NOT_ = 0x10; + + // simple enter/exit methods pair that doesn't require frame object + public static PsiBuilder.Marker enter_section_(PsiBuilder builder_) { + return builder_.mark(); + } + + public 
static void exit_section_(PsiBuilder builder_, + PsiBuilder.Marker marker, + @Nullable IElementType elementType, + boolean result) { + close_marker_impl_(ErrorState.get(builder_).frameStack.peekLast(), marker, elementType, result); + } + + // complex enter/exit methods pair with frame object + public static PsiBuilder.Marker enter_section_(PsiBuilder builder_, int level, int modifiers, @Nullable String frameName) { + PsiBuilder.Marker marker = builder_.mark(); + enter_section_impl_(builder_, level, modifiers, frameName); + return marker; + } + + private static void enter_section_impl_(PsiBuilder builder_, int level, int modifiers, @Nullable String frameName) { + ErrorState state = ErrorState.get(builder_); + Frame frame = state.FRAMES.alloc().init(builder_, state, level, modifiers, frameName); + Frame prevFrame = state.frameStack.peekLast(); + if (prevFrame != null && prevFrame.errorReportedAt > frame.position) { + // report error for previous unsuccessful frame + reportError(builder_, state, frame, true, false); + } + if (((frame.modifiers & _LEFT_) | (frame.modifiers & _LEFT_INNER_)) != 0) { + PsiBuilder.Marker left = (PsiBuilder.Marker)builder_.getLatestDoneMarker(); + if (invalid_left_marker_guard_(builder_, left, frameName)) { + frame.leftMarker = left; + } + } + state.frameStack.add(frame); + if ((modifiers & _AND_) != 0) { + if (state.predicateCount == 0 && !state.predicateSign) { + throw new AssertionError("Incorrect false predicate sign"); + } + state.predicateCount++; + } + else if ((modifiers & _NOT_) != 0) { + if (state.predicateCount == 0) { + state.predicateSign = false; + } + else { + state.predicateSign = !state.predicateSign; + } + state.predicateCount++; + } + } + + public static void exit_section_(PsiBuilder builder_, + int level, + PsiBuilder.Marker marker, + @Nullable IElementType elementType, + boolean result, + boolean pinned, + @Nullable Parser eatMore) { + ErrorState state = ErrorState.get(builder_); + + Frame frame = 
state.frameStack.pollLast(); + if (frame == null || level != frame.level) { + LOG.error("Unbalanced error section: got " + frame + ", expected level " + level); + if (frame != null) state.FRAMES.recycle(frame); + close_marker_impl_(frame, marker, elementType, result); + return; + } + + if (((frame.modifiers & _AND_) | (frame.modifiers & _NOT_)) != 0) { + close_marker_impl_(frame, marker, null, false); + state.predicateCount--; + if ((frame.modifiers & _NOT_) != 0) state.predicateSign = !state.predicateSign; + state.FRAMES.recycle(frame); + return; + } + exit_section_impl_(state, frame, builder_, marker, elementType, result, pinned); + + int initialPos = builder_.rawTokenIndex(); + boolean willFail = !result && !pinned; + if (willFail && initialPos == frame.position && state.lastExpectedVariantPos == frame.position && + frame.name != null && state.variants.size() - frame.variantCount > 1) { + state.clearVariants(true, frame.variantCount); + addVariantInner(state, initialPos, frame.name); + } + int lastErrorPos = getLastVariantPos(state, initialPos); + if (!state.suppressErrors && eatMore != null) { + state.suppressErrors = true; + final boolean eatMoreFlagOnce = !builder_.eof() && eatMore.parse(builder_, frame.level + 1); + boolean eatMoreFlag = eatMoreFlagOnce || !result && frame.position == initialPos && lastErrorPos > frame.position; + + PsiBuilderImpl.ProductionMarker latestDoneMarker = + (pinned || result) && (state.altMode || elementType != null) && + eatMoreFlagOnce ? 
(PsiBuilderImpl.ProductionMarker)builder_.getLatestDoneMarker() : null; + PsiBuilder.Marker extensionMarker = null; + IElementType extensionTokenType = null; + // whitespace prefix makes the very first frame offset bigger than marker start offset which is always 0 + if (latestDoneMarker instanceof PsiBuilder.Marker && + frame.position >= latestDoneMarker.getStartIndex() && + frame.position <= latestDoneMarker.getEndIndex()) { + extensionMarker = ((PsiBuilder.Marker)latestDoneMarker).precede(); + extensionTokenType = latestDoneMarker.getTokenType(); + ((PsiBuilder.Marker)latestDoneMarker).drop(); + } + // advance to the last error pos + // skip tokens until lastErrorPos. parseAsTree might look better here... + int parenCount = 0; + while ((eatMoreFlag || parenCount > 0) && builder_.rawTokenIndex() < lastErrorPos) { + builder_.advanceLexer(); + eatMoreFlag = eatMore.parse(builder_, frame.level + 1); + } + boolean errorReported = frame.errorReportedAt == initialPos || !result && frame.errorReportedAt >= frame.position; + if (errorReported) { + if (eatMoreFlag) { + builder_.advanceLexer(); + parseAsTree(state, builder_, frame.level + 1, DUMMY_BLOCK, true, TOKEN_ADVANCER, eatMore); + } + } + else if (eatMoreFlag) { + errorReported = reportError(builder_, state, frame, true, true); + parseAsTree(state, builder_, frame.level + 1, DUMMY_BLOCK, true, TOKEN_ADVANCER, eatMore); + } + else if (eatMoreFlagOnce || (!result && frame.position != builder_.rawTokenIndex()) || frame.errorReportedAt > initialPos) { + errorReported = reportError(builder_, state, frame, true, false); + } + if (extensionMarker != null) { + extensionMarker.done(extensionTokenType); + } + state.suppressErrors = false; + if (errorReported || result) { + state.clearVariants(true, 0); + state.clearVariants(false, 0); + state.lastExpectedVariantPos = -1; + } + } + else if (!result && pinned && frame.errorReportedAt < 0) { + // do not report if there are errors beyond current position + if (lastErrorPos == 
initialPos) { + // do not force, inner recoverRoot might have skipped some tokens + reportError(builder_, state, frame, false, false); + } + else if (lastErrorPos > initialPos) { + // set error pos here as if it is reported for future reference + frame.errorReportedAt = lastErrorPos; + } + } + // propagate errorReportedAt up the stack to avoid duplicate reporting + Frame prevFrame = willFail && eatMore == null ? null : state.frameStack.peekLast(); + if (prevFrame != null && prevFrame.errorReportedAt < frame.errorReportedAt) { + prevFrame.errorReportedAt = frame.errorReportedAt; + } + state.FRAMES.recycle(frame); + } + + private static void exit_section_impl_(ErrorState state, + Frame frame, + PsiBuilder builder_, + PsiBuilder.Marker marker, + IElementType elementType, + boolean result, + boolean pinned) { + if (elementType != null && marker != null) { + if ((frame.modifiers & _COLLAPSE_) != 0) { + PsiBuilderImpl.ProductionMarker last = result || pinned? (PsiBuilderImpl.ProductionMarker)builder_.getLatestDoneMarker() : null; + if (last != null && last.getStartIndex() == frame.position && + state.typeExtends(last.getTokenType(), elementType)) { + IElementType resultType = last.getTokenType(); + ((PsiBuilder.Marker)last).drop(); + marker.done(resultType); + return; + } + } + if (result || pinned) { + if ((frame.modifiers & _LEFT_INNER_) != 0 && frame.leftMarker != null) { + marker.done(elementType); + frame.leftMarker.precede().done(((LighterASTNode)frame.leftMarker).getTokenType()); + frame.leftMarker.drop(); + } + else if ((frame.modifiers & _LEFT_) != 0 && frame.leftMarker != null) { + marker.drop(); + frame.leftMarker.precede().done(elementType); + } + else { + if (frame.level == 0) builder_.eof(); // skip whitespaces + marker.done(elementType); + } + } + else { + close_marker_impl_(frame, marker, null, false); + } + } + else if (result || pinned) { + if (marker != null) marker.drop(); + if ((frame.modifiers & _LEFT_INNER_) != 0 && frame.leftMarker != null) { + 
frame.leftMarker.precede().done(((LighterASTNode)frame.leftMarker).getTokenType()); + frame.leftMarker.drop(); + } + } + else { + close_marker_impl_(frame, marker, null, false); + } + } + + private static void close_marker_impl_(Frame frame, PsiBuilder.Marker marker, IElementType elementType, boolean result) { + if (marker == null) return; + if (result) { + if (elementType != null) { + marker.done(elementType); + } + else { + marker.drop(); + } + } + else { + if (frame != null) { + int position = ((PsiBuilderImpl.ProductionMarker)marker).getStartIndex(); + if (frame.errorReportedAt > position) { + frame.errorReportedAt = frame.errorReportedAtPrev; + } + } + marker.rollbackTo(); + } + } + + public static boolean report_error_(PsiBuilder builder_, boolean result_) { + if (!result_) report_error_(builder_, ErrorState.get(builder_), false); + return result_; + } + + public static void report_error_(PsiBuilder builder_, ErrorState state, boolean advance) { + Frame frame = state.frameStack.isEmpty()? null : state.frameStack.getLast(); + if (frame == null) { + LOG.error("unbalanced enter/exit section call: got null"); + return; + } + int position = builder_.rawTokenIndex(); + if (frame.errorReportedAt < position && getLastVariantPos(state, position + 1) <= position) { + reportError(builder_, state, frame, true, advance); + } + } + + private static int getLastVariantPos(ErrorState state, int defValue) { + return state.lastExpectedVariantPos < 0? defValue : state.lastExpectedVariantPos; + } + + private static boolean reportError(PsiBuilder builder_, + ErrorState state, + Frame frame, + boolean force, + boolean advance) { + String expectedText = state.getExpectedText(builder_); + boolean notEmpty = StringUtil.isNotEmpty(expectedText); + if (force || notEmpty || advance) { + String gotText = builder_.eof()? "unexpected end of file" : + notEmpty? 
"got '" + builder_.getTokenText() +"'" : + "'" + builder_.getTokenText() +"' unexpected"; + String message = expectedText + gotText; + if (advance) { + PsiBuilder.Marker mark = builder_.mark(); + builder_.advanceLexer(); + mark.error(message); + } + else { + builder_.error(message); + } + builder_.eof(); // skip whitespaces + frame.errorReportedAt = builder_.rawTokenIndex(); + return true; + } + return false; + } + + + public static final Key<CompletionState> COMPLETION_STATE_KEY = Key.create("COMPLETION_STATE_KEY"); + + public static class CompletionState implements Function<Object, String> { + public final int offset; + public final Collection<String> items = ContainerUtil.newTroveSet(); + + public CompletionState(int offset_) { + offset = offset_; + } + + @Nullable + public String convertItem(Object o) { + return o instanceof Object[] ? StringUtil.join((Object[]) o, this, " ") : o.toString(); + } + + @Override + public String fun(Object o) { + return o.toString(); + } + + public void addItem(@NotNull PsiBuilder builder, @NotNull String text) { + items.add(text); + } + + public boolean prefixMatches(@NotNull String prefix, @NotNull String variant) { + return StringUtil.startsWithIgnoreCase(variant, prefix); + } + } + + public static class Builder extends PsiBuilderAdapter { + public final ErrorState state; + public final PsiParser parser; + + public Builder(PsiBuilder builder_, ErrorState state_, PsiParser parser_) { + super(builder_); + state = state_; + parser = parser_; + } + + public Lexer getLexer() { + return ((PsiBuilderImpl)myDelegate).getLexer(); + } + } + + public static PsiBuilder adapt_builder_(IElementType root, PsiBuilder builder, PsiParser parser) { + return adapt_builder_(root, builder, parser, null); + } + + public static PsiBuilder adapt_builder_(IElementType root, PsiBuilder builder, PsiParser parser, TokenSet[] extendsSets) { + ErrorState state = new ErrorState(); + ErrorState.initState(state, builder, root, extendsSets); + return new 
Builder(builder, state, parser); + } + + public static class ErrorState { + TokenSet[] extendsSets; + public PairProcessor<IElementType, IElementType> altExtendsChecker; + + int predicateCount; + boolean predicateSign = true; + boolean suppressErrors; + public final LinkedList<Frame> frameStack = new LinkedList<Frame>(); + public CompletionState completionState; + + private boolean caseSensitive; + public boolean altMode; + + int lastExpectedVariantPos = -1; + MyList<Variant> variants = new MyList<Variant>(INITIAL_VARIANTS_SIZE); + MyList<Variant> unexpected = new MyList<Variant>(INITIAL_VARIANTS_SIZE / 10); + + final LimitedPool<Variant> VARIANTS = new LimitedPool<Variant>(VARIANTS_POOL_SIZE, new LimitedPool.ObjectFactory<Variant>() { + @Override + public Variant create() { + return new Variant(); + } + + @Override + public void cleanup(final Variant o) { + } + }); + final LimitedPool<Frame> FRAMES = new LimitedPool<Frame>(FRAMES_POOL_SIZE, new LimitedPool.ObjectFactory<Frame>() { + @Override + public Frame create() { + return new Frame(); + } + + @Override + public void cleanup(final Frame o) { + } + }); + + public static ErrorState get(PsiBuilder builder) { + return ((Builder)builder).state; + } + + public static void initState(ErrorState state, PsiBuilder builder, IElementType root, TokenSet[] extendsSets) { + state.extendsSets = extendsSets; + PsiFile file = builder.getUserDataUnprotected(FileContextUtil.CONTAINING_FILE_KEY); + state.completionState = file == null? null: file.getUserData(COMPLETION_STATE_KEY); + Language language = file == null? 
root.getLanguage() : file.getLanguage(); + state.caseSensitive = language.isCaseSensitive(); + } + + public String getExpectedText(PsiBuilder builder_) { + int position = builder_.rawTokenIndex(); + StringBuilder sb = new StringBuilder(); + if (addExpected(sb, position, true)) { + sb.append(" expected, "); + } + else if (addExpected(sb, position, false)) sb.append(" unexpected, "); + return sb.toString(); + } + + private boolean addExpected(StringBuilder sb, int position, boolean expected) { + MyList<Variant> list = expected ? variants : unexpected; + String[] strings = new String[list.size()]; + long[] hashes = new long[strings.length]; + Arrays.fill(strings, ""); + int count = 0; + loop: for (Variant variant : list) { + if (position == variant.position) { + String text = variant.object.toString(); + long hash = StringHash.calc(text); + for (int i=0; i<count; i++) { + if (hashes[i] == hash) continue loop; + } + hashes[count] = hash; + strings[count] = text; + count++; + } + } + Arrays.sort(strings); + count = 0; + for (String s : strings) { + if (s.length() == 0) continue; + if (count++ > 0) { + if (count > MAX_VARIANTS_TO_DISPLAY) { + sb.append(" and ..."); + break; + } + else { + sb.append(", "); + } + } + char c = s.charAt(0); + String displayText = c == '<' || StringUtil.isJavaIdentifierStart(c) ? s : '\'' + s + '\''; + sb.append(displayText); + } + if (count > 1 && count < MAX_VARIANTS_TO_DISPLAY) { + int idx = sb.lastIndexOf(", "); + sb.replace(idx, idx + 1, " or"); + } + return count > 0; + } + + public void clearVariants(boolean expected, int start) { + MyList<Variant> list = expected? 
variants : unexpected; + if (start < 0 || start >= list.size()) return; + for (int i = start, len = list.size(); i < len; i ++) { + VARIANTS.recycle(list.get(i)); + } + list.setSize(start); + } + + boolean typeExtends(IElementType child_, IElementType parent_) { + if (child_ == parent_) return true; + if (extendsSets != null) { + for (TokenSet set : extendsSets) { + if (set.contains(child_) && set.contains(parent_)) return true; + } + } + return altExtendsChecker != null && altExtendsChecker.process(child_, parent_); + } + } + + public static class Frame { + public int offset; + public int position; + public int level; + public int modifiers; + public String name; + public int variantCount; + public int errorReportedAt; + public int errorReportedAtPrev; + public PsiBuilder.Marker leftMarker; + + public Frame() { + } + + public Frame init(PsiBuilder builder_, ErrorState state, int level_, int modifiers_, String name_) { + offset = builder_.getCurrentOffset(); + position = builder_.rawTokenIndex(); + level = level_; + modifiers = modifiers_; + name = name_; + variantCount = state.variants.size(); + errorReportedAt = -1; + + Frame prev = state.frameStack.peekLast(); + errorReportedAtPrev = prev == null? -1 : prev.errorReportedAt; + leftMarker = null; + return this; + } + + @Override + public String toString() { + String mod = modifiers == _NONE_ ? "_NONE_, " : + ((modifiers & _COLLAPSE_) != 0? "_CAN_COLLAPSE_, ": "") + + ((modifiers & _LEFT_) != 0? "_LEFT_, ": "") + + ((modifiers & _LEFT_INNER_) != 0? "_LEFT_INNER_, ": "") + + ((modifiers & _AND_) != 0? "_AND_, ": "") + + ((modifiers & _NOT_) != 0? 
"_NOT_, ": ""); + return String.format("{%s:%s:%d, %d, %s%s}", offset, position, level, errorReportedAt, mod, name); + } + } + + + private static class Variant { + int position; + Object object; + + public Variant init(int pos, Object o) { + position = pos; + object = o; + return this; + } + + @Override + public String toString() { + return "<" + position + ", " + object + ">"; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Variant variant = (Variant)o; + + if (position != variant.position) return false; + if (!this.object.equals(variant.object)) return false; + + return true; + } + + @Override + public int hashCode() { + int result = position; + result = 31 * result + object.hashCode(); + return result; + } + } + + + private static final int MAX_CHILDREN_IN_TREE = 10; + public static boolean parseAsTree(ErrorState state, final PsiBuilder builder_, int level, final IElementType chunkType, + boolean checkBraces, final Parser parser, final Parser eatMoreCondition) { + final LinkedList<Pair<PsiBuilder.Marker, PsiBuilder.Marker>> parenList = new LinkedList<Pair<PsiBuilder.Marker, PsiBuilder.Marker>>(); + final LinkedList<Pair<PsiBuilder.Marker, Integer>> siblingList = new LinkedList<Pair<PsiBuilder.Marker, Integer>>(); + PsiBuilder.Marker marker = null; + + final Runnable checkSiblingsRunnable = new Runnable() { + @Override + public void run() { + main: + while (!siblingList.isEmpty()) { + final Pair<PsiBuilder.Marker, PsiBuilder.Marker> parenPair = parenList.peek(); + final int rating = siblingList.getFirst().second; + int count = 0; + for (Pair<PsiBuilder.Marker, Integer> pair : siblingList) { + if (pair.second != rating || parenPair != null && pair.first == parenPair.second) break main; + if (++count >= MAX_CHILDREN_IN_TREE) { + final PsiBuilder.Marker parentMarker = pair.first.precede(); + while (count-- > 0) { + siblingList.removeFirst(); + } + 
parentMarker.done(chunkType); + siblingList.addFirst(Pair.create(parentMarker, rating + 1)); + continue main; + } + } + break; + } + } + }; + int totalCount = 0; + int tokenCount = 0; + while (true) { + final IElementType tokenType = builder_.getTokenType(); + if (marker == null) { + marker = builder_.mark(); + } + final boolean result = (!parenList.isEmpty() || eatMoreCondition.parse(builder_, level + 1)) && parser.parse(builder_, level + 1); + if (result) { + tokenCount++; + totalCount++; + } + if (!result) { + break; + } + + if (tokenCount >= MAX_CHILDREN_IN_TREE && marker != null) { + marker.done(chunkType); + siblingList.addFirst(Pair.create(marker, 1)); + checkSiblingsRunnable.run(); + marker = null; + tokenCount = 0; + } + } + if (marker != null) { + marker.drop(); + } + for (Pair<PsiBuilder.Marker, PsiBuilder.Marker> pair : parenList) { + pair.first.drop(); + } + return totalCount != 0; + } + + private static class DummyBlockElementType extends IElementType implements ICompositeElementType{ + DummyBlockElementType() { + super("DUMMY_BLOCK", Language.ANY); + } + + @NotNull + @Override + public ASTNode createCompositeNode() { + return new DummyBlock(); + } + } + + public static class DummyBlock extends CompositePsiElement { + DummyBlock() { + super(DUMMY_BLOCK); + } + + @NotNull + @Override + public PsiReference[] getReferences() { + return PsiReference.EMPTY_ARRAY; + } + + @NotNull + @Override + public Language getLanguage() { + return getParent().getLanguage(); + } + } + + private static class MyList<E> extends ArrayList<E> { + MyList(int initialCapacity) { + super(initialCapacity); + } + + protected void setSize(int fromIndex) { + removeRange(fromIndex, size()); + } + + @Override + public boolean add(E e) { + int size = size(); + if (size >= MAX_VARIANTS_SIZE) { + removeRange(MAX_VARIANTS_SIZE / 4, size - MAX_VARIANTS_SIZE / 4); + } + return super.add(e); + } + } +} diff --git a/src/Markdown/MarkdownLexer.java b/src/Markdown/MarkdownLexer.java new file 
mode 100644 index 00000000..3486d795 --- /dev/null +++ b/src/Markdown/MarkdownLexer.java @@ -0,0 +1,10 @@ +package org.jetbrains.dokka.Markdown; + +import com.intellij.lexer.FlexAdapter; +import org.jetbrains.markdown.impl._MarkdownLexer; + +public class MarkdownLexer extends FlexAdapter { + public MarkdownLexer() { + super(new _MarkdownLexer()); + } +} diff --git a/src/Markdown/MarkdownProcessor.kt b/src/Markdown/MarkdownProcessor.kt new file mode 100644 index 00000000..9843eb68 --- /dev/null +++ b/src/Markdown/MarkdownProcessor.kt @@ -0,0 +1,56 @@ +package org.jetbrains.dokka + +import org.jetbrains.markdown.* +import com.intellij.lang.impl.PsiBuilderImpl +import com.intellij.psi.tree.TokenSet +import com.intellij.lang.Language +import com.intellij.psi.tree.IFileElementType +import com.intellij.lang.LighterASTNode +import com.intellij.util.diff.FlyweightCapableTreeStructure +import com.intellij.openapi.util.Ref +import org.jetbrains.dokka.Markdown.MarkdownLexer + +public class MarkdownProcessor { + class object { + val EXPR_LANGUAGE = object : Language("MARKDOWN") {} + val DOCUMENT = IFileElementType("DOCUMENT", EXPR_LANGUAGE); + } + + public fun parse(markdown: String): MarkdownTree { + val parser = MarkdownParser() + val builder = PsiBuilderImpl(null, null, TokenSet.EMPTY, TokenSet.EMPTY, MarkdownLexer(), null, markdown, null, null) + parser.parse_only_(DOCUMENT, builder) + val light = builder.getLightTree()!! 
+ return MarkdownTree(markdown, light) + } +} + +public class MarkdownTree(private val text: String, private val structure: FlyweightCapableTreeStructure<LighterASTNode>) { + public fun dump(): String { + val sb = StringBuilder() + visit(sb, "", structure.getRoot(), structure, text) + return sb.toString() + } +} + +fun markdownToHtml(markdown : String) : String { + return MarkdownProcessor().parse(markdown).dump() +} + + +fun visit(sb: StringBuilder, indent: String, node: LighterASTNode, structure: FlyweightCapableTreeStructure<LighterASTNode>, markdown: String) { + sb.append(indent) + sb.append(node.getTokenType().toString()) + val nodeText = markdown.substring(node.getStartOffset(), node.getEndOffset()) + sb.append(":" + nodeText.replace("\n","\u23CE")) + sb.appendln() + val ref = Ref.create<Array<LighterASTNode>?>() + val count = structure.getChildren(node, ref) + val children = ref.get() + if (children == null) + return + for (index in 0..count - 1) { + val child = children[index] + visit(sb, indent + " ", child, structure, markdown) + } +}
\ No newline at end of file diff --git a/src/Markdown/MarkdownTokenType.kt b/src/Markdown/MarkdownTokenType.kt new file mode 100644 index 00000000..293228c3 --- /dev/null +++ b/src/Markdown/MarkdownTokenType.kt @@ -0,0 +1,6 @@ +package org.jetbrains.dokka.Markdown + +import com.intellij.psi.tree.IElementType + +public class MarkdownTokenType(debugName: String) : IElementType(debugName, null) { +}
\ No newline at end of file diff --git a/src/Markdown/_MarkdownLexer.flex b/src/Markdown/_MarkdownLexer.flex new file mode 100644 index 00000000..9c76da8f --- /dev/null +++ b/src/Markdown/_MarkdownLexer.flex @@ -0,0 +1,40 @@ +package org.jetbrains.markdown.impl; + +import com.intellij.lexer.*; +import com.intellij.psi.tree.IElementType; +import static org.jetbrains.markdown.MarkdownElementTypes.*; + +%% + +%{ + public _MarkdownLexer() { + this((java.io.Reader)null); + } +%} + +%public +%class _MarkdownLexer +%implements FlexLexer +%function advance +%type IElementType +%unicode + +Newline="\r"|"\n"|"\r\n" +Spacechar=[\ \t\f] +NUMBER=[0-9]+(\.[0-9]*)? +STRING=[^~\*_`&\[\]()<!#\\ \t\n\r]+ +ANYCHAR=. +Line=!'\r' !'\n' .* {Newline} + +%% +<YYINITIAL> { + {Spacechar} { return SPACECHAR; } + {Newline} { return NEWLINE; } + "\\357\\273\\277" { return BOM; } + + {NUMBER} { return NUMBER; } + {STRING} { return STRING; } + {ANYCHAR} { return ANYCHAR; } + + [^] { return com.intellij.psi.TokenType.BAD_CHARACTER; } +} diff --git a/src/Markdown/markdown.bnf b/src/Markdown/markdown.bnf new file mode 100644 index 00000000..b0a3ede6 --- /dev/null +++ b/src/Markdown/markdown.bnf @@ -0,0 +1,83 @@ +{ + psiPackage = 'org.jetbrains.markdown' + psiImplPackage = 'org.jetbrains.markdown.impl' + + parserClass="org.jetbrains.markdown.MarkdownParser" + parserUtilClass="org.jetbrains.dokka.Markdown.GeneratedParserUtilBase" + elementTypeHolderClass = 'org.jetbrains.markdown.MarkdownElementTypes' + + tokenTypeClass = 'org.jetbrains.dokka.Markdown.MarkdownTokenType' + + tokens=[ + LINE_WS='regexp:[\ \t\f]' + EOL='"\r"|"\n"|"\r\n"' + BOM = '\357\273\277' + number='regexp:\d+(\.\d*)?' + String='regexp:[^~\*_`&\[\]()<!#\\ \t\n\r]+' + AnyChar='regexp:.' + ] +} + +Document ::= BOM? ( Block )* + +OptionalSpace ::= Spacechar* +RequiredSpace ::= Spacechar+ +NonindentSpace ::= (" " | " " | " ")? 
+ +BlankLine ::= OptionalSpace Newline + +Whitespace ::= Spacechar | Newline +EndLine ::= LineBreak | TerminalEndline | NormalEndline +NormalEndline ::= OptionalSpace Newline !BlankLine +TerminalEndline ::= OptionalSpace Newline <<eof>> +LineBreak ::= " " NormalEndline +Indent ::= "\t" | " " + +// ---- BLOCKS ---- +Block ::= BlankLine* ( + Para + | Plain + | OrderedList + | BulletList + ) + +Para ::= NonindentSpace Inlines (BlankLine+ | TerminalEndline) +Plain ::= Inlines + +HorizontalRule ::= NonindentSpace + ( '*' OptionalSpace '*' OptionalSpace '*' (OptionalSpace '*')* + | '-' OptionalSpace '-' OptionalSpace '-' (OptionalSpace '-')* + | '_' OptionalSpace '_' OptionalSpace '_' (OptionalSpace '_')*) + OptionalSpace Newline BlankLine+ + +Bullet ::= !HorizontalRule NonindentSpace ('+' | '*' | '-') Spacechar+ +Enumerator ::= NonindentSpace number '.' Spacechar+ + +BulletList ::= &Bullet List +OrderedList ::= &Enumerator List + +List ::= (ListItem BlankLine*)+ +ListItem ::= (Bullet | Enumerator) ListBlock ( ListContinuationBlock )* + +ListBlock ::= !BlankLine Plain ( ListBlockLine )* +ListBlockLine ::= !BlankLine !(Indent? (Bullet | Enumerator)) !HorizontalRule Indent? Plain + +ListContinuationBlock ::= BlankLine* (Indent ListBlock)+ + + +// ---- INLINES ---- +Inlines ::= (!EndLine Inline | EndLine &Inline )+ EndLine? +Inline ::= String | EndLine | RequiredSpace | Strong | Emph | Link + +Emph ::= EmphStar | EmphUnderscore +EmphStar ::= '*' !Whitespace (!'*' Inline)+ '*' +EmphUnderscore ::= '_' !Whitespace (!'_' Inline)+ '_' + +Strong ::= StrongStar | StrongUnderscore +StrongStar ::= '**' !Whitespace (!'**' Inline)+ '**' +StrongUnderscore ::= '__' !Whitespace (!'__' Inline)+ '__' + +Link ::= ReferenceLink +ReferenceLink ::= ReferenceLinkSingle +ReferenceLinkSingle ::= '[' Target ']' +Target ::= String
\ No newline at end of file diff --git a/src/Markdown/markdown.leg b/src/Markdown/markdown.leg new file mode 100644 index 00000000..ea8bc522 --- /dev/null +++ b/src/Markdown/markdown.leg @@ -0,0 +1,781 @@ +%{ +/********************************************************************** + + markdown_parser.leg - markdown parser in C using a PEG grammar. + (c) 2008 John MacFarlane (jgm at berkeley dot edu). + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License or the MIT + license. See LICENSE for details. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + ***********************************************************************/ + +#include <stdbool.h> +#include <assert.h> +#include "markdown_peg.h" +#include "utility_functions.h" + + + +/********************************************************************** + + Definitions for leg parser generator. + YY_INPUT is the function the parser calls to get new input. + We take all new input from (static) charbuf. + + ***********************************************************************/ + + + +# define YYSTYPE element * +#ifdef __DEBUG__ +# define YY_DEBUG 1 +#endif + +#define YY_INPUT(buf, result, max_size) \ +{ \ + int yyc; \ + if (charbuf && *charbuf != '\0') { \ + yyc= *charbuf++; \ + } else { \ + yyc= EOF; \ + } \ + result= (EOF == yyc) ? 0 : (*(buf)= yyc, 1); \ +} + +#define YY_RULE(T) T + + +/********************************************************************** + + PEG grammar and parser actions for markdown syntax. + + ***********************************************************************/ + +%} + +Doc = BOM? 
a:StartList ( Block { a = cons($$, a); } )* + { parse_result = reverse(a); } + +Block = BlankLine* + ( BlockQuote + | Verbatim + | Note + | Reference + | HorizontalRule + | Heading + | OrderedList + | BulletList + | HtmlBlock + | StyleBlock + | Para + | Plain ) + +Para = NonindentSpace a:Inlines BlankLine+ + { $$ = a; $$->key = PARA; } + +Plain = a:Inlines + { $$ = a; $$->key = PLAIN; } + +AtxInline = !Newline !(Sp '#'* Sp Newline) Inline + +AtxStart = < ( "######" | "#####" | "####" | "###" | "##" | "#" ) > + { $$ = mk_element(H1 + (strlen(yytext) - 1)); } + +AtxHeading = s:AtxStart Sp a:StartList ( AtxInline { a = cons($$, a); } )+ (Sp '#'* Sp)? Newline + { $$ = mk_list(s->key, a); + free(s); } + +SetextHeading = SetextHeading1 | SetextHeading2 + +SetextBottom1 = '='+ Newline + +SetextBottom2 = '-'+ Newline + +SetextHeading1 = &(RawLine SetextBottom1) + a:StartList ( !Endline Inline { a = cons($$, a); } )+ Sp Newline + SetextBottom1 { $$ = mk_list(H1, a); } + +SetextHeading2 = &(RawLine SetextBottom2) + a:StartList ( !Endline Inline { a = cons($$, a); } )+ Sp Newline + SetextBottom2 { $$ = mk_list(H2, a); } + +Heading = SetextHeading | AtxHeading + +BlockQuote = a:BlockQuoteRaw + { $$ = mk_element(BLOCKQUOTE); + $$->children = a; + } + +BlockQuoteRaw = a:StartList + (( '>' ' '? 
Line { a = cons($$, a); } ) + ( !'>' !BlankLine Line { a = cons($$, a); } )* + ( BlankLine { a = cons(mk_str("\n"), a); } )* + )+ + { $$ = mk_str_from_list(a, true); + $$->key = RAW; + } + +NonblankIndentedLine = !BlankLine IndentedLine + +VerbatimChunk = a:StartList + ( BlankLine { a = cons(mk_str("\n"), a); } )* + ( NonblankIndentedLine { a = cons($$, a); } )+ + { $$ = mk_str_from_list(a, false); } + +Verbatim = a:StartList ( VerbatimChunk { a = cons($$, a); } )+ + { $$ = mk_str_from_list(a, false); + $$->key = VERBATIM; } + +HorizontalRule = NonindentSpace + ( '*' Sp '*' Sp '*' (Sp '*')* + | '-' Sp '-' Sp '-' (Sp '-')* + | '_' Sp '_' Sp '_' (Sp '_')*) + Sp Newline BlankLine+ + { $$ = mk_element(HRULE); } + +Bullet = !HorizontalRule NonindentSpace ('+' | '*' | '-') Spacechar+ + +BulletList = &Bullet (ListTight | ListLoose) + { $$->key = BULLETLIST; } + +ListTight = a:StartList + ( ListItemTight { a = cons($$, a); } )+ + BlankLine* !(Bullet | Enumerator) + { $$ = mk_list(LIST, a); } + +ListLoose = a:StartList + ( b:ListItem BlankLine* + { element *li; + li = b->children; + li->contents.str = realloc(li->contents.str, strlen(li->contents.str) + 3); + strcat(li->contents.str, "\n\n"); /* In loose list, \n\n added to end of each element */ + a = cons(b, a); + } )+ + { $$ = mk_list(LIST, a); } + +ListItem = ( Bullet | Enumerator ) + a:StartList + ListBlock { a = cons($$, a); } + ( ListContinuationBlock { a = cons($$, a); } )* + { element *raw; + raw = mk_str_from_list(a, false); + raw->key = RAW; + $$ = mk_element(LISTITEM); + $$->children = raw; + } + +ListItemTight = + ( Bullet | Enumerator ) + a:StartList + ListBlock { a = cons($$, a); } + ( !BlankLine + ListContinuationBlock { a = cons($$, a); } )* + !ListContinuationBlock + { element *raw; + raw = mk_str_from_list(a, false); + raw->key = RAW; + $$ = mk_element(LISTITEM); + $$->children = raw; + } + +ListBlock = a:StartList + !BlankLine Line { a = cons($$, a); } + ( ListBlockLine { a = cons($$, a); } )* + { $$ = 
mk_str_from_list(a, false); } + +ListContinuationBlock = a:StartList + ( < BlankLine* > + { if (strlen(yytext) == 0) + a = cons(mk_str("\001"), a); /* block separator */ + else + a = cons(mk_str(yytext), a); } ) + ( Indent ListBlock { a = cons($$, a); } )+ + { $$ = mk_str_from_list(a, false); } + +Enumerator = NonindentSpace [0-9]+ '.' Spacechar+ + +OrderedList = &Enumerator (ListTight | ListLoose) + { $$->key = ORDEREDLIST; } + +ListBlockLine = !BlankLine + !( Indent? (Bullet | Enumerator) ) + !HorizontalRule + OptionallyIndentedLine + +# Parsers for different kinds of block-level HTML content. +# This is repetitive due to constraints of PEG grammar. + +HtmlBlockOpenAddress = '<' Spnl ("address" | "ADDRESS") Spnl HtmlAttribute* '>' +HtmlBlockCloseAddress = '<' Spnl '/' ("address" | "ADDRESS") Spnl '>' +HtmlBlockAddress = HtmlBlockOpenAddress (HtmlBlockAddress | !HtmlBlockCloseAddress .)* HtmlBlockCloseAddress + +HtmlBlockOpenBlockquote = '<' Spnl ("blockquote" | "BLOCKQUOTE") Spnl HtmlAttribute* '>' +HtmlBlockCloseBlockquote = '<' Spnl '/' ("blockquote" | "BLOCKQUOTE") Spnl '>' +HtmlBlockBlockquote = HtmlBlockOpenBlockquote (HtmlBlockBlockquote | !HtmlBlockCloseBlockquote .)* HtmlBlockCloseBlockquote + +HtmlBlockOpenCenter = '<' Spnl ("center" | "CENTER") Spnl HtmlAttribute* '>' +HtmlBlockCloseCenter = '<' Spnl '/' ("center" | "CENTER") Spnl '>' +HtmlBlockCenter = HtmlBlockOpenCenter (HtmlBlockCenter | !HtmlBlockCloseCenter .)* HtmlBlockCloseCenter + +HtmlBlockOpenDir = '<' Spnl ("dir" | "DIR") Spnl HtmlAttribute* '>' +HtmlBlockCloseDir = '<' Spnl '/' ("dir" | "DIR") Spnl '>' +HtmlBlockDir = HtmlBlockOpenDir (HtmlBlockDir | !HtmlBlockCloseDir .)* HtmlBlockCloseDir + +HtmlBlockOpenDiv = '<' Spnl ("div" | "DIV") Spnl HtmlAttribute* '>' +HtmlBlockCloseDiv = '<' Spnl '/' ("div" | "DIV") Spnl '>' +HtmlBlockDiv = HtmlBlockOpenDiv (HtmlBlockDiv | !HtmlBlockCloseDiv .)* HtmlBlockCloseDiv + +HtmlBlockOpenDl = '<' Spnl ("dl" | "DL") Spnl HtmlAttribute* '>' +HtmlBlockCloseDl 
= '<' Spnl '/' ("dl" | "DL") Spnl '>' +HtmlBlockDl = HtmlBlockOpenDl (HtmlBlockDl | !HtmlBlockCloseDl .)* HtmlBlockCloseDl + +HtmlBlockOpenFieldset = '<' Spnl ("fieldset" | "FIELDSET") Spnl HtmlAttribute* '>' +HtmlBlockCloseFieldset = '<' Spnl '/' ("fieldset" | "FIELDSET") Spnl '>' +HtmlBlockFieldset = HtmlBlockOpenFieldset (HtmlBlockFieldset | !HtmlBlockCloseFieldset .)* HtmlBlockCloseFieldset + +HtmlBlockOpenForm = '<' Spnl ("form" | "FORM") Spnl HtmlAttribute* '>' +HtmlBlockCloseForm = '<' Spnl '/' ("form" | "FORM") Spnl '>' +HtmlBlockForm = HtmlBlockOpenForm (HtmlBlockForm | !HtmlBlockCloseForm .)* HtmlBlockCloseForm + +HtmlBlockOpenH1 = '<' Spnl ("h1" | "H1") Spnl HtmlAttribute* '>' +HtmlBlockCloseH1 = '<' Spnl '/' ("h1" | "H1") Spnl '>' +HtmlBlockH1 = HtmlBlockOpenH1 (HtmlBlockH1 | !HtmlBlockCloseH1 .)* HtmlBlockCloseH1 + +HtmlBlockOpenH2 = '<' Spnl ("h2" | "H2") Spnl HtmlAttribute* '>' +HtmlBlockCloseH2 = '<' Spnl '/' ("h2" | "H2") Spnl '>' +HtmlBlockH2 = HtmlBlockOpenH2 (HtmlBlockH2 | !HtmlBlockCloseH2 .)* HtmlBlockCloseH2 + +HtmlBlockOpenH3 = '<' Spnl ("h3" | "H3") Spnl HtmlAttribute* '>' +HtmlBlockCloseH3 = '<' Spnl '/' ("h3" | "H3") Spnl '>' +HtmlBlockH3 = HtmlBlockOpenH3 (HtmlBlockH3 | !HtmlBlockCloseH3 .)* HtmlBlockCloseH3 + +HtmlBlockOpenH4 = '<' Spnl ("h4" | "H4") Spnl HtmlAttribute* '>' +HtmlBlockCloseH4 = '<' Spnl '/' ("h4" | "H4") Spnl '>' +HtmlBlockH4 = HtmlBlockOpenH4 (HtmlBlockH4 | !HtmlBlockCloseH4 .)* HtmlBlockCloseH4 + +HtmlBlockOpenH5 = '<' Spnl ("h5" | "H5") Spnl HtmlAttribute* '>' +HtmlBlockCloseH5 = '<' Spnl '/' ("h5" | "H5") Spnl '>' +HtmlBlockH5 = HtmlBlockOpenH5 (HtmlBlockH5 | !HtmlBlockCloseH5 .)* HtmlBlockCloseH5 + +HtmlBlockOpenH6 = '<' Spnl ("h6" | "H6") Spnl HtmlAttribute* '>' +HtmlBlockCloseH6 = '<' Spnl '/' ("h6" | "H6") Spnl '>' +HtmlBlockH6 = HtmlBlockOpenH6 (HtmlBlockH6 | !HtmlBlockCloseH6 .)* HtmlBlockCloseH6 + +HtmlBlockOpenMenu = '<' Spnl ("menu" | "MENU") Spnl HtmlAttribute* '>' +HtmlBlockCloseMenu = '<' Spnl '/' ("menu" 
| "MENU") Spnl '>' +HtmlBlockMenu = HtmlBlockOpenMenu (HtmlBlockMenu | !HtmlBlockCloseMenu .)* HtmlBlockCloseMenu + +HtmlBlockOpenNoframes = '<' Spnl ("noframes" | "NOFRAMES") Spnl HtmlAttribute* '>' +HtmlBlockCloseNoframes = '<' Spnl '/' ("noframes" | "NOFRAMES") Spnl '>' +HtmlBlockNoframes = HtmlBlockOpenNoframes (HtmlBlockNoframes | !HtmlBlockCloseNoframes .)* HtmlBlockCloseNoframes + +HtmlBlockOpenNoscript = '<' Spnl ("noscript" | "NOSCRIPT") Spnl HtmlAttribute* '>' +HtmlBlockCloseNoscript = '<' Spnl '/' ("noscript" | "NOSCRIPT") Spnl '>' +HtmlBlockNoscript = HtmlBlockOpenNoscript (HtmlBlockNoscript | !HtmlBlockCloseNoscript .)* HtmlBlockCloseNoscript + +HtmlBlockOpenOl = '<' Spnl ("ol" | "OL") Spnl HtmlAttribute* '>' +HtmlBlockCloseOl = '<' Spnl '/' ("ol" | "OL") Spnl '>' +HtmlBlockOl = HtmlBlockOpenOl (HtmlBlockOl | !HtmlBlockCloseOl .)* HtmlBlockCloseOl + +HtmlBlockOpenP = '<' Spnl ("p" | "P") Spnl HtmlAttribute* '>' +HtmlBlockCloseP = '<' Spnl '/' ("p" | "P") Spnl '>' +HtmlBlockP = HtmlBlockOpenP (HtmlBlockP | !HtmlBlockCloseP .)* HtmlBlockCloseP + +HtmlBlockOpenPre = '<' Spnl ("pre" | "PRE") Spnl HtmlAttribute* '>' +HtmlBlockClosePre = '<' Spnl '/' ("pre" | "PRE") Spnl '>' +HtmlBlockPre = HtmlBlockOpenPre (HtmlBlockPre | !HtmlBlockClosePre .)* HtmlBlockClosePre + +HtmlBlockOpenTable = '<' Spnl ("table" | "TABLE") Spnl HtmlAttribute* '>' +HtmlBlockCloseTable = '<' Spnl '/' ("table" | "TABLE") Spnl '>' +HtmlBlockTable = HtmlBlockOpenTable (HtmlBlockTable | !HtmlBlockCloseTable .)* HtmlBlockCloseTable + +HtmlBlockOpenUl = '<' Spnl ("ul" | "UL") Spnl HtmlAttribute* '>' +HtmlBlockCloseUl = '<' Spnl '/' ("ul" | "UL") Spnl '>' +HtmlBlockUl = HtmlBlockOpenUl (HtmlBlockUl | !HtmlBlockCloseUl .)* HtmlBlockCloseUl + +HtmlBlockOpenDd = '<' Spnl ("dd" | "DD") Spnl HtmlAttribute* '>' +HtmlBlockCloseDd = '<' Spnl '/' ("dd" | "DD") Spnl '>' +HtmlBlockDd = HtmlBlockOpenDd (HtmlBlockDd | !HtmlBlockCloseDd .)* HtmlBlockCloseDd + +HtmlBlockOpenDt = '<' Spnl ("dt" | "DT") Spnl 
HtmlAttribute* '>' +HtmlBlockCloseDt = '<' Spnl '/' ("dt" | "DT") Spnl '>' +HtmlBlockDt = HtmlBlockOpenDt (HtmlBlockDt | !HtmlBlockCloseDt .)* HtmlBlockCloseDt + +HtmlBlockOpenFrameset = '<' Spnl ("frameset" | "FRAMESET") Spnl HtmlAttribute* '>' +HtmlBlockCloseFrameset = '<' Spnl '/' ("frameset" | "FRAMESET") Spnl '>' +HtmlBlockFrameset = HtmlBlockOpenFrameset (HtmlBlockFrameset | !HtmlBlockCloseFrameset .)* HtmlBlockCloseFrameset + +HtmlBlockOpenLi = '<' Spnl ("li" | "LI") Spnl HtmlAttribute* '>' +HtmlBlockCloseLi = '<' Spnl '/' ("li" | "LI") Spnl '>' +HtmlBlockLi = HtmlBlockOpenLi (HtmlBlockLi | !HtmlBlockCloseLi .)* HtmlBlockCloseLi + +HtmlBlockOpenTbody = '<' Spnl ("tbody" | "TBODY") Spnl HtmlAttribute* '>' +HtmlBlockCloseTbody = '<' Spnl '/' ("tbody" | "TBODY") Spnl '>' +HtmlBlockTbody = HtmlBlockOpenTbody (HtmlBlockTbody | !HtmlBlockCloseTbody .)* HtmlBlockCloseTbody + +HtmlBlockOpenTd = '<' Spnl ("td" | "TD") Spnl HtmlAttribute* '>' +HtmlBlockCloseTd = '<' Spnl '/' ("td" | "TD") Spnl '>' +HtmlBlockTd = HtmlBlockOpenTd (HtmlBlockTd | !HtmlBlockCloseTd .)* HtmlBlockCloseTd + +HtmlBlockOpenTfoot = '<' Spnl ("tfoot" | "TFOOT") Spnl HtmlAttribute* '>' +HtmlBlockCloseTfoot = '<' Spnl '/' ("tfoot" | "TFOOT") Spnl '>' +HtmlBlockTfoot = HtmlBlockOpenTfoot (HtmlBlockTfoot | !HtmlBlockCloseTfoot .)* HtmlBlockCloseTfoot + +HtmlBlockOpenTh = '<' Spnl ("th" | "TH") Spnl HtmlAttribute* '>' +HtmlBlockCloseTh = '<' Spnl '/' ("th" | "TH") Spnl '>' +HtmlBlockTh = HtmlBlockOpenTh (HtmlBlockTh | !HtmlBlockCloseTh .)* HtmlBlockCloseTh + +HtmlBlockOpenThead = '<' Spnl ("thead" | "THEAD") Spnl HtmlAttribute* '>' +HtmlBlockCloseThead = '<' Spnl '/' ("thead" | "THEAD") Spnl '>' +HtmlBlockThead = HtmlBlockOpenThead (HtmlBlockThead | !HtmlBlockCloseThead .)* HtmlBlockCloseThead + +HtmlBlockOpenTr = '<' Spnl ("tr" | "TR") Spnl HtmlAttribute* '>' +HtmlBlockCloseTr = '<' Spnl '/' ("tr" | "TR") Spnl '>' +HtmlBlockTr = HtmlBlockOpenTr (HtmlBlockTr | !HtmlBlockCloseTr .)* HtmlBlockCloseTr + 
+HtmlBlockOpenScript = '<' Spnl ("script" | "SCRIPT") Spnl HtmlAttribute* '>' +HtmlBlockCloseScript = '<' Spnl '/' ("script" | "SCRIPT") Spnl '>' +HtmlBlockScript = HtmlBlockOpenScript (!HtmlBlockCloseScript .)* HtmlBlockCloseScript + +HtmlBlockOpenHead = '<' Spnl ("head" | "HEAD") Spnl HtmlAttribute* '>' +HtmlBlockCloseHead = '<' Spnl '/' ("head" | "HEAD") Spnl '>' +HtmlBlockHead = HtmlBlockOpenHead (!HtmlBlockCloseHead .)* HtmlBlockCloseHead + +HtmlBlockInTags = HtmlBlockAddress + | HtmlBlockBlockquote + | HtmlBlockCenter + | HtmlBlockDir + | HtmlBlockDiv + | HtmlBlockDl + | HtmlBlockFieldset + | HtmlBlockForm + | HtmlBlockH1 + | HtmlBlockH2 + | HtmlBlockH3 + | HtmlBlockH4 + | HtmlBlockH5 + | HtmlBlockH6 + | HtmlBlockMenu + | HtmlBlockNoframes + | HtmlBlockNoscript + | HtmlBlockOl + | HtmlBlockP + | HtmlBlockPre + | HtmlBlockTable + | HtmlBlockUl + | HtmlBlockDd + | HtmlBlockDt + | HtmlBlockFrameset + | HtmlBlockLi + | HtmlBlockTbody + | HtmlBlockTd + | HtmlBlockTfoot + | HtmlBlockTh + | HtmlBlockThead + | HtmlBlockTr + | HtmlBlockScript + | HtmlBlockHead + +HtmlBlock = < ( HtmlBlockInTags | HtmlComment | HtmlBlockSelfClosing ) > + BlankLine+ + { if (extension(EXT_FILTER_HTML)) { + $$ = mk_list(LIST, NULL); + } else { + $$ = mk_str(yytext); + $$->key = HTMLBLOCK; + } + } + +HtmlBlockSelfClosing = '<' Spnl HtmlBlockType Spnl HtmlAttribute* '/' Spnl '>' + +HtmlBlockType = "address" | "blockquote" | "center" | "dir" | "div" | "dl" | "fieldset" | "form" | "h1" | "h2" | "h3" | + "h4" | "h5" | "h6" | "hr" | "isindex" | "menu" | "noframes" | "noscript" | "ol" | "p" | "pre" | "table" | + "ul" | "dd" | "dt" | "frameset" | "li" | "tbody" | "td" | "tfoot" | "th" | "thead" | "tr" | "script" | + "ADDRESS" | "BLOCKQUOTE" | "CENTER" | "DIR" | "DIV" | "DL" | "FIELDSET" | "FORM" | "H1" | "H2" | "H3" | + "H4" | "H5" | "H6" | "HR" | "ISINDEX" | "MENU" | "NOFRAMES" | "NOSCRIPT" | "OL" | "P" | "PRE" | "TABLE" | + "UL" | "DD" | "DT" | "FRAMESET" | "LI" | "TBODY" | "TD" | "TFOOT" | 
"TH" | "THEAD" | "TR" | "SCRIPT" + +StyleOpen = '<' Spnl ("style" | "STYLE") Spnl HtmlAttribute* '>' +StyleClose = '<' Spnl '/' ("style" | "STYLE") Spnl '>' +InStyleTags = StyleOpen (!StyleClose .)* StyleClose +StyleBlock = < InStyleTags > + BlankLine* + { if (extension(EXT_FILTER_STYLES)) { + $$ = mk_list(LIST, NULL); + } else { + $$ = mk_str(yytext); + $$->key = HTMLBLOCK; + } + } + +Inlines = a:StartList ( !Endline Inline { a = cons($$, a); } + | c:Endline &Inline { a = cons(c, a); } )+ Endline? + { $$ = mk_list(LIST, a); } + +Inline = Str + | Endline + | UlOrStarLine + | Space + | Strong + | Emph + | Strike + | Image + | Link + | NoteReference + | InlineNote + | Code + | RawHtml + | Entity + | EscapedChar + | Smart + | Symbol + +Space = Spacechar+ + { $$ = mk_str(" "); + $$->key = SPACE; } + +Str = a:StartList < NormalChar+ > { a = cons(mk_str(yytext), a); } + ( StrChunk { a = cons($$, a); } )* + { if (a->next == NULL) { $$ = a; } else { $$ = mk_list(LIST, a); } } + +StrChunk = < (NormalChar | '_'+ &Alphanumeric)+ > { $$ = mk_str(yytext); } | + AposChunk + +AposChunk = &{ extension(EXT_SMART) } '\'' &Alphanumeric + { $$ = mk_element(APOSTROPHE); } + +EscapedChar = '\\' !Newline < [-\\`|*_{}[\]()#+.!><] > + { $$ = mk_str(yytext); } + +Entity = ( HexEntity | DecEntity | CharEntity ) + { $$ = mk_str(yytext); $$->key = HTML; } + +Endline = LineBreak | TerminalEndline | NormalEndline + +NormalEndline = Sp Newline !BlankLine !'>' !AtxStart + !(Line ('='+ | '-'+) Newline) + { $$ = mk_str("\n"); + $$->key = SPACE; } + +TerminalEndline = Sp Newline Eof + { $$ = NULL; } + +LineBreak = " " NormalEndline + { $$ = mk_element(LINEBREAK); } + +Symbol = < SpecialChar > + { $$ = mk_str(yytext); } + +# This keeps the parser from getting bogged down on long strings of '*' or '_', +# or strings of '*' or '_' with space on each side: +UlOrStarLine = (UlLine | StarLine) { $$ = mk_str(yytext); } +StarLine = < "****" '*'* > | < Spacechar '*'+ &Spacechar > +UlLine = < "____" '_'* > | < 
Spacechar '_'+ &Spacechar > + +Emph = EmphStar | EmphUl + +Whitespace = Spacechar | Newline + +EmphStar = '*' !Whitespace + a:StartList + ( !'*' b:Inline { a = cons(b, a); } + | b:StrongStar { a = cons(b, a); } + )+ + '*' + { $$ = mk_list(EMPH, a); } + +EmphUl = '_' !Whitespace + a:StartList + ( !'_' b:Inline { a = cons(b, a); } + | b:StrongUl { a = cons(b, a); } + )+ + '_' + { $$ = mk_list(EMPH, a); } + +Strong = StrongStar | StrongUl + +StrongStar = "**" !Whitespace + a:StartList + ( !"**" b:Inline { a = cons(b, a); })+ + "**" + { $$ = mk_list(STRONG, a); } + +StrongUl = "__" !Whitespace + a:StartList + ( !"__" b:Inline { a = cons(b, a); })+ + "__" + { $$ = mk_list(STRONG, a); } + +Strike = &{ extension(EXT_STRIKE) } + "~~" !Whitespace + a:StartList + ( !"~~" b:Inline { a = cons(b, a); })+ + "~~" + { $$ = mk_list(STRIKE, a); } + +Image = '!' ( ExplicitLink | ReferenceLink ) + { if ($$->key == LINK) { + $$->key = IMAGE; + } else { + element *result; + result = $$; + $$->children = cons(mk_str("!"), result->children); + } } + +Link = ExplicitLink | ReferenceLink | AutoLink + +ReferenceLink = ReferenceLinkDouble | ReferenceLinkSingle + +ReferenceLinkDouble = a:Label < Spnl > !"[]" b:Label + { link match; + if (find_reference(&match, b->children)) { + $$ = mk_link(a->children, match.url, match.title); + free(a); + free_element_list(b); + } else { + element *result; + result = mk_element(LIST); + result->children = cons(mk_str("["), cons(a, cons(mk_str("]"), cons(mk_str(yytext), + cons(mk_str("["), cons(b, mk_str("]"))))))); + $$ = result; + } + } + +ReferenceLinkSingle = a:Label < (Spnl "[]")? 
> + { link match; + if (find_reference(&match, a->children)) { + $$ = mk_link(a->children, match.url, match.title); + free(a); + } + else { + element *result; + result = mk_element(LIST); + result->children = cons(mk_str("["), cons(a, cons(mk_str("]"), mk_str(yytext)))); + $$ = result; + } + } + +ExplicitLink = l:Label '(' Sp s:Source Spnl t:Title Sp ')' + { $$ = mk_link(l->children, s->contents.str, t->contents.str); + free_element(s); + free_element(t); + free(l); } + +Source = ( '<' < SourceContents > '>' | < SourceContents > ) + { $$ = mk_str(yytext); } + +SourceContents = ( ( !'(' !')' !'>' Nonspacechar )+ | '(' SourceContents ')')* + +Title = ( TitleSingle | TitleDouble | < "" > ) + { $$ = mk_str(yytext); } + +TitleSingle = '\'' < ( !( '\'' Sp ( ')' | Newline ) ) . )* > '\'' + +TitleDouble = '"' < ( !( '"' Sp ( ')' | Newline ) ) . )* > '"' + +AutoLink = AutoLinkUrl | AutoLinkEmail + +AutoLinkUrl = '<' < [A-Za-z]+ "://" ( !Newline !'>' . )+ > '>' + { $$ = mk_link(mk_str(yytext), yytext, ""); } + +AutoLinkEmail = '<' ( "mailto:" )? < [-A-Za-z0-9+_./!%~$]+ '@' ( !Newline !'>' . )+ > '>' + { char *mailto = malloc(strlen(yytext) + 8); + sprintf(mailto, "mailto:%s", yytext); + $$ = mk_link(mk_str(yytext), mailto, ""); + free(mailto); + } + +Reference = NonindentSpace !"[]" l:Label ':' Spnl s:RefSrc t:RefTitle BlankLine+ + { $$ = mk_link(l->children, s->contents.str, t->contents.str); + free_element(s); + free_element(t); + free(l); + $$->key = REFERENCE; } + +Label = '[' ( !'^' &{ extension(EXT_NOTES) } | &. &{ !extension(EXT_NOTES) } ) + a:StartList + ( !']' Inline { a = cons($$, a); } )* + ']' + { $$ = mk_list(LIST, a); } + +RefSrc = < Nonspacechar+ > + { $$ = mk_str(yytext); + $$->key = HTML; } + +RefTitle = ( RefTitleSingle | RefTitleDouble | RefTitleParens | EmptyTitle ) + { $$ = mk_str(yytext); } + +EmptyTitle = < "" > + +RefTitleSingle = Spnl '\'' < ( !( '\'' Sp Newline | Newline ) . )* > '\'' + +RefTitleDouble = Spnl '"' < ( !('"' Sp Newline | Newline) . 
)* > '"' + +RefTitleParens = Spnl '(' < ( !(')' Sp Newline | Newline) . )* > ')' + +References = a:StartList + ( b:Reference { a = cons(b, a); } | SkipBlock )* + { references = reverse(a); } + +Ticks1 = "`" !'`' +Ticks2 = "``" !'`' +Ticks3 = "```" !'`' +Ticks4 = "````" !'`' +Ticks5 = "`````" !'`' + +Code = ( Ticks1 Sp < ( ( !'`' Nonspacechar )+ | !Ticks1 '`'+ | !( Sp Ticks1 ) ( Spacechar | Newline !BlankLine ) )+ > Sp Ticks1 + | Ticks2 Sp < ( ( !'`' Nonspacechar )+ | !Ticks2 '`'+ | !( Sp Ticks2 ) ( Spacechar | Newline !BlankLine ) )+ > Sp Ticks2 + | Ticks3 Sp < ( ( !'`' Nonspacechar )+ | !Ticks3 '`'+ | !( Sp Ticks3 ) ( Spacechar | Newline !BlankLine ) )+ > Sp Ticks3 + | Ticks4 Sp < ( ( !'`' Nonspacechar )+ | !Ticks4 '`'+ | !( Sp Ticks4 ) ( Spacechar | Newline !BlankLine ) )+ > Sp Ticks4 + | Ticks5 Sp < ( ( !'`' Nonspacechar )+ | !Ticks5 '`'+ | !( Sp Ticks5 ) ( Spacechar | Newline !BlankLine ) )+ > Sp Ticks5 + ) + { $$ = mk_str(yytext); $$->key = CODE; } + +RawHtml = < (HtmlComment | HtmlBlockScript | HtmlTag) > + { if (extension(EXT_FILTER_HTML)) { + $$ = mk_list(LIST, NULL); + } else { + $$ = mk_str(yytext); + $$->key = HTML; + } + } + +BlankLine = Sp Newline + +Quoted = '"' (!'"' .)* '"' | '\'' (!'\'' .)* '\'' +HtmlAttribute = (AlphanumericAscii | '-')+ Spnl ('=' Spnl (Quoted | (!'>' Nonspacechar)+))? Spnl +HtmlComment = "<!--" (!"-->" .)* "-->" +HtmlTag = '<' Spnl '/'? AlphanumericAscii+ Spnl HtmlAttribute* '/'? Spnl '>' +Eof = !. +Spacechar = ' ' | '\t' +Nonspacechar = !Spacechar !Newline . +Newline = '\n' | '\r' '\n'? +Sp = Spacechar* +Spnl = Sp (Newline Sp)? +SpecialChar = '~' | '*' | '_' | '`' | '&' | '[' | ']' | '(' | ')' | '<' | '!' | '#' | '\\' | '\'' | '"' | ExtendedSpecialChar +NormalChar = !( SpecialChar | Spacechar | Newline ) . 
+Alphanumeric = [0-9A-Za-z] | '\200' | '\201' | '\202' | '\203' | '\204' | '\205' | '\206' | '\207' | '\210' | '\211' | '\212' | '\213' | '\214' | '\215' | '\216' | '\217' | '\220' | '\221' | '\222' | '\223' | '\224' | '\225' | '\226' | '\227' | '\230' | '\231' | '\232' | '\233' | '\234' | '\235' | '\236' | '\237' | '\240' | '\241' | '\242' | '\243' | '\244' | '\245' | '\246' | '\247' | '\250' | '\251' | '\252' | '\253' | '\254' | '\255' | '\256' | '\257' | '\260' | '\261' | '\262' | '\263' | '\264' | '\265' | '\266' | '\267' | '\270' | '\271' | '\272' | '\273' | '\274' | '\275' | '\276' | '\277' | '\300' | '\301' | '\302' | '\303' | '\304' | '\305' | '\306' | '\307' | '\310' | '\311' | '\312' | '\313' | '\314' | '\315' | '\316' | '\317' | '\320' | '\321' | '\322' | '\323' | '\324' | '\325' | '\326' | '\327' | '\330' | '\331' | '\332' | '\333' | '\334' | '\335' | '\336' | '\337' | '\340' | '\341' | '\342' | '\343' | '\344' | '\345' | '\346' | '\347' | '\350' | '\351' | '\352' | '\353' | '\354' | '\355' | '\356' | '\357' | '\360' | '\361' | '\362' | '\363' | '\364' | '\365' | '\366' | '\367' | '\370' | '\371' | '\372' | '\373' | '\374' | '\375' | '\376' | '\377' +AlphanumericAscii = [A-Za-z0-9] +Digit = [0-9] +BOM = "\357\273\277" + +HexEntity = < '&' '#' [Xx] [0-9a-fA-F]+ ';' > +DecEntity = < '&' '#' [0-9]+ > ';' > +CharEntity = < '&' [A-Za-z0-9]+ ';' > + +NonindentSpace = " " | " " | " " | "" +Indent = "\t" | " " +IndentedLine = Indent Line +OptionallyIndentedLine = Indent? Line + +# StartList starts a list data structure that can be added to with cons: +StartList = &. + { $$ = NULL; } + +Line = RawLine + { $$ = mk_str(yytext); } +RawLine = ( < (!'\r' !'\n' .)* Newline > | < .+ > Eof ) + +SkipBlock = HtmlBlock + | ( !'#' !SetextBottom1 !SetextBottom2 !BlankLine RawLine )+ BlankLine* + | BlankLine+ + | RawLine + +# Syntax extensions + +ExtendedSpecialChar = &{ extension(EXT_SMART) } ('.' 
| '-' | '\'' | '"') + | &{ extension(EXT_NOTES) } ( '^' ) + +Smart = &{ extension(EXT_SMART) } + ( Ellipsis | Dash | SingleQuoted | DoubleQuoted | Apostrophe ) + +Apostrophe = '\'' + { $$ = mk_element(APOSTROPHE); } + +Ellipsis = ("..." | ". . .") + { $$ = mk_element(ELLIPSIS); } + +Dash = EmDash | EnDash + +EnDash = '-' &Digit + { $$ = mk_element(ENDASH); } + +EmDash = ("---" | "--") + { $$ = mk_element(EMDASH); } + +SingleQuoteStart = '\'' !(Spacechar | Newline) + +SingleQuoteEnd = '\'' !Alphanumeric + +SingleQuoted = SingleQuoteStart + a:StartList + ( !SingleQuoteEnd b:Inline { a = cons(b, a); } )+ + SingleQuoteEnd + { $$ = mk_list(SINGLEQUOTED, a); } + +DoubleQuoteStart = '"' + +DoubleQuoteEnd = '"' + +DoubleQuoted = DoubleQuoteStart + a:StartList + ( !DoubleQuoteEnd b:Inline { a = cons(b, a); } )+ + DoubleQuoteEnd + { $$ = mk_list(DOUBLEQUOTED, a); } + +NoteReference = &{ extension(EXT_NOTES) } + ref:RawNoteReference + { element *match; + if (find_note(&match, ref->contents.str)) { + $$ = mk_element(NOTE); + assert(match->children != NULL); + $$->children = match->children; + $$->contents.str = 0; + } else { + char *s; + s = malloc(strlen(ref->contents.str) + 4); + sprintf(s, "[^%s]", ref->contents.str); + $$ = mk_str(s); + free(s); + } + } + +RawNoteReference = "[^" < ( !Newline !']' . 
)+ > ']' + { $$ = mk_str(yytext); } + +Note = &{ extension(EXT_NOTES) } + NonindentSpace ref:RawNoteReference ':' Sp + a:StartList + ( RawNoteBlock { a = cons($$, a); } ) + ( &Indent RawNoteBlock { a = cons($$, a); } )* + { $$ = mk_list(NOTE, a); + $$->contents.str = strdup(ref->contents.str); + } + +InlineNote = &{ extension(EXT_NOTES) } + "^[" + a:StartList + ( !']' Inline { a = cons($$, a); } )+ + ']' + { $$ = mk_list(NOTE, a); + $$->contents.str = 0; } + +Notes = a:StartList + ( b:Note { a = cons(b, a); } | SkipBlock )* + { notes = reverse(a); } + +RawNoteBlock = a:StartList + ( !BlankLine OptionallyIndentedLine { a = cons($$, a); } )+ + ( < BlankLine* > { a = cons(mk_str(yytext), a); } ) + { $$ = mk_str_from_list(a, true); + $$->key = RAW; + } + +%% + diff --git a/src/Model/DocumentationContent.kt b/src/Model/DocumentationContent.kt index cebb429b..77e8c764 100644 --- a/src/Model/DocumentationContent.kt +++ b/src/Model/DocumentationContent.kt @@ -9,40 +9,38 @@ public class DocumentationContentSection(public val label: String, public val te } } -// TODO: refactor sections to map -public class DocumentationContent(public val summary: RichString, - public val description: RichString, - public val sections: List<DocumentationContentSection>) { +public class DocumentationContent(public val sections: Map<String, DocumentationContentSection>) { + + public val summary: RichString get() = sections["\$summary"]?.text ?: RichString.empty + public val description: RichString get() = sections["\$description"]?.text ?: RichString.empty override fun equals(other: Any?): Boolean { if (other !is DocumentationContent) return false - if (summary != other.summary) - return false if (sections.size != other.sections.size) return false - for (index in sections.indices) - if (sections[index] != other.sections[index]) + for (keys in sections.keySet()) + if (sections[keys] != other.sections[keys]) return false return true } override fun hashCode(): Int { - return summary.hashCode() + 
sections.map { it.hashCode() }.sum() + return sections.map { it.hashCode() }.sum() } override fun toString(): String { if (sections.isEmpty()) - return summary.toString() - return "$summary | " + sections.joinToString() + return "<empty>" + return sections.values().joinToString() } val isEmpty: Boolean get() = description.isEmpty() && sections.none() class object { - val Empty = DocumentationContent(RichString.empty, RichString.empty, listOf()) + val Empty = DocumentationContent(mapOf()) } } @@ -50,31 +48,33 @@ public class DocumentationContent(public val summary: RichString, fun BindingContext.getDocumentation(descriptor: DeclarationDescriptor): DocumentationContent { val docText = getDocumentationElements(descriptor).map { it.extractText() }.join("\n") val sections = docText.parseSections() - val (summary, description) = sections.extractSummaryAndDescription() - return DocumentationContent(summary, description, sections.drop(1)) + sections.createSummaryAndDescription() + return DocumentationContent(sections) } -fun List<DocumentationContentSection>.extractSummaryAndDescription() : Pair<RichString, RichString> { - // TODO: rework to unify - // if no $summary and $description is present, parse unnamed section and create specific sections - // otherwise, create empty sections for missing +fun MutableMap<String, DocumentationContentSection>.createSummaryAndDescription() { - val summary = firstOrNull { it.label == "\$summary" } - if (summary != null) { - val description = firstOrNull { it.label == "\$description" } - return Pair(summary.text, description?.text ?: RichString.empty) + val summary = get("\$summary") + val description = get("\$description") + if (summary != null && description == null) { + return } - val description = firstOrNull { it.label == "\$description" } - if (description != null) { - return Pair(RichString.empty, description.text) + if (summary == null && description != null) { + return } - val default = firstOrNull { it.label == "" }?.text - if 
(default == null) - return Pair(RichString.empty, RichString.empty) + val unnamed = get("") + if (unnamed == null) { + return + } - return default.splitBy("\n") + val split = unnamed.text.splitBy("\n") + remove("") + if (!split.first.isEmpty()) + put("\$summary", DocumentationContentSection("\$summary", split.first)) + if (!split.second.isEmpty()) + put("\$description", DocumentationContentSection("\$description", split.second)) } fun String.parseLabel(index: Int): Pair<String, Int> { @@ -104,8 +104,8 @@ fun String.parseLabel(index: Int): Pair<String, Int> { return "" to -1 } -fun String.parseSections(): List<DocumentationContentSection> { - val sections = arrayListOf<DocumentationContentSection>() +fun String.parseSections(): MutableMap<String, DocumentationContentSection> { + val sections = hashMapOf<String, DocumentationContentSection>() var currentLabel = "" var currentSectionStart = 0 var currentIndex = 0 @@ -117,7 +117,7 @@ fun String.parseSections(): List<DocumentationContentSection> { // section starts, add previous section val currentContent = substring(currentSectionStart, currentIndex).trim() val section = DocumentationContentSection(currentLabel, currentContent.toRichString()) - sections.add(section) + sections.put(section.label, section) currentLabel = label currentIndex = index + 1 @@ -131,12 +131,20 @@ fun String.parseSections(): List<DocumentationContentSection> { val currentContent = substring(currentSectionStart, currentIndex).trim() val section = DocumentationContentSection(currentLabel, currentContent.toRichString()) - sections.add(section) + sections.put(section.label, section) return sections } fun String.toRichString() : RichString { val content = RichString() + for(index in indices) { + val ch = get(index) + when { + ch == '\\' -> continue + ch == '*' && index < length-1 && !get(index + 1).isWhitespace() -> ch + } + } + content.addSlice(this, NormalStyle) return content }
\ No newline at end of file diff --git a/src/Model/DocumentationNodeBuilder.kt b/src/Model/DocumentationNodeBuilder.kt index c8744172..f724c444 100644 --- a/src/Model/DocumentationNodeBuilder.kt +++ b/src/Model/DocumentationNodeBuilder.kt @@ -35,7 +35,7 @@ class DocumentationNodeBuilder(val context: BindingContext) : DeclarationDescrip val classifierDescriptor = typeConstructor.getDeclarationDescriptor() val name = when (classifierDescriptor) { is Named -> classifierDescriptor.getName().asString() - else -> "<BAD>" + else -> "<anonymous>" } val node = DocumentationNode(descriptor, name, DocumentationContent.Empty, DocumentationNode.Kind.Type) reference(data, node, DocumentationReference.Kind.Detail) diff --git a/src/Processing/CrossReferences.kt b/src/Processing/CrossReferences.kt new file mode 100644 index 00000000..9f21da6e --- /dev/null +++ b/src/Processing/CrossReferences.kt @@ -0,0 +1,12 @@ +package org.jetbrains.dokka + +public fun DocumentationNode.buildCrossReferences() { + for (member in members) { + member.buildCrossReferences() + member.details(DocumentationNode.Kind.Receiver).forEach { detail -> + + + } + } +} + diff --git a/src/RichContent/RichString.kt b/src/RichContent/RichString.kt index f09e4715..2110c47f 100644 --- a/src/RichContent/RichString.kt +++ b/src/RichContent/RichString.kt @@ -5,7 +5,8 @@ public class RichString { public val slices: List<RichStringSlice> get() = sliceList public fun addSlice(slice: RichStringSlice) { - sliceList.add(slice) + if (slice.text.length() > 0) + sliceList.add(slice) } public fun addSlice(text: String, style: RichStringStyle) { diff --git a/src/main.kt b/src/main.kt index 2f2ac93c..a541831d 100644 --- a/src/main.kt +++ b/src/main.kt @@ -59,10 +59,23 @@ public fun main(args: Array<String>) { val timeAnalyse = System.currentTimeMillis() - startAnalyse println("done in ${timeAnalyse / 1000} secs") + print("Processing cross references... 
") + val startProcessing = System.currentTimeMillis() + documentation.buildCrossReferences() + val timeProcessing = System.currentTimeMillis() - startProcessing + println("done in ${timeProcessing / 1000} secs") + val startBuild = System.currentTimeMillis() val signatureGenerator = KotlinLanguageService() val locationService = FoldersLocationService(arguments.outputDir) - val formatter = JekyllFormatService(locationService, signatureGenerator) + val templateService = HtmlTemplateService.default("/dokka/styles/style.css") + val resolutionService = object : ResolutionService { + override fun resolve(text: String): DocumentationNode { + return documentation + } + } + + val formatter = HtmlFormatService(locationService, resolutionService, signatureGenerator, templateService) val generator = FileGenerator(signatureGenerator, locationService, formatter) print("Building pages... ") generator.buildPage(documentation) diff --git a/styles/style.css b/styles/style.css new file mode 100644 index 00000000..012587a6 --- /dev/null +++ b/styles/style.css @@ -0,0 +1,256 @@ +@import url(https://fonts.googleapis.com/css?family=Lato:300italic,700italic,300,700); + +body, table { + padding:50px; + font:14px/1.5 Lato, "Helvetica Neue", Helvetica, Arial, sans-serif; + color:#555; + font-weight:300; +} + +h1, h2, h3, h4, h5, h6 { + color:#222; + margin:0 0 20px; +} + +p, ul, ol, table, pre, dl { + margin:0 0 20px; +} + +h1, h2, h3 { + line-height:1.1; +} + +h1 { + font-size:28px; +} + +h2 { + color:#393939; +} + +h3, h4, h5, h6 { + color:#494949; +} + +a { + color:#39c; + font-weight:400; + text-decoration:none; +} + +a small { + font-size:11px; + color:#555; + margin-top:-0.6em; + display:block; +} + +.wrapper { + width:860px; + margin:0 auto; +} + +blockquote { + border-left:1px solid #e5e5e5; + margin:0; + padding:0 0 0 20px; + font-style:italic; +} + +code, pre { + font-family:Monaco, Bitstream Vera Sans Mono, Lucida Console, Terminal; + color:#333; + font-size:12px; + display: block; 
+} + +pre { + padding:8px 8px; + background: #f8f8f8; + border-radius:5px; + border:1px solid #e5e5e5; + overflow-x: auto; +} + +table { + width:100%; + border-collapse:collapse; +} + +th, td { + text-align:left; + vertical-align: top; + padding:5px 10px; +} + +dt { + color:#444; + font-weight:700; +} + +th { + color:#444; +} + +img { + max-width:100%; +} + +header { + width:270px; + float:left; + position:fixed; +} + +header ul { + list-style:none; + height:40px; + + padding:0; + + background: #eee; + background: -moz-linear-gradient(top, #f8f8f8 0%, #dddddd 100%); + background: -webkit-gradient(linear, left top, left bottom, color-stop(0%,#f8f8f8), color-stop(100%,#dddddd)); + background: -webkit-linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + background: -o-linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + background: -ms-linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + background: linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + + border-radius:5px; + border:1px solid #d2d2d2; + box-shadow:inset #fff 0 1px 0, inset rgba(0,0,0,0.03) 0 -1px 0; + width:270px; +} + +header li { + width:89px; + float:left; + border-right:1px solid #d2d2d2; + height:40px; +} + +header ul a { + line-height:1; + font-size:11px; + color:#999; + display:block; + text-align:center; + padding-top:6px; + height:40px; +} + +strong { + color:#222; + font-weight:700; +} + +header ul li + li { + width:88px; + border-left:1px solid #fff; +} + +header ul li + li + li { + border-right:none; + width:89px; +} + +header ul a strong { + font-size:14px; + display:block; + color:#222; +} + +section { + width:500px; + float:right; + padding-bottom:50px; +} + +small { + font-size:11px; +} + +hr { + border:0; + background:#e5e5e5; + height:1px; + margin:0 0 20px; +} + +footer { + width:270px; + float:left; + position:fixed; + bottom:50px; +} + +@media print, screen and (max-width: 960px) { + + div.wrapper { + width:auto; + margin:0; + } + + header, section, footer { + float:none; + position:static; + 
width:auto; + } + + header { + padding-right:320px; + } + + section { + border:1px solid #e5e5e5; + border-width:1px 0; + padding:20px 0; + margin:0 0 20px; + } + + header a small { + display:inline; + } + + header ul { + position:absolute; + right:50px; + top:52px; + } +} + +@media print, screen and (max-width: 720px) { + body { + word-wrap:break-word; + } + + header { + padding:0; + } + + header ul, header p.view { + position:static; + } + + pre, code { + word-wrap:normal; + } +} + +@media print, screen and (max-width: 480px) { + body { + padding:15px; + } + + header ul { + display:none; + } +} + +@media print { + body { + padding:0.4in; + font-size:12pt; + color:#444; + } +} diff --git a/test/data/markdown/spec.txt b/test/data/markdown/spec.txt new file mode 100644 index 00000000..fce87924 --- /dev/null +++ b/test/data/markdown/spec.txt @@ -0,0 +1,6150 @@ +--- +title: CommonMark Spec +author: +- John MacFarlane +version: 2 +date: 2014-09-19 +... + +# Introduction + +## What is Markdown? + +Markdown is a plain text format for writing structured documents, +based on conventions used for indicating formatting in email and +usenet posts. It was developed in 2004 by John Gruber, who wrote +the first Markdown-to-HTML converter in perl, and it soon became +widely used in websites. By 2014 there were dozens of +implementations in many languages. Some of them extended basic +Markdown syntax with conventions for footnotes, definition lists, +tables, and other constructs, and some allowed output not just in +HTML but in LaTeX and many other formats. + +## Why is a spec needed? + +John Gruber's [canonical description of Markdown's +syntax](http://daringfireball.net/projects/markdown/syntax) +does not specify the syntax unambiguously. Here are some examples of +questions it does not answer: + +1. How much indentation is needed for a sublist? The spec says that + continuation paragraphs need to be indented four spaces, but is + not fully explicit about sublists. 
It is natural to think that + they, too, must be indented four spaces, but `Markdown.pl` does + not require that. This is hardly a "corner case," and divergences + between implementations on this issue often lead to surprises for + users in real documents. (See [this comment by John + Gruber](http://article.gmane.org/gmane.text.markdown.general/1997).) + +2. Is a blank line needed before a block quote or header? + Most implementations do not require the blank line. However, + this can lead to unexpected results in hard-wrapped text, and + also to ambiguities in parsing (note that some implementations + put the header inside the blockquote, while others do not). + (John Gruber has also spoken [in favor of requiring the blank + lines](http://article.gmane.org/gmane.text.markdown.general/2146).) + +3. Is a blank line needed before an indented code block? + (`Markdown.pl` requires it, but this is not mentioned in the + documentation, and some implementations do not require it.) + + ``` markdown + paragraph + code? + ``` + +4. What is the exact rule for determining when list items get + wrapped in `<p>` tags? Can a list be partially "loose" and partially + "tight"? What should we do with a list like this? + + ``` markdown + 1. one + + 2. two + 3. three + ``` + + Or this? + + ``` markdown + 1. one + - a + + - b + 2. two + ``` + + (There are some relevant comments by John Gruber + [here](http://article.gmane.org/gmane.text.markdown.general/2554).) + +5. Can list markers be indented? Can ordered list markers be right-aligned? + + ``` markdown + 8. item 1 + 9. item 2 + 10. item 2a + ``` + +6. Is this one list with a horizontal rule in its second item, + or two lists separated by a horizontal rule? + + ``` markdown + * a + * * * * * + * b + ``` + +7. When list markers change from numbers to bullets, do we have + two lists or one? (The Markdown syntax description suggests two, + but the perl scripts and many other implementations produce one.) + + ``` markdown + 1. fee + 2. 
fie + - foe + - fum + ``` + +8. What are the precedence rules for the markers of inline structure? + For example, is the following a valid link, or does the code span + take precedence ? + + ``` markdown + [a backtick (`)](/url) and [another backtick (`)](/url). + ``` + +9. What are the precedence rules for markers of emphasis and strong + emphasis? For example, how should the following be parsed? + + ``` markdown + *foo *bar* baz* + ``` + +10. What are the precedence rules between block-level and inline-level + structure? For example, how should the following be parsed? + + ``` markdown + - `a long code span can contain a hyphen like this + - and it can screw things up` + ``` + +11. Can list items include headers? (`Markdown.pl` does not allow this, + but headers can occur in blockquotes.) + + ``` markdown + - # Heading + ``` + +12. Can link references be defined inside block quotes or list items? + + ``` markdown + > Blockquote [foo]. + > + > [foo]: /url + ``` + +13. If there are multiple definitions for the same reference, which takes + precedence? + + ``` markdown + [foo]: /url1 + [foo]: /url2 + + [foo][] + ``` + +In the absence of a spec, early implementers consulted `Markdown.pl` +to resolve these ambiguities. But `Markdown.pl` was quite buggy, and +gave manifestly bad results in many cases, so it was not a +satisfactory replacement for a spec. + +Because there is no unambiguous spec, implementations have diverged +considerably. As a result, users are often surprised to find that +a document that renders one way on one system (say, a github wiki) +renders differently on another (say, converting to docbook using +pandoc). To make matters worse, because nothing in Markdown counts +as a "syntax error," the divergence often isn't discovered right away. + +## About this document + +This document attempts to specify Markdown syntax unambiguously. +It contains many examples with side-by-side Markdown and +HTML. These are intended to double as conformance tests. 
An +accompanying script `runtests.pl` can be used to run the tests +against any Markdown program: + + perl runtests.pl spec.txt PROGRAM + +Since this document describes how Markdown is to be parsed into +an abstract syntax tree, it would have made sense to use an abstract +representation of the syntax tree instead of HTML. But HTML is capable +of representing the structural distinctions we need to make, and the +choice of HTML for the tests makes it possible to run the tests against +an implementation without writing an abstract syntax tree renderer. + +This document is generated from a text file, `spec.txt`, written +in Markdown with a small extension for the side-by-side tests. +The script `spec2md.pl` can be used to turn `spec.txt` into pandoc +Markdown, which can then be converted into other formats. + +In the examples, the `→` character is used to represent tabs. + +# Preprocessing + +A [line](#line) <a id="line"></a> +is a sequence of zero or more characters followed by a line +ending (CR, LF, or CRLF) or by the end of +file. + +This spec does not specify an encoding; it thinks of lines as composed +of characters rather than bytes. A conforming parser may be limited +to a certain encoding. + +Tabs in lines are expanded to spaces, with a tab stop of 4 characters: + +. +→foo→baz→→bim +. +<pre><code>foo baz bim +</code></pre> +. + +. + a→a + ὐ→a +. +<pre><code>a a +ὐ a +</code></pre> +. + +Line endings are replaced by newline characters (LF). + +A line containing no characters, or a line containing only spaces (after +tab expansion), is called a [blank line](#blank-line). +<a id="blank-line"></a> + +# Blocks and inlines + +We can think of a document as a sequence of [blocks](#block)<a +id="block"></a>---structural elements like paragraphs, block quotations, +lists, headers, rules, and code blocks. Blocks can contain other +blocks, or they can contain [inline](#inline)<a id="inline"></a> content: +words, spaces, links, emphasized text, images, and inline code. 
+ +## Precedence + +Indicators of block structure always take precedence over indicators +of inline structure. So, for example, the following is a list with +two items, not a list with one item containing a code span: + +. +- `one +- two` +. +<ul> +<li>`one</li> +<li>two`</li> +</ul> +. + +This means that parsing can proceed in two steps: first, the block +structure of the document can be discerned; second, text lines inside +paragraphs, headers, and other block constructs can be parsed for inline +structure. The second step requires information about link reference +definitions that will be available only at the end of the first +step. Note that the first step requires processing lines in sequence, +but the second can be parallelized, since the inline parsing of +one block element does not affect the inline parsing of any other. + +## Container blocks and leaf blocks + +We can divide blocks into two types: +[container blocks](#container-block), <a id="container-block"></a> +which can contain other blocks, and [leaf blocks](#leaf-block), +<a id="leaf-block"></a> which cannot. + +# Leaf blocks + +This section describes the different kinds of leaf block that make up a +Markdown document. + +## Horizontal rules + +A line consisting of 0-3 spaces of indentation, followed by a sequence +of three or more matching `-`, `_`, or `*` characters, each followed +optionally by any number of spaces, forms a [horizontal +rule](#horizontal-rule). <a id="horizontal-rule"></a> + +. +*** +--- +___ +. +<hr /> +<hr /> +<hr /> +. + +Wrong characters: + +. ++++ +. +<p>+++</p> +. + +. +=== +. +<p>===</p> +. + +Not enough characters: + +. +-- +** +__ +. +<p>-- +** +__</p> +. + +One to three spaces indent are allowed: + +. + *** + *** + *** +. +<hr /> +<hr /> +<hr /> +. + +Four spaces is too many: + +. + *** +. +<pre><code>*** +</code></pre> +. + +. +Foo + *** +. +<p>Foo +***</p> +. + +More than three characters may be used: + +. +_____________________________________ +. +<hr /> +. 
+ +Spaces are allowed between the characters: + +. + - - - +. +<hr /> +. + +. + ** * ** * ** * ** +. +<hr /> +. + +. +- - - - +. +<hr /> +. + +Spaces are allowed at the end: + +. +- - - - +. +<hr /> +. + +However, no other characters may occur at the end or the +beginning: + +. +_ _ _ _ a + +a------ +. +<p>_ _ _ _ a</p> +<p>a------</p> +. + +It is required that all of the non-space characters be the same. +So, this is not a horizontal rule: + +. + *-* +. +<p><em>-</em></p> +. + +Horizontal rules do not need blank lines before or after: + +. +- foo +*** +- bar +. +<ul> +<li>foo</li> +</ul> +<hr /> +<ul> +<li>bar</li> +</ul> +. + +Horizontal rules can interrupt a paragraph: + +. +Foo +*** +bar +. +<p>Foo</p> +<hr /> +<p>bar</p> +. + +Note, however, that this is a setext header, not a paragraph followed +by a horizontal rule: + +. +Foo +--- +bar +. +<h2>Foo</h2> +<p>bar</p> +. + +When both a horizontal rule and a list item are possible +interpretations of a line, the horizontal rule is preferred: + +. +* Foo +* * * +* Bar +. +<ul> +<li>Foo</li> +</ul> +<hr /> +<ul> +<li>Bar</li> +</ul> +. + +If you want a horizontal rule in a list item, use a different bullet: + +. +- Foo +- * * * +. +<ul> +<li>Foo</li> +<li><hr /></li> +</ul> +. + +## ATX headers + +An [ATX header](#atx-header) <a id="atx-header"></a> +consists of a string of characters, parsed as inline content, between an +opening sequence of 1--6 unescaped `#` characters and an optional +closing sequence of any number of `#` characters. The opening sequence +of `#` characters cannot be followed directly by a nonspace character. +The closing `#` characters may be followed by spaces only. The opening +`#` character may be indented 0-3 spaces. The raw contents of the +header are stripped of leading and trailing spaces before being parsed +as inline content. The header level is equal to the number of `#` +characters in the opening sequence. + +Simple headers: + +. 
+# foo +## foo +### foo +#### foo +##### foo +###### foo +. +<h1>foo</h1> +<h2>foo</h2> +<h3>foo</h3> +<h4>foo</h4> +<h5>foo</h5> +<h6>foo</h6> +. + +More than six `#` characters is not a header: + +. +####### foo +. +<p>####### foo</p> +. + +A space is required between the `#` characters and the header's +contents. Note that many implementations currently do not require +the space. However, the space was required by the [original ATX +implementation](http://www.aaronsw.com/2002/atx/atx.py), and it helps +prevent things like the following from being parsed as headers: + +. +#5 bolt +. +<p>#5 bolt</p> +. + +This is not a header, because the first `#` is escaped: + +. +\## foo +. +<p>## foo</p> +. + +Contents are parsed as inlines: + +. +# foo *bar* \*baz\* +. +<h1>foo <em>bar</em> *baz*</h1> +. + +Leading and trailing blanks are ignored in parsing inline content: + +. +# foo +. +<h1>foo</h1> +. + +One to three spaces indentation are allowed: + +. + ### foo + ## foo + # foo +. +<h3>foo</h3> +<h2>foo</h2> +<h1>foo</h1> +. + +Four spaces are too much: + +. + # foo +. +<pre><code># foo +</code></pre> +. + +. +foo + # bar +. +<p>foo +# bar</p> +. + +A closing sequence of `#` characters is optional: + +. +## foo ## + ### bar ### +. +<h2>foo</h2> +<h3>bar</h3> +. + +It need not be the same length as the opening sequence: + +. +# foo ################################## +##### foo ## +. +<h1>foo</h1> +<h5>foo</h5> +. + +Spaces are allowed after the closing sequence: + +. +### foo ### +. +<h3>foo</h3> +. + +A sequence of `#` characters with a nonspace character following it +is not a closing sequence, but counts as part of the contents of the +header: + +. +### foo ### b +. +<h3>foo ### b</h3> +. + +Backslash-escaped `#` characters do not count as part +of the closing sequence: + +. +### foo \### +## foo \#\## +# foo \# +. +<h3>foo #</h3> +<h2>foo ##</h2> +<h1>foo #</h1> +. 
+ +ATX headers need not be separated from surrounding content by blank +lines, and they can interrupt paragraphs: + +. +**** +## foo +**** +. +<hr /> +<h2>foo</h2> +<hr /> +. + +. +Foo bar +# baz +Bar foo +. +<p>Foo bar</p> +<h1>baz</h1> +<p>Bar foo</p> +. + +ATX headers can be empty: + +. +## +# +### ### +. +<h2></h2> +<h1></h1> +<h3></h3> +. + +## Setext headers + +A [setext header](#setext-header) <a id="setext-header"></a> +consists of a line of text, containing at least one nonspace character, +with no more than 3 spaces indentation, followed by a [setext header +underline](#setext-header-underline). A [setext header +underline](#setext-header-underline) <a id="setext-header-underline"></a> +is a sequence of `=` characters or a sequence of `-` characters, with no +more than 3 spaces indentation and any number of trailing +spaces. The header is a level 1 header if `=` characters are used, and +a level 2 header if `-` characters are used. The contents of the header +are the result of parsing the first line as Markdown inline content. + +In general, a setext header need not be preceded or followed by a +blank line. However, it cannot interrupt a paragraph, so when a +setext header comes after a paragraph, a blank line is needed between +them. + +Simple examples: + +. +Foo *bar* +========= + +Foo *bar* +--------- +. +<h1>Foo <em>bar</em></h1> +<h2>Foo <em>bar</em></h2> +. + +The underlining can be any length: + +. +Foo +------------------------- + +Foo += +. +<h2>Foo</h2> +<h1>Foo</h1> +. + +The header content can be indented up to three spaces, and need +not line up with the underlining: + +. + Foo +--- + + Foo +----- + + Foo + === +. +<h2>Foo</h2> +<h2>Foo</h2> +<h1>Foo</h1> +. + +Four spaces indent is too much: + +. + Foo + --- + + Foo +--- +. +<pre><code>Foo +--- + +Foo +</code></pre> +<hr /> +. + +The setext header underline can be indented up to three spaces, and +may have trailing spaces: + +. +Foo + ---- +. +<h2>Foo</h2> +. + +Four spaces is too much: + +. 
+Foo + --- +. +<p>Foo +---</p> +. + +The setext header underline cannot contain internal spaces: + +. +Foo += = + +Foo +--- - +. +<p>Foo += =</p> +<p>Foo</p> +<hr /> +. + +Trailing spaces in the content line do not cause a line break: + +. +Foo +----- +. +<h2>Foo</h2> +. + +Nor does a backslash at the end: + +. +Foo\ +---- +. +<h2>Foo\</h2> +. + +Since indicators of block structure take precedence over +indicators of inline structure, the following are setext headers: + +. +`Foo +---- +` + +<a title="a lot +--- +of dashes"/> +. +<h2>`Foo</h2> +<p>`</p> +<h2><a title="a lot</h2> +<p>of dashes"/></p> +. + +The setext header underline cannot be a lazy line: + +. +> Foo +--- +. +<blockquote> +<p>Foo</p> +</blockquote> +<hr /> +. + +A setext header cannot interrupt a paragraph: + +. +Foo +Bar +--- + +Foo +Bar +=== +. +<p>Foo +Bar</p> +<hr /> +<p>Foo +Bar +===</p> +. + +But in general a blank line is not required before or after: + +. +--- +Foo +--- +Bar +--- +Baz +. +<hr /> +<h2>Foo</h2> +<h2>Bar</h2> +<p>Baz</p> +. + +Setext headers cannot be empty: + +. + +==== +. +<p>====</p> +. + + +## Indented code blocks + +An [indented code block](#indented-code-block) +<a id="indented-code-block"></a> is composed of one or more +[indented chunks](#indented-chunk) separated by blank lines. +An [indented chunk](#indented-chunk) <a id="indented-chunk"></a> +is a sequence of non-blank lines, each indented four or more +spaces. An indented code block cannot interrupt a paragraph, so +if it occurs before or after a paragraph, there must be an +intervening blank line. The contents of the code block are +the literal contents of the lines, including trailing newlines, +minus four spaces of indentation. An indented code block has no +attributes. + +. + a simple + indented code block +. +<pre><code>a simple + indented code block +</code></pre> +. + +The contents are literal text, and do not get parsed as Markdown: + +. + <a/> + *hi* + + - one +. 
+<pre><code><a/> +*hi* + +- one +</code></pre> +. + +Here we have three chunks separated by blank lines: + +. + chunk1 + + chunk2 + + + + chunk3 +. +<pre><code>chunk1 + +chunk2 + + + +chunk3 +</code></pre> +. + +Any initial spaces beyond four will be included in the content, even +in interior blank lines: + +. + chunk1 + + chunk2 +. +<pre><code>chunk1 + + chunk2 +</code></pre> +. + +An indented code block cannot interrupt a paragraph. (This +allows hanging indents and the like.) + +. +Foo + bar + +. +<p>Foo +bar</p> +. + +However, any non-blank line with fewer than four leading spaces ends +the code block immediately. So a paragraph may occur immediately +after indented code: + +. + foo +bar +. +<pre><code>foo +</code></pre> +<p>bar</p> +. + +And indented code can occur immediately before and after other kinds of +blocks: + +. +# Header + foo +Header +------ + foo +---- +. +<h1>Header</h1> +<pre><code>foo +</code></pre> +<h2>Header</h2> +<pre><code>foo +</code></pre> +<hr /> +. + +The first line can be indented more than four spaces: + +. + foo + bar +. +<pre><code> foo +bar +</code></pre> +. + +Blank lines preceding or following an indented code block +are not included in it: + +. + + + foo + + +. +<pre><code>foo +</code></pre> +. + +Trailing spaces are included in the code block's content: + +. + foo +. +<pre><code>foo +</code></pre> +. + + +## Fenced code blocks + +A [code fence](#code-fence) <a id="code-fence"></a> is a sequence +of at least three consecutive backtick characters (`` ` ``) or +tildes (`~`). (Tildes and backticks cannot be mixed.) +A [fenced code block](#fenced-code-block) <a id="fenced-code-block"></a> +begins with a code fence, indented no more than three spaces. + +The line with the opening code fence may optionally contain some text +following the code fence; this is trimmed of leading and trailing +spaces and called the [info string](#info-string). +<a id="info-string"></a> The info string may not contain any backtick +characters. 
(The reason for this restriction is that otherwise +some inline code would be incorrectly interpreted as the +beginning of a fenced code block.) + +The content of the code block consists of all subsequent lines, until +a closing [code fence](#code-fence) of the same type as the code block +began with (backticks or tildes), and with at least as many backticks +or tildes as the opening code fence. If the leading code fence is +indented N spaces, then up to N spaces of indentation are removed from +each line of the content (if present). (If a content line is not +indented, it is preserved unchanged. If it is indented less than N +spaces, all of the indentation is removed.) + +The closing code fence may be indented up to three spaces, and may be +followed only by spaces, which are ignored. If the end of the +containing block (or document) is reached and no closing code fence +has been found, the code block contains all of the lines after the +opening code fence until the end of the containing block (or +document). (An alternative spec would require backtracking in the +event that a closing code fence is not found. But this makes parsing +much less efficient, and there seems to be no real down side to the +behavior described here.) + +A fenced code block may interrupt a paragraph, and does not require +a blank line either before or after. + +The content of a code fence is treated as literal text, not parsed +as inlines. The first word of the info string is typically used to +specify the language of the code sample, and rendered in the `class` +attribute of the `code` tag. However, this spec does not mandate any +particular treatment of the info string. + +Here is a simple example with backticks: + +. +``` +< + > +``` +. +<pre><code>< + > +</code></pre> +. + +With tildes: + +. +~~~ +< + > +~~~ +. +<pre><code>< + > +</code></pre> +. + +The closing code fence must use the same character as the opening +fence: + +. +``` +aaa +~~~ +``` +. 
+<pre><code>aaa +~~~ +</code></pre> +. + +. +~~~ +aaa +``` +~~~ +. +<pre><code>aaa +``` +</code></pre> +. + +The closing code fence must be at least as long as the opening fence: + +. +```` +aaa +``` +`````` +. +<pre><code>aaa +``` +</code></pre> +. + +. +~~~~ +aaa +~~~ +~~~~ +. +<pre><code>aaa +~~~ +</code></pre> +. + +Unclosed code blocks are closed by the end of the document: + +. +``` +. +<pre><code></code></pre> +. + +. +````` + +``` +aaa +. +<pre><code> +``` +aaa +</code></pre> +. + +A code block can have all empty lines as its content: + +. +``` + + +``` +. +<pre><code> + +</code></pre> +. + +A code block can be empty: + +. +``` +``` +. +<pre><code></code></pre> +. + +Fences can be indented. If the opening fence is indented, +content lines will have equivalent opening indentation removed, +if present: + +. + ``` + aaa +aaa +``` +. +<pre><code>aaa +aaa +</code></pre> +. + +. + ``` +aaa + aaa +aaa + ``` +. +<pre><code>aaa +aaa +aaa +</code></pre> +. + +. + ``` + aaa + aaa + aaa + ``` +. +<pre><code>aaa + aaa +aaa +</code></pre> +. + +Four spaces indentation produces an indented code block: + +. + ``` + aaa + ``` +. +<pre><code>``` +aaa +``` +</code></pre> +. + +Code fences (opening and closing) cannot contain internal spaces: + +. +``` ``` +aaa +. +<p><code></code> +aaa</p> +. + +. +~~~~~~ +aaa +~~~ ~~ +. +<pre><code>aaa +~~~ ~~ +</code></pre> +. + +Fenced code blocks can interrupt paragraphs, and can be followed +directly by paragraphs, without a blank line between: + +. +foo +``` +bar +``` +baz +. +<p>foo</p> +<pre><code>bar +</code></pre> +<p>baz</p> +. + +Other blocks can also occur before and after fenced code blocks +without an intervening blank line: + +. +foo +--- +~~~ +bar +~~~ +# baz +. +<h2>foo</h2> +<pre><code>bar +</code></pre> +<h1>baz</h1> +. + +An [info string](#info-string) can be provided after the opening code fence. 
+Opening and closing spaces will be stripped, and the first word, prefixed +with `language-`, is used as the value for the `class` attribute of the +`code` element within the enclosing `pre` element. + +. +```ruby +def foo(x) + return 3 +end +``` +. +<pre><code class="language-ruby">def foo(x) + return 3 +end +</code></pre> +. + +. +~~~~ ruby startline=3 $%@#$ +def foo(x) + return 3 +end +~~~~~~~ +. +<pre><code class="language-ruby">def foo(x) + return 3 +end +</code></pre> +. + +. +````; +```` +. +<pre><code class="language-;"></code></pre> +. + +Info strings for backtick code blocks cannot contain backticks: + +. +``` aa ``` +foo +. +<p><code>aa</code> +foo</p> +. + +Closing code fences cannot have info strings: + +. +``` +``` aaa +``` +. +<pre><code>``` aaa +</code></pre> +. + + +## HTML blocks + +An [HTML block tag](#html-block-tag) <a id="html-block-tag"></a> is +an [open tag](#open-tag) or [closing tag](#closing-tag) whose tag +name is one of the following (case-insensitive): +`article`, `header`, `aside`, `hgroup`, `blockquote`, `hr`, `iframe`, +`body`, `li`, `map`, `button`, `object`, `canvas`, `ol`, `caption`, +`output`, `col`, `p`, `colgroup`, `pre`, `dd`, `progress`, `div`, +`section`, `dl`, `table`, `td`, `dt`, `tbody`, `embed`, `textarea`, +`fieldset`, `tfoot`, `figcaption`, `th`, `figure`, `thead`, `footer`, +`footer`, `tr`, `form`, `ul`, `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, +`video`, `script`, `style`. + +An [HTML block](#html-block) <a id="html-block"></a> begins with an +[HTML block tag](#html-block-tag), [HTML comment](#html-comment), +[processing instruction](#processing-instruction), +[declaration](#declaration), or [CDATA section](#cdata-section). +It ends when a [blank line](#blank-line) or the end of the +input is encountered. The initial line may be indented up to three +spaces, and subsequent lines may have any indentation. The contents +of the HTML block are interpreted as raw HTML, and will not be escaped +in HTML output. 
+ +Some simple examples: + +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> + +okay. +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> +<p>okay.</p> +. + +. + <div> + *hello* + <foo><a> +. + <div> + *hello* + <foo><a> +. + +Here we have two HTML blocks with a Markdown paragraph between them: + +. +<DIV CLASS="foo"> + +*Markdown* + +</DIV> +. +<DIV CLASS="foo"> +<p><em>Markdown</em></p> +</DIV> +. + +In the following example, what looks like a Markdown code block +is actually part of the HTML block, which continues until a blank +line or the end of the document is reached: + +. +<div></div> +``` c +int x = 33; +``` +. +<div></div> +``` c +int x = 33; +``` +. + +A comment: + +. +<!-- Foo +bar + baz --> +. +<!-- Foo +bar + baz --> +. + +A processing instruction: + +. +<?php + echo 'foo' +?> +. +<?php + echo 'foo' +?> +. + +CDATA: + +. +<![CDATA[ +function matchwo(a,b) +{ +if (a < b && a < 0) then + { + return 1; + } +else + { + return 0; + } +} +]]> +. +<![CDATA[ +function matchwo(a,b) +{ +if (a < b && a < 0) then + { + return 1; + } +else + { + return 0; + } +} +]]> +. + +The opening tag can be indented 1-3 spaces, but not 4: + +. + <!-- foo --> + + <!-- foo --> +. + <!-- foo --> +<pre><code><!-- foo --> +</code></pre> +. + +An HTML block can interrupt a paragraph, and need not be preceded +by a blank line. + +. +Foo +<div> +bar +</div> +. +<p>Foo</p> +<div> +bar +</div> +. + +However, a following blank line is always needed, except at the end of +a document: + +. +<div> +bar +</div> +*foo* +. +<div> +bar +</div> +*foo* +. + +An incomplete HTML block tag may also start an HTML block: + +. +<div class +foo +. +<div class +foo +. + +This rule differs from John Gruber's original Markdown syntax +specification, which says: + +> The only restrictions are that block-level HTML elements — +> e.g. `<div>`, `<table>`, `<pre>`, `<p>`, etc.
— must be separated from +> surrounding content by blank lines, and the start and end tags of the +> block should not be indented with tabs or spaces. + +In some ways Gruber's rule is more restrictive than the one given +here: + +- It requires that an HTML block be preceded by a blank line. +- It does not allow the start tag to be indented. +- It requires a matching end tag, which it also does not allow to + be indented. + +Indeed, most Markdown implementations, including some of Gruber's +own perl implementations, do not impose these restrictions. + +There is one respect, however, in which Gruber's rule is more liberal +than the one given here, since it allows blank lines to occur inside +an HTML block. There are two reasons for disallowing them here. +First, it removes the need to parse balanced tags, which is +expensive and can require backtracking from the end of the document +if no matching end tag is found. Second, it provides a very simple +and flexible way of including Markdown content inside HTML tags: +simply separate the Markdown from the HTML using blank lines: + +. +<div> + +*Emphasized* text. + +</div> +. +<div> +<p><em>Emphasized</em> text.</p> +</div> +. + +Compare: + +. +<div> +*Emphasized* text. +</div> +. +<div> +*Emphasized* text. +</div> +. + +Some Markdown implementations have adopted a convention of +interpreting content inside tags as text if the open tag has +the attribute `markdown=1`. The rule given above seems a simpler and +more elegant way of achieving the same expressive power, which is also +much simpler to parse. + +The main potential drawback is that one can no longer paste HTML +blocks into Markdown documents with 100% reliability. However, +*in most cases* this will work fine, because the blank lines in +HTML are usually followed by HTML block tags. For example: + +. +<table> + +<tr> + +<td> +Hi +</td> + +</tr> + +</table> +. +<table> +<tr> +<td> +Hi +</td> +</tr> +</table> +. 
+ +Moreover, blank lines are usually not necessary and can be +deleted. The exception is inside `<pre>` tags; here, one can +replace the blank lines with `&#10;` entities. + +So there is no important loss of expressive power with the new rule. + +## Link reference definitions + +A [link reference definition](#link-reference-definition) +<a id="link-reference-definition"></a> consists of a [link +label](#link-label), indented up to three spaces, followed +by a colon (`:`), optional blank space (including up to one +newline), a [link destination](#link-destination), optional +blank space (including up to one newline), and an optional [link +title](#link-title), which if it is present must be separated +from the [link destination](#link-destination) by whitespace. +No further non-space characters may occur on the line. + +A [link reference definition](#link-reference-definition) +does not correspond to a structural element of a document. Instead, it +defines a label which can be used in [reference links](#reference-link) +and reference-style [images](#image) elsewhere in the document. [Link +reference definitions] can come either before or after the links that use +them. + +. +[foo]: /url "title" + +[foo] +. +<p><a href="/url" title="title">foo</a></p> +. + +. + [foo]: + /url + 'the title' + +[foo] +. +<p><a href="/url" title="the title">foo</a></p> +. + +. +[Foo*bar\]]:my_(url) 'title (with parens)' + +[Foo*bar\]] +. +<p><a href="my_(url)" title="title (with parens)">Foo*bar]</a></p> +. + +. +[Foo bar]: +<my url> +'title' + +[Foo bar] +. +<p><a href="my%20url" title="title">Foo bar</a></p> +. + +The title may be omitted: + +. +[foo]: +/url + +[foo] +. +<p><a href="/url">foo</a></p> +. + +The link destination may not be omitted: + +. +[foo]: + +[foo] +. +<p>[foo]:</p> +<p>[foo]</p> +. + +A link can come before its corresponding definition: + +. +[foo] + +[foo]: url +. +<p><a href="url">foo</a></p> +.
+ +If there are several matching definitions, the first one takes +precedence: + +. +[foo] + +[foo]: first +[foo]: second +. +<p><a href="first">foo</a></p> +. + +As noted in the section on [Links], matching of labels is +case-insensitive (see [matches](#matches)). + +. +[FOO]: /url + +[Foo] +. +<p><a href="/url">Foo</a></p> +. + +. +[ΑΓΩ]: /φου + +[αγω] +. +<p><a href="/%CF%86%CE%BF%CF%85">αγω</a></p> +. + +Here is a link reference definition with no corresponding link. +It contributes nothing to the document. + +. +[foo]: /url +. +. + +This is not a link reference definition, because there are +non-space characters after the title: + +. +[foo]: /url "title" ok +. +<p>[foo]: /url "title" ok</p> +. + +This is not a link reference definition, because it is indented +four spaces: + +. + [foo]: /url "title" + +[foo] +. +<pre><code>[foo]: /url "title" +</code></pre> +<p>[foo]</p> +. + +This is not a link reference definition, because it occurs inside +a code block: + +. +``` +[foo]: /url +``` + +[foo] +. +<pre><code>[foo]: /url +</code></pre> +<p>[foo]</p> +. + +A [link reference definition](#link-reference-definition) cannot +interrupt a paragraph. + +. +Foo +[bar]: /baz + +[bar] +. +<p>Foo +[bar]: /baz</p> +<p>[bar]</p> +. + +However, it can directly follow other block elements, such as headers +and horizontal rules, and it need not be followed by a blank line. + +. +# [Foo] +[foo]: /url +> bar +. +<h1><a href="/url">Foo</a></h1> +<blockquote> +<p>bar</p> +</blockquote> +. + +Several [link references](#link-reference) can occur one after another, +without intervening blank lines. + +. +[foo]: /foo-url "foo" +[bar]: /bar-url + "bar" +[baz]: /baz-url + +[foo], +[bar], +[baz] +. +<p><a href="/foo-url" title="foo">foo</a>, +<a href="/bar-url" title="bar">bar</a>, +<a href="/baz-url">baz</a></p> +. + +[Link reference definitions](#link-reference-definition) can occur +inside block containers, like lists and block quotations. 
They +affect the entire document, not just the container in which they +are defined: + +. +[foo] + +> [foo]: /url +. +<p><a href="/url">foo</a></p> +<blockquote> +</blockquote> +. + + +## Paragraphs + +A sequence of non-blank lines that cannot be interpreted as other +kinds of blocks forms a [paragraph](#paragraph).<a id="paragraph"></a> +The contents of the paragraph are the result of parsing the +paragraph's raw content as inlines. The paragraph's raw content +is formed by concatenating the lines and removing initial and final +spaces. + +A simple example with two paragraphs: + +. +aaa + +bbb +. +<p>aaa</p> +<p>bbb</p> +. + +Paragraphs can contain multiple lines, but no blank lines: + +. +aaa +bbb + +ccc +ddd +. +<p>aaa +bbb</p> +<p>ccc +ddd</p> +. + +Multiple blank lines between paragraphs have no effect: + +. +aaa + + +bbb +. +<p>aaa</p> +<p>bbb</p> +. + +Leading spaces are skipped: + +. + aaa + bbb +. +<p>aaa +bbb</p> +. + +Lines after the first may be indented any amount, since indented +code blocks cannot interrupt paragraphs. + +. +aaa + bbb + ccc +. +<p>aaa +bbb +ccc</p> +. + +However, the first line may be indented at most three spaces, +or an indented code block will be triggered: + +. + aaa +bbb +. +<p>aaa +bbb</p> +. + +. + aaa +bbb +. +<pre><code>aaa +</code></pre> +<p>bbb</p> +. + +Final spaces are stripped before inline parsing, so a paragraph +that ends with two or more spaces will not end with a hard line +break: + +. +aaa +bbb +. +<p>aaa<br /> +bbb</p> +. + +## Blank lines + +[Blank lines](#blank-line) between block-level elements are ignored, +except for the role they play in determining whether a [list](#list) +is [tight](#tight) or [loose](#loose). + +Blank lines at the beginning and end of the document are also ignored. + +. + + +aaa + + +# aaa + + +. +<p>aaa</p> +<h1>aaa</h1> +. + + +# Container blocks + +A [container block](#container-block) is a block that has other +blocks as its contents.
There are two basic kinds of container blocks: +[block quotes](#block-quote) and [list items](#list-item). +[Lists](#list) are meta-containers for [list items](#list-item). + +We define the syntax for container blocks recursively. The general +form of the definition is: + +> If X is a sequence of blocks, then the result of +> transforming X in such-and-such a way is a container of type Y +> with these blocks as its content. + +So, we explain what counts as a block quote or list item by explaining +how these can be *generated* from their contents. This should suffice +to define the syntax, although it does not give a recipe for *parsing* +these constructions. (A recipe is provided below in the section entitled +[A parsing strategy](#appendix-a-a-parsing-strategy).) + +## Block quotes + +A [block quote marker](#block-quote-marker) <a id="block-quote-marker"></a> +consists of 0-3 spaces of initial indent, plus (a) the character `>` together +with a following space, or (b) a single character `>` not followed by a space. + +The following rules define [block quotes](#block-quote): +<a id="block-quote"></a> + +1. **Basic case.** If a string of lines *Ls* constitute a sequence + of blocks *Bs*, then the result of prepending a [block quote + marker](#block-quote-marker) to the beginning of each line in *Ls* + is a [block quote](#block-quote) containing *Bs*. + +2. **Laziness.** If a string of lines *Ls* constitute a [block + quote](#block-quote) with contents *Bs*, then the result of deleting + the initial [block quote marker](#block-quote-marker) from one or + more lines in which the next non-space character after the [block + quote marker](#block-quote-marker) is [paragraph continuation + text](#paragraph-continuation-text) is a block quote with *Bs* as + its content.
<a id="paragraph-continuation-text"></a> + [Paragraph continuation text](#paragraph-continuation-text) is text + that will be parsed as part of the content of a paragraph, but does + not occur at the beginning of the paragraph. + +3. **Consecutiveness.** A document cannot contain two [block + quotes](#block-quote) in a row unless there is a [blank + line](#blank-line) between them. + +Nothing else counts as a [block quote](#block-quote). + +Here is a simple example: + +. +> # Foo +> bar +> baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +The spaces after the `>` characters can be omitted: + +. +># Foo +>bar +> baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +The `>` characters can be indented 1-3 spaces: + +. + > # Foo + > bar + > baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +Four spaces gives us a code block: + +. + > # Foo + > bar + > baz +. +<pre><code>> # Foo +> bar +> baz +</code></pre> +. + +The Laziness clause allows us to omit the `>` before a +paragraph continuation line: + +. +> # Foo +> bar +baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +A block quote can contain some lazy and some non-lazy +continuation lines: + +. +> bar +baz +> foo +. +<blockquote> +<p>bar +baz +foo</p> +</blockquote> +. + +Laziness only applies to lines that are continuations of +paragraphs. Lines containing characters or indentation that indicate +block structure cannot be lazy. + +. +> foo +--- +. +<blockquote> +<p>foo</p> +</blockquote> +<hr /> +. + +. +> - foo +- bar +. +<blockquote> +<ul> +<li>foo</li> +</ul> +</blockquote> +<ul> +<li>bar</li> +</ul> +. + +. +> foo + bar +. +<blockquote> +<pre><code>foo +</code></pre> +</blockquote> +<pre><code>bar +</code></pre> +. + +. +> ``` +foo +``` +. +<blockquote> +<pre><code></code></pre> +</blockquote> +<p>foo</p> +<pre><code></code></pre> +. + +A block quote can be empty: + +. +> +. +<blockquote> +</blockquote> +. + +. +> +> +> +. 
+<blockquote> +</blockquote> +. + +A block quote can have initial or final blank lines: + +. +> +> foo +> +. +<blockquote> +<p>foo</p> +</blockquote> +. + +A blank line always separates block quotes: + +. +> foo + +> bar +. +<blockquote> +<p>foo</p> +</blockquote> +<blockquote> +<p>bar</p> +</blockquote> +. + +(Most current Markdown implementations, including John Gruber's +original `Markdown.pl`, will parse this example as a single block quote +with two paragraphs. But it seems better to allow the author to decide +whether two block quotes or one are wanted.) + +Consecutiveness means that if we put these block quotes together, +we get a single block quote: + +. +> foo +> bar +. +<blockquote> +<p>foo +bar</p> +</blockquote> +. + +To get a block quote with two paragraphs, use: + +. +> foo +> +> bar +. +<blockquote> +<p>foo</p> +<p>bar</p> +</blockquote> +. + +Block quotes can interrupt paragraphs: + +. +foo +> bar +. +<p>foo</p> +<blockquote> +<p>bar</p> +</blockquote> +. + +In general, blank lines are not needed before or after block +quotes: + +. +> aaa +*** +> bbb +. +<blockquote> +<p>aaa</p> +</blockquote> +<hr /> +<blockquote> +<p>bbb</p> +</blockquote> +. + +However, because of laziness, a blank line is needed between +a block quote and a following paragraph: + +. +> bar +baz +. +<blockquote> +<p>bar +baz</p> +</blockquote> +. + +. +> bar + +baz +. +<blockquote> +<p>bar</p> +</blockquote> +<p>baz</p> +. + +. +> bar +> +baz +. +<blockquote> +<p>bar</p> +</blockquote> +<p>baz</p> +. + +It is a consequence of the Laziness rule that any number +of initial `>`s may be omitted on a continuation line of a +nested block quote: + +. +> > > foo +bar +. +<blockquote> +<blockquote> +<blockquote> +<p>foo +bar</p> +</blockquote> +</blockquote> +</blockquote> +. + +. +>>> foo +> bar +>>baz +. +<blockquote> +<blockquote> +<blockquote> +<p>foo +bar +baz</p> +</blockquote> +</blockquote> +</blockquote> +. 
+ +When including an indented code block in a block quote, +remember that the [block quote marker](#block-quote-marker) includes +both the `>` and a following space. So *five spaces* are needed after +the `>`: + +. +> code + +> not code +. +<blockquote> +<pre><code>code +</code></pre> +</blockquote> +<blockquote> +<p>not code</p> +</blockquote> +. + + +## List items + +A [list marker](#list-marker) <a id="list-marker"></a> is a +[bullet list marker](#bullet-list-marker) or an [ordered list +marker](#ordered-list-marker). + +A [bullet list marker](#bullet-list-marker) <a id="bullet-list-marker"></a> +is a `-`, `+`, or `*` character. + +An [ordered list marker](#ordered-list-marker) <a id="ordered-list-marker"></a> +is a sequence of one or more digits (`0-9`), followed by either a +`.` character or a `)` character. + +The following rules define [list items](#list-item): + +1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of + blocks *Bs* starting with a non-space character and not separated + from each other by more than one blank line, and *M* is a list + marker of width *W* followed by 0 < *N* < 5 spaces, then the result + of prepending *M* and the following spaces to the first line of + *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a + list item with *Bs* as its contents. The type of the list item + (bullet or ordered) is determined by the type of its list marker. + If the list item is ordered, then it is also assigned a start + number, based on the ordered list marker. + +For example, let *Ls* be the lines + +. +A paragraph +with two lines. + + indented code + +> A block quote. +. +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +. + +And let *M* be the marker `1.`, and *N* = 2. Then rule #1 says +that the following is an ordered list item with start number 1, +and the same contents as *Ls*: + +. +1. A paragraph + with two lines.
+ + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +The most important thing to notice is that the position of +the text after the list marker determines how much indentation +is needed in subsequent blocks in the list item. If the list +marker takes up two spaces, and there are three spaces between +the list marker and the next nonspace character, then blocks +must be indented five spaces in order to fall under the list +item. + +Here are some examples showing how far content must be indented to be +put under the list item: + +. +- one + + two +. +<ul> +<li>one</li> +</ul> +<p>two</p> +. + +. +- one + + two +. +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +. + +. + - one + + two +. +<ul> +<li>one</li> +</ul> +<pre><code> two +</code></pre> +. + +. + - one + + two +. +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +. + +It is tempting to think of this in terms of columns: the continuation +blocks must be indented at least to the column of the first nonspace +character after the list marker. However, that is not quite right. +The spaces after the list marker determine how much relative indentation +is needed. Which column this indentation reaches will depend on +how the list item is embedded in other constructions, as shown by +this example: + +. + > > 1. one +>> +>> two +. +<blockquote> +<blockquote> +<ol> +<li><p>one</p> +<p>two</p></li> +</ol> +</blockquote> +</blockquote> +. + +Here `two` occurs in the same column as the list marker `1.`, +but is actually contained in the list item, because there is +sufficient indentation after the last containing blockquote marker. + +The converse is also possible.
In the following example, the word `two` +occurs far to the right of the initial text of the list item, `one`, but +it is not considered part of the list item, because it is not indented +far enough past the blockquote marker: + +. +>>- one +>> + > > two +. +<blockquote> +<blockquote> +<ul> +<li>one</li> +</ul> +<p>two</p> +</blockquote> +</blockquote> +. + +A list item may not contain blocks that are separated by more than +one blank line. Thus, two blank lines will end a list, unless the +two blanks are contained in a [fenced code block](#fenced-code-block). + +. +- foo + + bar + +- foo + + + bar + +- ``` + foo + + + bar + ``` +. +<ul> +<li><p>foo</p> +<p>bar</p></li> +<li><p>foo</p></li> +</ul> +<p>bar</p> +<ul> +<li><pre><code>foo + + +bar +</code></pre></li> +</ul> +. + +A list item may contain any kind of block: + +. +1. foo + + ``` + bar + ``` + + baz + + > bam +. +<ol> +<li><p>foo</p> +<pre><code>bar +</code></pre> +<p>baz</p> +<blockquote> +<p>bam</p> +</blockquote></li> +</ol> +. + +2. **Item starting with indented code.** If a sequence of lines *Ls* + constitute a sequence of blocks *Bs* starting with an indented code + block and not separated from each other by more than one blank line, + and *M* is a list marker *M* of width *W* followed by + one space, then the result of prepending *M* and the following + space to the first line of *Ls*, and indenting subsequent lines of + *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. + If a line is empty, then it need not be indented. The type of the + list item (bullet or ordered) is determined by the type of its list + marker. If the list item is ordered, then it is also assigned a + start number, based on the ordered list marker. + +An indented code block will have to be indented four spaces beyond +the edge of the region where text will be included in the list item. +In the following case that is 6 spaces: + +. +- foo + + bar +. 
+<ul> +<li><p>foo</p> +<pre><code>bar +</code></pre></li> +</ul> +. + +And in this case it is 11 spaces: + +. + 10. foo + + bar +. +<ol start="10"> +<li><p>foo</p> +<pre><code>bar +</code></pre></li> +</ol> +. + +If the *first* block in the list item is an indented code block, +then by rule #2, the contents must be indented *one* space after the +list marker: + +. + indented code + +paragraph + + more code +. +<pre><code>indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre> +. + +. +1. indented code + + paragraph + + more code +. +<ol> +<li><pre><code>indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre></li> +</ol> +. + +Note that an additional space indent is interpreted as space +inside the code block: + +. +1. indented code + + paragraph + + more code +. +<ol> +<li><pre><code> indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre></li> +</ol> +. + +Note that rules #1 and #2 only apply to two cases: (a) cases +in which the lines to be included in a list item begin with a nonspace +character, and (b) cases in which they begin with an indented code +block. In a case like the following, where the first block begins with +a three-space indent, the rules do not allow us to form a list item by +indenting the whole thing and prepending a list marker: + +. + foo + +bar +. +<p>foo</p> +<p>bar</p> +. + +. +- foo + + bar +. +<ul> +<li>foo</li> +</ul> +<p>bar</p> +. + +This is not a significant restriction, because when a block begins +with 1-3 spaces indent, the indentation can always be removed without +a change in interpretation, allowing rule #1 to be applied. So, in +the above case: + +. +- foo + + bar +. +<ul> +<li><p>foo</p> +<p>bar</p></li> +</ul> +. + + +3. 
**Indentation.** If a sequence of lines *Ls* constitutes a list item + according to rule #1 or #2, then the result of indenting each line + of *Ls* by 1-3 spaces (the same for each line) also constitutes a + list item with the same contents and attributes. If a line is + empty, then it need not be indented. + +Indented one space: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +Indented two spaces: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +Indented three spaces: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +Four spaces indent gives a code block: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<pre><code>1. A paragraph + with two lines. + + indented code + + > A block quote. +</code></pre> +. + + +4. **Laziness.** If a string of lines *Ls* constitute a [list + item](#list-item) with contents *Bs*, then the result of deleting + some or all of the indentation from one or more lines in which the + next non-space character after the indentation is + [paragraph continuation text](#paragraph-continuation-text) is a + list item with the same contents and attributes. + +Here is an example with lazy continuation lines: + +. + 1. A paragraph +with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +.
+ +Indentation can be partially deleted: + +. + 1. A paragraph + with two lines. +. +<ol> +<li>A paragraph +with two lines.</li> +</ol> +. + +These examples show how laziness can work in nested structures: + +. +> 1. > Blockquote +continued here. +. +<blockquote> +<ol> +<li><blockquote> +<p>Blockquote +continued here.</p> +</blockquote></li> +</ol> +</blockquote> +. + +. +> 1. > Blockquote +> continued here. +. +<blockquote> +<ol> +<li><blockquote> +<p>Blockquote +continued here.</p> +</blockquote></li> +</ol> +</blockquote> +. + + +5. **That's all.** Nothing that is not counted as a list item by rules + #1--4 counts as a [list item](#list-item). + +The rules for sublists follow from the general rules above. A sublist +must be indented the same number of spaces a paragraph would need to be +in order to be included in the list item. + +So, in this case we need two spaces indent: + +. +- foo + - bar + - baz +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li>baz</li> +</ul></li> +</ul></li> +</ul> +. + +One is not enough: + +. +- foo + - bar + - baz +. +<ul> +<li>foo</li> +<li>bar</li> +<li>baz</li> +</ul> +. + +Here we need four, because the list marker is wider: + +. +10) foo + - bar +. +<ol start="10"> +<li>foo +<ul> +<li>bar</li> +</ul></li> +</ol> +. + +Three is not enough: + +. +10) foo + - bar +. +<ol start="10"> +<li>foo</li> +</ol> +<ul> +<li>bar</li> +</ul> +. + +A list may be the first block in a list item: + +. +- - foo +. +<ul> +<li><ul> +<li>foo</li> +</ul></li> +</ul> +. + +. +1. - 2. foo +. +<ol> +<li><ul> +<li><ol start="2"> +<li>foo</li> +</ol></li> +</ul></li> +</ol> +. + +A list item may be empty: + +. +- foo +- +- bar +. +<ul> +<li>foo</li> +<li></li> +<li>bar</li> +</ul> +. + +. +- +. +<ul> +<li></li> +</ul> +. + +### Motivation + +John Gruber's Markdown spec says the following about list items: + +1. "List markers typically start at the left margin, but may be indented + by up to three spaces. 
List markers must be followed by one or more + spaces or a tab." + +2. "To make lists look nice, you can wrap items with hanging indents.... + But if you don't want to, you don't have to." + +3. "List items may consist of multiple paragraphs. Each subsequent + paragraph in a list item must be indented by either 4 spaces or one + tab." + +4. "It looks nice if you indent every line of the subsequent paragraphs, + but here again, Markdown will allow you to be lazy." + +5. "To put a blockquote within a list item, the blockquote's `>` + delimiters need to be indented." + +6. "To put a code block within a list item, the code block needs to be + indented twice — 8 spaces or two tabs." + +These rules specify that a paragraph under a list item must be indented +four spaces (presumably, from the left margin, rather than the start of +the list marker, but this is not said), and that code under a list item +must be indented eight spaces instead of the usual four. They also say +that a block quote must be indented, but not by how much; however, the +example given has four spaces indentation. Although nothing is said +about other kinds of block-level content, it is certainly reasonable to +infer that *all* block elements under a list item, including other +lists, must be indented four spaces. This principle has been called the +*four-space rule*. + +The four-space rule is clear and principled, and if the reference +implementation `Markdown.pl` had followed it, it probably would have +become the standard. However, `Markdown.pl` allowed paragraphs and +sublists to start with only two spaces indentation, at least on the +outer level. Worse, its behavior was inconsistent: a sublist of an +outer-level list needed two spaces indentation, but a sublist of this +sublist needed three spaces. It is not surprising, then, that different +implementations of Markdown have developed very different rules for +determining what comes under a list item. 
(Pandoc and python-Markdown, +for example, stuck with Gruber's syntax description and the four-space +rule, while discount, redcarpet, marked, PHP Markdown, and others +followed `Markdown.pl`'s behavior more closely.) + +Unfortunately, given the divergences between implementations, there +is no way to give a spec for list items that will be guaranteed not +to break any existing documents. However, the spec given here should +correctly handle lists formatted with either the four-space rule or +the more forgiving `Markdown.pl` behavior, provided they are laid out +in a way that is natural for a human to read. + +The strategy here is to let the width and indentation of the list marker +determine the indentation necessary for blocks to fall under the list +item, rather than having a fixed and arbitrary number. The writer can +think of the body of the list item as a unit which gets indented to the +right enough to fit the list marker (and any indentation on the list +marker). (The laziness rule, #4, then allows continuation lines to be +unindented if needed.) + +This rule is superior, we claim, to any rule requiring a fixed level of +indentation from the margin. The four-space rule is clear but +unnatural. It is quite unintuitive that + +``` markdown +- foo + + bar + + - baz +``` + +should be parsed as two lists with an intervening paragraph, + +``` html +<ul> +<li>foo</li> +</ul> +<p>bar</p> +<ul> +<li>baz</li> +</ul> +``` + +as the four-space rule demands, rather than a single list, + +``` html +<ul> +<li><p>foo</p> +<p>bar</p> +<ul> +<li>baz</li> +</ul></li> +</ul> +``` + +The choice of four spaces is arbitrary. It can be learned, but it is +not likely to be guessed, and it trips up beginners regularly. + +Would it help to adopt a two-space rule? The problem is that such +a rule, together with the rule allowing 1--3 spaces indentation of the +initial list marker, allows text that is indented *less than* the +original list marker to be included in the list item. 
For example, +`Markdown.pl` parses + +``` markdown + - one + + two +``` + +as a single list item, with `two` a continuation paragraph: + +``` html +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +``` + +and similarly + +``` markdown +> - one +> +> two +``` + +as + +``` html +<blockquote> +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +</blockquote> +``` + +This is extremely unintuitive. + +Rather than requiring a fixed indent from the margin, we could require +a fixed indent (say, two spaces, or even one space) from the list marker (which +may itself be indented). This proposal would remove the last anomaly +discussed. Unlike the spec presented above, it would count the following +as a list item with a subparagraph, even though the paragraph `bar` +is not indented as far as the first paragraph `foo`: + +``` markdown + 10. foo + + bar +``` + +Arguably this text does read like a list item with `bar` as a subparagraph, +which may count in favor of the proposal. However, on this proposal indented +code would have to be indented six spaces after the list marker. And this +would break a lot of existing Markdown, which has the pattern: + +``` markdown +1. foo + + indented code +``` + +where the code is indented eight spaces. The spec above, by contrast, will +parse this text as expected, since the code block's indentation is measured +from the beginning of `foo`. + +The one case that needs special treatment is a list item that *starts* +with indented code. How much indentation is required in that case, since +we don't have a "first paragraph" to measure from? Rule #2 simply stipulates +that in such cases, we require one space indentation from the list marker +(and then the normal four spaces for the indented code). This will match the +four-space rule in cases where the list marker plus its initial indentation +takes four spaces (a common case), but diverge in other cases. 
+ +## Lists + +A [list](#list) <a id="list"></a> is a sequence of one or more +list items [of the same type](#of-the-same-type). The list items +may be separated by single [blank lines](#blank-line), but two +blank lines end all containing lists. + +Two list items are [of the same type](#of-the-same-type) +<a id="of-the-same-type"></a> if they begin with a [list +marker](#list-marker) of the same type. Two list markers are of the +same type if (a) they are bullet list markers using the same character +(`-`, `+`, or `*`) or (b) they are ordered list numbers with the same +delimiter (either `.` or `)`). + +A list is an [ordered list](#ordered-list) <a id="ordered-list"></a> +if its constituent list items begin with +[ordered list markers](#ordered-list-marker), and a [bullet +list](#bullet-list) <a id="bullet-list"></a> if its constituent list +items begin with [bullet list markers](#bullet-list-marker). + +The [start number](#start-number) <a id="start-number"></a> +of an [ordered list](#ordered-list) is determined by the list number of +its initial list item. The numbers of subsequent list items are +disregarded. + +A list is [loose](#loose) if it any of its constituent list items are +separated by blank lines, or if any of its constituent list items +directly contain two block-level elements with a blank line between +them. Otherwise a list is [tight](#tight). (The difference in HTML output +is that paragraphs in a loose with are wrapped in `<p>` tags, while +paragraphs in a tight list are not.) + +Changing the bullet or ordered list delimiter starts a new list: + +. +- foo +- bar ++ baz +. +<ul> +<li>foo</li> +<li>bar</li> +</ul> +<ul> +<li>baz</li> +</ul> +. + +. +1. foo +2. bar +3) baz +. +<ol> +<li>foo</li> +<li>bar</li> +</ol> +<ol start="3"> +<li>baz</li> +</ol> +. + +There can be blank lines between items, but two blank lines end +a list: + +. +- foo + +- bar + + +- baz +. +<ul> +<li><p>foo</p></li> +<li><p>bar</p></li> +</ul> +<ul> +<li>baz</li> +</ul> +. 
+ +As illustrated above in the section on [list items](#list-item), +two blank lines between blocks *within* a list item will also end a +list: + +. +- foo + + + bar +- baz +. +<ul> +<li>foo</li> +</ul> +<p>bar</p> +<ul> +<li>baz</li> +</ul> +. + +Indeed, two blank lines will end *all* containing lists: + +. +- foo + - bar + - baz + + + bim +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li>baz</li> +</ul></li> +</ul></li> +</ul> +<pre><code> bim +</code></pre> +. + +Thus, two blank lines can be used to separate consecutive lists of +the same type, or to separate a list from an indented code block +that would otherwise be parsed as a subparagraph of the final list +item: + +. +- foo +- bar + + +- baz +- bim +. +<ul> +<li>foo</li> +<li>bar</li> +</ul> +<ul> +<li>baz</li> +<li>bim</li> +</ul> +. + +. +- foo + + notcode + +- foo + + + code +. +<ul> +<li><p>foo</p> +<p>notcode</p></li> +<li><p>foo</p></li> +</ul> +<pre><code>code +</code></pre> +. + +List items need not be indented to the same level. The following +list items will be treated as items at the same list level, +since none is indented enough to belong to the previous list +item: + +. +- a + - b + - c + - d + - e + - f +- g +. +<ul> +<li>a</li> +<li>b</li> +<li>c</li> +<li>d</li> +<li>e</li> +<li>f</li> +<li>g</li> +</ul> +. + +This is a loose list, because there is a blank line between +two of the list items: + +. +- a +- b + +- c +. +<ul> +<li><p>a</p></li> +<li><p>b</p></li> +<li><p>c</p></li> +</ul> +. + +So is this, with a empty second item: + +. +* a +* + +* c +. +<ul> +<li><p>a</p></li> +<li></li> +<li><p>c</p></li> +</ul> +. + +These are loose lists, even though there is no space between the items, +because one of the items directly contains two block-level elements +with a blank line between them: + +. +- a +- b + + c +- d +. +<ul> +<li><p>a</p></li> +<li><p>b</p> +<p>c</p></li> +<li><p>d</p></li> +</ul> +. + +. +- a +- b + + [ref]: /url +- d +. 
+<ul> +<li><p>a</p></li> +<li><p>b</p></li> +<li><p>d</p></li> +</ul> +. + +This is a tight list, because the blank lines are in a code block: + +. +- a +- ``` + b + + + ``` +- c +. +<ul> +<li>a</li> +<li><pre><code>b + + +</code></pre></li> +<li>c</li> +</ul> +. + +This is a tight list, because the blank line is between two +paragraphs of a sublist. So the inner list is loose while +the other list is tight: + +. +- a + - b + + c +- d +. +<ul> +<li>a +<ul> +<li><p>b</p> +<p>c</p></li> +</ul></li> +<li>d</li> +</ul> +. + +This is a tight list, because the blank line is inside the +block quote: + +. +* a + > b + > +* c +. +<ul> +<li>a +<blockquote> +<p>b</p> +</blockquote></li> +<li>c</li> +</ul> +. + +This list is tight, because the consecutive block elements +are not separated by blank lines: + +. +- a + > b + ``` + c + ``` +- d +. +<ul> +<li>a +<blockquote> +<p>b</p> +</blockquote> +<pre><code>c +</code></pre></li> +<li>d</li> +</ul> +. + +A single-paragraph list is tight: + +. +- a +. +<ul> +<li>a</li> +</ul> +. + +. +- a + - b +. +<ul> +<li>a +<ul> +<li>b</li> +</ul></li> +</ul> +. + +Here the outer list is loose, the inner list tight: + +. +* foo + * bar + + baz +. +<ul> +<li><p>foo</p> +<ul> +<li>bar</li> +</ul> +<p>baz</p></li> +</ul> +. + +. +- a + - b + - c + +- d + - e + - f +. +<ul> +<li><p>a</p> +<ul> +<li>b</li> +<li>c</li> +</ul></li> +<li><p>d</p> +<ul> +<li>e</li> +<li>f</li> +</ul></li> +</ul> +. + +# Inlines + +Inlines are parsed sequentially from the beginning of the character +stream to the end (left to right, in left-to-right languages). +Thus, for example, in + +. +`hi`lo` +. +<p><code>hi</code>lo`</p> +. + +`hi` is parsed as code, leaving the backtick at the end as a literal +backtick. + +## Backslash escapes + +Any ASCII punctuation character may be backslash-escaped: + +. +\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ +. +<p>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</p> +. 
+ +Backslashes before other characters are treated as literal +backslashes: + +. +\→\A\a\ \3\φ\« +. +<p>\ \A\a\ \3\φ\«</p> +. + +Escaped characters are treated as regular characters and do +not have their usual Markdown meanings: + +. +\*not emphasized* +\<br/> not a tag +\[not a link](/foo) +\`not code` +1\. not a list +\* not a list +\# not a header +\[foo]: /url "not a reference" +. +<p>*not emphasized* +<br/> not a tag +[not a link](/foo) +`not code` +1. not a list +* not a list +# not a header +[foo]: /url "not a reference"</p> +. + +If a backslash is itself escaped, the following character is not: + +. +\\*emphasis* +. +<p>\<em>emphasis</em></p> +. + +A backslash at the end of the line is a hard line break: + +. +foo\ +bar +. +<p>foo<br /> +bar</p> +. + +Backslash escapes do not work in code blocks, code spans, autolinks, or +raw HTML: + +. +`` \[\` `` +. +<p><code>\[\`</code></p> +. + +. + \[\] +. +<pre><code>\[\] +</code></pre> +. + +. +~~~ +\[\] +~~~ +. +<pre><code>\[\] +</code></pre> +. + +. +<http://google.com?find=\*> +. +<p><a href="http://google.com?find=%5C*">http://google.com?find=\*</a></p> +. + +. +<a href="/bar\/)"> +. +<p><a href="/bar\/)"></p> +. + +But they work in all other contexts, including URLs and link titles, +link references, and info strings in [fenced code +blocks](#fenced-code-block): + +. +[foo](/bar\* "ti\*tle") +. +<p><a href="/bar*" title="ti*tle">foo</a></p> +. + +. +[foo] + +[foo]: /bar\* "ti\*tle" +. +<p><a href="/bar*" title="ti*tle">foo</a></p> +. + +. +``` foo\+bar +foo +``` +. +<pre><code class="language-foo+bar">foo +</code></pre> +. + + +## Entities + +With the goal of making this standard as HTML-agnostic as possible, all HTML valid HTML Entities in any +context are recognized as such and converted into their actual values (i.e. the UTF8 characters representing +the entity itself) before they are stored in the AST. 
+ +This allows implementations that target HTML output to trivially escape the entities when generating HTML, +and simplifies the job of implementations targetting other languages, as these will only need to handle the +UTF8 chars and need not be HTML-entity aware. + +[Named entities](#name-entities) <a id="named-entities"></a> consist of `&` ++ any of the valid HTML5 entity names + `;`. The [following document](http://www.whatwg.org/specs/web-apps/current-work/multipage/entities.json) +is used as an authoritative source of the valid entity names and their corresponding codepoints. + +Conforming implementations that target Markdown don't need to generate entities for all the valid +named entities that exist, with the exception of `"` (`"`), `&` (`&`), `<` (`<`) and `>` (`>`), +which always need to be written as entities for security reasons. + +. + & © Æ Ď ¾ ℋ ⅆ ∲ +. +<p> & © Æ Ď ¾ ℋ ⅆ ∲</p> +. + +[Decimal entities](#decimal-entities) <a id="decimal-entities"></a> +consist of `&#` + a string of 1--8 arabic digits + `;`. Again, these entities need to be recognised +and tranformed into their corresponding UTF8 codepoints. Invalid Unicode codepoints will be written +as the "unknown codepoint" character (`0xFFFD`) + +. +# Ӓ Ϡ � +. +<p># Ӓ Ϡ �</p> +. + +[Hexadecimal entities](#hexadecimal-entities) <a id="hexadecimal-entities"></a> +consist of `&#` + either `X` or `x` + a string of 1-8 hexadecimal digits ++ `;`. They will also be parsed and turned into their corresponding UTF8 values in the AST. + +. +" ആ ಫ +. +<p>" ആ ಫ</p> +. + +Here are some nonentities: + +. +  &x; &#; &#x; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?; +. +<p>&nbsp &x; &#; &#x; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?;</p> +. + +Although HTML5 does accept some entities without a trailing semicolon +(such as `©`), these are not recognized as entities here, because it makes the grammar too ambiguous: + +. +© +. +<p>&copy</p> +. 
+ +Strings that are not on the list of HTML5 named entities are not recognized as entities either: + +. +&MadeUpEntity; +. +<p>&MadeUpEntity;</p> +. + +Entities are recognized in any context besides code spans or +code blocks, including raw HTML, URLs, [link titles](#link-title), and +[fenced code block](#fenced-code-block) info strings: + +. +<a href="öö.html"> +. +<p><a href="öö.html"></p> +. + +. +[foo](/föö "föö") +. +<p><a href="/f%C3%B6%C3%B6" title="föö">foo</a></p> +. + +. +[foo] + +[foo]: /föö "föö" +. +<p><a href="/f%C3%B6%C3%B6" title="föö">foo</a></p> +. + +. +``` föö +foo +``` +. +<pre><code class="language-föö">foo +</code></pre> +. + +Entities are treated as literal text in code spans and code blocks: + +. +`föö` +. +<p><code>f&ouml;&ouml;</code></p> +. + +. + föfö +. +<pre><code>f&ouml;f&ouml; +</code></pre> +. + +## Code span + +A [backtick string](#backtick-string) <a id="backtick-string"></a> +is a string of one or more backtick characters (`` ` ``) that is neither +preceded nor followed by a backtick. + +A code span begins with a backtick string and ends with a backtick +string of equal length. The contents of the code span are the +characters between the two backtick strings, with leading and trailing +spaces and newlines removed, and consecutive spaces and newlines +collapsed to single spaces. + +This is a simple code span: + +. +`foo` +. +<p><code>foo</code></p> +. + +Here two backticks are used, because the code contains a backtick. +This example also illustrates stripping of leading and trailing spaces: + +. +`` foo ` bar `` +. +<p><code>foo ` bar</code></p> +. + +This example shows the motivation for stripping leading and trailing +spaces: + +. +` `` ` +. +<p><code>``</code></p> +. + +Newlines are treated like spaces: + +. +`` +foo +`` +. +<p><code>foo</code></p> +. + +Interior spaces and newlines are collapsed into single spaces, just +as they would be by a browser: + +. +`foo bar + baz` +. +<p><code>foo bar baz</code></p> +. 
+ +Q: Why not just leave the spaces, since browsers will collapse them +anyway? A: Because we might be targeting a non-HTML format, and we +shouldn't rely on HTML-specific rendering assumptions. + +(Existing implementations differ in their treatment of internal +spaces and newlines. Some, including `Markdown.pl` and +`showdown`, convert an internal newline into a `<br />` tag. +But this makes things difficult for those who like to hard-wrap +their paragraphs, since a line break in the midst of a code +span will cause an unintended line break in the output. Others +just leave internal spaces as they are, which is fine if only +HTML is being targeted.) + +. +`foo `` bar` +. +<p><code>foo `` bar</code></p> +. + +Note that backslash escapes do not work in code spans. All backslashes +are treated literally: + +. +`foo\`bar` +. +<p><code>foo\</code>bar`</p> +. + +Backslash escapes are never needed, because one can always choose a +string of *n* backtick characters as delimiters, where the code does +not contain any strings of exactly *n* backtick characters. + +Code span backticks have higher precedence than any other inline +constructs except HTML tags and autolinks. Thus, for example, this is +not parsed as emphasized text, since the second `*` is part of a code +span: + +. +*foo`*` +. +<p>*foo<code>*</code></p> +. + +And this is not parsed as a link: + +. +[not a `link](/foo`) +. +<p>[not a <code>link](/foo</code>)</p> +. + +But this is a link: + +. +<http://foo.bar.`baz>` +. +<p><a href="http://foo.bar.%60baz">http://foo.bar.`baz</a>`</p> +. + +And this is an HTML tag: + +. +<a href="`">` +. +<p><a href="`">`</p> +. + +When a backtick string is not closed by a matching backtick string, +we just have literal backticks: + +. +```foo`` +. +<p>```foo``</p> +. + +. +`foo +. +<p>`foo</p> +. 
+ +## Emphasis and strong emphasis + +John Gruber's original [Markdown syntax +description](http://daringfireball.net/projects/markdown/syntax#em) says: + +> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of +> emphasis. Text wrapped with one `*` or `_` will be wrapped with an HTML +> `<em>` tag; double `*`'s or `_`'s will be wrapped with an HTML `<strong>` +> tag. + +This is enough for most users, but these rules leave much undecided, +especially when it comes to nested emphasis. The original +`Markdown.pl` test suite makes it clear that triple `***` and +`___` delimiters can be used for strong emphasis, and most +implementations have also allowed the following patterns: + +``` markdown +***strong emph*** +***strong** in emph* +***emph* in strong** +**in strong *emph*** +*in emph **strong*** +``` + +The following patterns are less widely supported, but the intent +is clear and they are useful (especially in contexts like bibliography +entries): + +``` markdown +*emph *with emph* in it* +**strong **with strong** in it** +``` + +Many implementations have also restricted intraword emphasis to +the `*` forms, to avoid unwanted emphasis in words containing +internal underscores. (It is best practice to put these in code +spans, but users often do not.) + +``` markdown +internal emphasis: foo*bar*baz +no emphasis: foo_bar_baz +``` + +The following rules capture all of these patterns, while allowing +for efficient parsing strategies that do not backtrack: + +1. A single `*` character [can open emphasis](#can-open-emphasis) + <a id="can-open-emphasis"></a> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, + (b) it is not followed by whitespace, and + (c) either it is not followed by a `*` character or it is + followed immediately by strong emphasis. + +2. 
A single `_` character [can open emphasis](#can-open-emphasis) iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not followed by whitespace, + (c) it is not preceded by an ASCII alphanumeric character, and + (d) either it is not followed by a `_` character or it is + followed immediately by strong emphasis. + +3. A single `*` character [can close emphasis](#can-close-emphasis) + <a id="can-close-emphasis"></a> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, and + (b) it is not preceded by whitespace. + +4. A single `_` character [can close emphasis](#can-close-emphasis) iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not preceded by whitespace, and + (c) it is not followed by an ASCII alphanumeric character. + +5. A double `**` [can open strong emphasis](#can-open-strong-emphasis) + <a id="can-open-strong-emphasis" ></a> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, + (b) it is not followed by whitespace, and + (c) either it is not followed by a `*` character or it is + followed immediately by emphasis. + +6. A double `__` [can open strong emphasis](#can-open-strong-emphasis) + iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not followed by whitespace, and + (c) it is not preceded by an ASCII alphanumeric character, and + (d) either it is not followed by a `_` character or it is + followed immediately by emphasis. + +7. A double `**` [can close strong emphasis](#can-close-strong-emphasis) + <a id="can-close-strong-emphasis" ></a> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, and + (b) it is not preceded by whitespace. + +8. A double `__` [can close strong emphasis](#can-close-strong-emphasis) + iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not preceded by whitespace, and + (c) it is not followed by an ASCII alphanumeric character. + +9. 
Emphasis begins with a delimiter that [can open + emphasis](#can-open-emphasis) and includes inlines parsed + sequentially until a delimiter that [can close + emphasis](#can-close-emphasis), and that uses the same + character (`_` or `*`) as the opening delimiter, is reached. + +10. Strong emphasis begins with a delimiter that [can open strong + emphasis](#can-open-strong-emphasis) and includes inlines parsed + sequentially until a delimiter that [can close strong + emphasis](#can-close-strong-emphasis), and that uses the + same character (`_` or `*`) as the opening delimiter, is reached. + +These rules can be illustrated through a series of examples. + +Simple emphasis: + +. +*foo bar* +. +<p><em>foo bar</em></p> +. + +. +_foo bar_ +. +<p><em>foo bar</em></p> +. + +Simple strong emphasis: + +. +**foo bar** +. +<p><strong>foo bar</strong></p> +. + +. +__foo bar__ +. +<p><strong>foo bar</strong></p> +. + +Emphasis can continue over line breaks: + +. +*foo +bar* +. +<p><em>foo +bar</em></p> +. + +. +_foo +bar_ +. +<p><em>foo +bar</em></p> +. + +. +**foo +bar** +. +<p><strong>foo +bar</strong></p> +. + +. +__foo +bar__ +. +<p><strong>foo +bar</strong></p> +. + +Emphasis can contain other inline constructs: + +. +*foo [bar](/url)* +. +<p><em>foo <a href="/url">bar</a></em></p> +. + +. +_foo [bar](/url)_ +. +<p><em>foo <a href="/url">bar</a></em></p> +. + +. +**foo [bar](/url)** +. +<p><strong>foo <a href="/url">bar</a></strong></p> +. + +. +__foo [bar](/url)__ +. +<p><strong>foo <a href="/url">bar</a></strong></p> +. + +Symbols contained in other inline constructs will not +close emphasis: + +. +*foo [bar*](/url) +. +<p>*foo <a href="/url">bar*</a></p> +. + +. +_foo [bar_](/url) +. +<p>_foo <a href="/url">bar_</a></p> +. + +. +**<a href="**"> +. +<p>**<a href="**"></p> +. + +. +__<a href="__"> +. +<p>__<a href="__"></p> +. + +. +*a `*`* +. +<p><em>a <code>*</code></em></p> +. + +. +_a `_`_ +. +<p><em>a <code>_</code></em></p> +. + +. +**a<http://foo.bar?q=**> +. 
+<p>**a<a href="http://foo.bar?q=**">http://foo.bar?q=**</a></p> +. + +. +__a<http://foo.bar?q=__> +. +<p>__a<a href="http://foo.bar?q=__">http://foo.bar?q=__</a></p> +. + +This is not emphasis, because the opening delimiter is +followed by white space: + +. +and * foo bar* +. +<p>and * foo bar*</p> +. + +. +_ foo bar_ +. +<p>_ foo bar_</p> +. + +. +and ** foo bar** +. +<p>and ** foo bar**</p> +. + +. +__ foo bar__ +. +<p>__ foo bar__</p> +. + +This is not emphasis, because the closing delimiter is +preceded by white space: + +. +and *foo bar * +. +<p>and *foo bar *</p> +. + +. +and _foo bar _ +. +<p>and _foo bar _</p> +. + +. +and **foo bar ** +. +<p>and **foo bar **</p> +. + +. +and __foo bar __ +. +<p>and __foo bar __</p> +. + +The rules imply that a sequence of four or more unescaped `*` or +`_` characters will always be parsed as a literal string: + +. +****hi**** +. +<p>****hi****</p> +. + +. +_____hi_____ +. +<p>_____hi_____</p> +. + +. +Sign here: _________ +. +<p>Sign here: _________</p> +. + +The rules also imply that there can be no empty emphasis or strong +emphasis: + +. +** is not an empty emphasis +. +<p>** is not an empty emphasis</p> +. + +. +**** is not an empty strong emphasis +. +<p>**** is not an empty strong emphasis</p> +. + +To include `*` or `_` in emphasized sections, use backslash escapes +or code spans: + +. +*here is a \** +. +<p><em>here is a *</em></p> +. + +. +__this is a double underscore (`__`)__ +. +<p><strong>this is a double underscore (<code>__</code>)</strong></p> +. + +`*` delimiters allow intra-word emphasis; `_` delimiters do not: + +. +foo*bar*baz +. +<p>foo<em>bar</em>baz</p> +. + +. +foo_bar_baz +. +<p>foo_bar_baz</p> +. + +. +foo__bar__baz +. +<p>foo__bar__baz</p> +. + +. +_foo_bar_baz_ +. +<p><em>foo_bar_baz</em></p> +. + +. +11*15*32 +. +<p>11<em>15</em>32</p> +. + +. +11_15_32 +. +<p>11_15_32</p> +. + +Internal underscores will be ignored in underscore-delimited +emphasis: + +. +_foo_bar_baz_ +. 
+<p><em>foo_bar_baz</em></p> +. + +. +__foo__bar__baz__ +. +<p><strong>foo__bar__baz</strong></p> +. + +The rules are sufficient for the following nesting patterns: + +. +***foo bar*** +. +<p><strong><em>foo bar</em></strong></p> +. + +. +___foo bar___ +. +<p><strong><em>foo bar</em></strong></p> +. + +. +***foo** bar* +. +<p><em><strong>foo</strong> bar</em></p> +. + +. +___foo__ bar_ +. +<p><em><strong>foo</strong> bar</em></p> +. + +. +***foo* bar** +. +<p><strong><em>foo</em> bar</strong></p> +. + +. +___foo_ bar__ +. +<p><strong><em>foo</em> bar</strong></p> +. + +. +*foo **bar*** +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +_foo __bar___ +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +**foo *bar*** +. +<p><strong>foo <em>bar</em></strong></p> +. + +. +__foo _bar___ +. +<p><strong>foo <em>bar</em></strong></p> +. + +. +*foo **bar*** +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +_foo __bar___ +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +*foo *bar* baz* +. +<p><em>foo <em>bar</em> baz</em></p> +. + +. +_foo _bar_ baz_ +. +<p><em>foo <em>bar</em> baz</em></p> +. + +. +**foo **bar** baz** +. +<p><strong>foo <strong>bar</strong> baz</strong></p> +. + +. +__foo __bar__ baz__ +. +<p><strong>foo <strong>bar</strong> baz</strong></p> +. + +. +*foo **bar** baz* +. +<p><em>foo <strong>bar</strong> baz</em></p> +. + +. +_foo __bar__ baz_ +. +<p><em>foo <strong>bar</strong> baz</em></p> +. + +. +**foo *bar* baz** +. +<p><strong>foo <em>bar</em> baz</strong></p> +. + +. +__foo _bar_ baz__ +. +<p><strong>foo <em>bar</em> baz</strong></p> +. + +Note that you cannot nest emphasis directly inside emphasis +using the same delimeter, or strong emphasis directly inside +strong emphasis: + +. +**foo** +. +<p><strong>foo</strong></p> +. + +. +****foo**** +. +<p>****foo****</p> +. + +For these nestings, you need to switch delimiters: + +. +*_foo_* +. +<p><em><em>foo</em></em></p> +. + +. +**__foo__** +. +<p><strong><strong>foo</strong></strong></p> +. 
+ +Note that a `*` followed by a `*` can close emphasis, and +a `**` followed by a `*` can close strong emphasis (and +similarly for `_` and `__`): + +. +*foo** +. +<p><em>foo</em>*</p> +. + +. +*foo *bar** +. +<p><em>foo <em>bar</em></em></p> +. + +. +**foo*** +. +<p><strong>foo</strong>*</p> +. + +. +***foo* bar*** +. +<p><strong><em>foo</em> bar</strong>*</p> +. + +. +***foo** bar*** +. +<p><em><strong>foo</strong> bar</em>**</p> +. + +The following contains no strong emphasis, because the opening +delimiter is closed by the first `*` before `bar`: + +. +*foo**bar*** +. +<p><em>foo</em><em>bar</em>**</p> +. + +However, a string of four or more `****` can never close emphasis: + +. +*foo**** +. +<p>*foo****</p> +. + +Note that there are some asymmetries here: + +. +*foo** + +**foo* +. +<p><em>foo</em>*</p> +<p>**foo*</p> +. + +. +*foo *bar** + +**foo* bar* +. +<p><em>foo <em>bar</em></em></p> +<p>**foo* bar*</p> +. + +More cases with mismatched delimiters: + +. +**foo* bar* +. +<p>**foo* bar*</p> +. + +. +*bar*** +. +<p><em>bar</em>**</p> +. + +. +***foo* +. +<p>***foo*</p> +. + +. +**bar*** +. +<p><strong>bar</strong>*</p> +. + +. +***foo** +. +<p>***foo**</p> +. + +. +***foo *bar* +. +<p>***foo <em>bar</em></p> +. + +## Links + +A link contains a [link label](#link-label) (the visible text), +a [destination](#destination) (the URI that is the link destination), +and optionally a [link title](#link-title). There are two basic kinds +of links in Markdown. In [inline links](#inline-links) the destination +and title are given immediately after the label. In [reference +links](#reference-links) the destination and title are defined elsewhere +in the document. + +A [link label](#link-label) <a id="link-label"></a> consists of + +- an opening `[`, followed by +- zero or more backtick code spans, autolinks, HTML tags, link labels, + backslash-escaped ASCII punctuation characters, or non-`]` characters, + followed by +- a closing `]`. 
+ +These rules are motivated by the following intuitive ideas: + +- A link label is a container for inline elements. +- The square brackets bind more tightly than emphasis markers, + but less tightly than `<>` or `` ` ``. +- Link labels may contain material in matching square brackets. + +A [link destination](#link-destination) <a id="link-destination"></a> +consists of either + +- a sequence of zero or more characters between an opening `<` and a + closing `>` that contains no line breaks or unescaped `<` or `>` + characters, or + +- a nonempty sequence of characters that does not include + ASCII space or control characters, and includes parentheses + only if (a) they are backslash-escaped or (b) they are part of + a balanced pair of unescaped parentheses that is not itself + inside a balanced pair of unescaped parentheses. + +A [link title](#link-title) <a id="link-title"></a> consists of either + +- a sequence of zero or more characters between straight double-quote + characters (`"`), including a `"` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between straight single-quote + characters (`'`), including a `'` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between matching parentheses + (`(...)`), including a `)` character only if it is backslash-escaped. + +An [inline link](#inline-link) <a id="inline-link"></a> +consists of a [link label](#link-label) followed immediately +by a left parenthesis `(`, optional whitespace, +an optional [link destination](#link-destination), +an optional [link title](#link-title) separated from the link +destination by whitespace, optional whitespace, and a right +parenthesis `)`. The link's text consists of the label (excluding +the enclosing square brackets) parsed as inlines. The link's +URI consists of the link destination, excluding enclosing `<...>` if +present, with backslash-escapes in effect as described above.
The +link's title consists of the link title, excluding its enclosing +delimiters, with backslash-escapes in effect as described above. + +Here is a simple inline link: + +. +[link](/uri "title") +. +<p><a href="/uri" title="title">link</a></p> +. + +The title may be omitted: + +. +[link](/uri) +. +<p><a href="/uri">link</a></p> +. + +Both the title and the destination may be omitted: + +. +[link]() +. +<p><a href="">link</a></p> +. + +. +[link](<>) +. +<p><a href="">link</a></p> +. + + +If the destination contains spaces, it must be enclosed in pointy +braces: + +. +[link](/my uri) +. +<p>[link](/my uri)</p> +. + +. +[link](</my uri>) +. +<p><a href="/my%20uri">link</a></p> +. + +The destination cannot contain line breaks, even with pointy braces: + +. +[link](foo +bar) +. +<p>[link](foo +bar)</p> +. + +One level of balanced parentheses is allowed without escaping: + +. +[link]((foo)and(bar)) +. +<p><a href="(foo)and(bar)">link</a></p> +. + +However, if you have parentheses within parentheses, you need to escape +or use the `<...>` form: + +. +[link](foo(and(bar))) +. +<p>[link](foo(and(bar)))</p> +. + +. +[link](foo(and\(bar\))) +. +<p><a href="foo(and(bar))">link</a></p> +. + +. +[link](<foo(and(bar))>) +. +<p><a href="foo(and(bar))">link</a></p> +. + +Parentheses and other symbols can also be escaped, as usual +in Markdown: + +. +[link](foo\)\:) +. +<p><a href="foo):">link</a></p> +. + +URL-escaping should be left alone inside the destination, as all URL-escaped characters +are also valid URL characters. HTML entities in the destination will be parsed into their UTF8 +codepoints, as usual, and optionally URL-escaped when written as HTML. + +. +[link](foo%20bä) +. +<p><a href="foo%20b%C3%A4">link</a></p> +. + +Note that, because titles can often be parsed as destinations, +if you try to omit the destination and keep the title, you'll +get unexpected results: + +. +[link]("title") +. +<p><a href="%22title%22">link</a></p> +.
+ +Titles may be in single quotes, double quotes, or parentheses: + +. +[link](/url "title") +[link](/url 'title') +[link](/url (title)) +. +<p><a href="/url" title="title">link</a> +<a href="/url" title="title">link</a> +<a href="/url" title="title">link</a></p> +. + +Backslash escapes and entities may be used in titles: + +. +[link](/url "title \""") +. +<p><a href="/url" title="title """>link</a></p> +. + +Nested balanced quotes are not allowed without escaping: + +. +[link](/url "title "and" title") +. +<p>[link](/url "title "and" title")</p> +. + +But it is easy to work around this by using a different quote type: + +. +[link](/url 'title "and" title') +. +<p><a href="/url" title="title "and" title">link</a></p> +. + +(Note: `Markdown.pl` did allow double quotes inside a double-quoted +title, and its test suite included a test demonstrating this. +But it is hard to see a good rationale for the extra complexity this +brings, since there are already many ways---backslash escaping, +entities, or using a different quote type for the enclosing title---to +write titles containing double quotes. `Markdown.pl`'s handling of +titles has a number of other strange features. For example, it allows +single-quoted titles in inline links, but not reference links. And, in +reference links but not inline links, it allows a title to begin with +`"` and end with `)`. `Markdown.pl` 1.0.1 even allows titles with no closing +quotation mark, though 1.0.2b8 does not. It seems preferable to adopt +a simple, rational rule that works the same way in inline links and +link reference definitions.) + +Whitespace is allowed around the destination and title: + +. +[link]( /uri + "title" ) +. +<p><a href="/uri" title="title">link</a></p> +. + +But it is not allowed between the link label and the +following parenthesis: + +. +[link] (/uri) +. +<p>[link] (/uri)</p> +. + +Note that this is not a link, because the closing `]` occurs in +an HTML tag: + +. +[foo <bar attr="](baz)"> +. 
+<p>[foo <bar attr="](baz)"></p> +. + + +There are three kinds of [reference links](#reference-link): +<a id="reference-link"></a> + +A [full reference link](#full-reference-link) <a id="full-reference-link"></a> +consists of a [link label](#link-label), optional whitespace, and +another [link label](#link-label) that [matches](#matches) a +[link reference definition](#link-reference-definition) elsewhere in the +document. + +One label [matches](#matches) <a id="matches"></a> +another just in case their normalized forms are equal. To normalize a +label, perform the *unicode case fold* and collapse consecutive internal +whitespace to a single space. If there are multiple matching reference +link definitions, the one that comes first in the document is used. (It +is desirable in such cases to emit a warning.) + +The contents of the first link label are parsed as inlines, which are +used as the link's text. The link's URI and title are provided by the +matching [link reference definition](#link-reference-definition). + +Here is a simple example: + +. +[foo][bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +The first label can contain inline content: + +. +[*foo\!*][bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo!</em></a></p> +. + +Matching is case-insensitive: + +. +[foo][BaR] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +Unicode case fold is used: + +. +[Толпой][Толпой] is a Russian word. + +[ТОЛПОЙ]: /url +. +<p><a href="/url">Толпой</a> is a Russian word.</p> +. + +Consecutive internal whitespace is treated as one space for +purposes of determining matching: + +. +[Foo + bar]: /url + +[Baz][Foo bar] +. +<p><a href="/url">Baz</a></p> +. + +There can be whitespace between the two labels: + +. +[foo] [bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +. +[foo] +[bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. 
+ +When there are multiple matching [link reference +definitions](#link-reference-definition), the first is used: + +. +[foo]: /url1 + +[foo]: /url2 + +[bar][foo] +. +<p><a href="/url1">bar</a></p> +. + +Note that matching is performed on normalized strings, not parsed +inline content. So the following does not match, even though the +labels define equivalent inline content: + +. +[bar][foo\!] + +[foo!]: /url +. +<p>[bar][foo!]</p> +. + +A [collapsed reference link](#collapsed-reference-link) +<a id="collapsed-reference-link"></a> consists of a [link +label](#link-label) that [matches](#matches) a [link reference +definition](#link-reference-definition) elsewhere in the +document, optional whitespace, and the string `[]`. The contents of the +first link label are parsed as inlines, which are used as the link's +text. The link's URI and title are provided by the matching reference +link definition. Thus, `[foo][]` is equivalent to `[foo][foo]`. + +. +[foo][] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +. +[*foo* bar][] + +[*foo* bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo</em> bar</a></p> +. + +The link labels are case-insensitive: + +. +[Foo][] + +[foo]: /url "title" +. +<p><a href="/url" title="title">Foo</a></p> +. + + +As with full reference links, whitespace is allowed +between the two sets of brackets: + +. +[foo] +[] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +A [shortcut reference link](#shortcut-reference-link) +<a id="shortcut-reference-link"></a> consists of a [link +label](#link-label) that [matches](#matches) a [link reference +definition](#link-reference-definition) elsewhere in the +document and is not followed by `[]` or a link label. +The contents of the first link label are parsed as inlines, +which are used as the link's text. The link's URI and title +are provided by the matching link reference definition. +Thus, `[foo]` is equivalent to `[foo][]`. + +.
+[foo] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +. +[*foo* bar] + +[*foo* bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo</em> bar</a></p> +. + +. +[[*foo* bar]] + +[*foo* bar]: /url "title" +. +<p>[<a href="/url" title="title"><em>foo</em> bar</a>]</p> +. + +The link labels are case-insensitive: + +. +[Foo] + +[foo]: /url "title" +. +<p><a href="/url" title="title">Foo</a></p> +. + +If you just want bracketed text, you can backslash-escape the +opening bracket to avoid links: + +. +\[foo] + +[foo]: /url "title" +. +<p>[foo]</p> +. + +Note that this is a link, because link labels bind more tightly +than emphasis: + +. +[foo*]: /url + +*[foo*] +. +<p>*<a href="/url">foo*</a></p> +. + +However, this is not, because link labels bind less +tightly than code backticks: + +. +[foo`]: /url + +[foo`]` +. +<p>[foo<code>]</code></p> +. + +Link labels can contain matched square brackets: + +. +[[[foo]]] + +[[[foo]]]: /url +. +<p><a href="/url">[[foo]]</a></p> +. + +. +[[[foo]]] + +[[[foo]]]: /url1 +[foo]: /url2 +. +<p><a href="/url1">[[foo]]</a></p> +. + +For non-matching brackets, use backslash escapes: + +. +[\[foo] + +[\[foo]: /url +. +<p><a href="/url">[foo</a></p> +. + +Full references take precedence over shortcut references: + +. +[foo][bar] + +[foo]: /url1 +[bar]: /url2 +. +<p><a href="/url2">foo</a></p> +. + +In the following case `[bar][baz]` is parsed as a reference, +`[foo]` as normal text: + +. +[foo][bar][baz] + +[baz]: /url +. +<p>[foo]<a href="/url">bar</a></p> +. + +Here, though, `[foo][bar]` is parsed as a reference, since +`[bar]` is defined: + +. +[foo][bar][baz] + +[baz]: /url1 +[bar]: /url2 +. +<p><a href="/url2">foo</a><a href="/url1">baz</a></p> +. + +Here `[foo]` is not parsed as a shortcut reference, because it +is followed by a link label (even though `[bar]` is not defined): + +. +[foo][bar][baz] + +[baz]: /url1 +[foo]: /url2 +. +<p>[foo]<a href="/url1">bar</a></p> +. 
+ + +## Images + +An (unescaped) exclamation mark (`!`) followed by a reference or +inline link will be parsed as an image. The link label will be +used as the image's alt text, and the link title, if any, will +be used as the image's title. + +. +![foo](/url "title") +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +. +![foo *bar*] + +[foo *bar*]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +. + +. +![foo *bar*][] + +[foo *bar*]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +. + +. +![foo *bar*][foobar] + +[FOOBAR]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +. + +. +![foo](train.jpg) +. +<p><img src="train.jpg" alt="foo" /></p> +. + +. +My ![foo bar](/path/to/train.jpg "title" ) +. +<p>My <img src="/path/to/train.jpg" alt="foo bar" title="title" /></p> +. + +. +![foo](<url>) +. +<p><img src="url" alt="foo" /></p> +. + +. +![](/url) +. +<p><img src="/url" alt="" /></p> +. + +Reference-style: + +. +![foo] [bar] + +[bar]: /url +. +<p><img src="/url" alt="foo" /></p> +. + +. +![foo] [bar] + +[BAR]: /url +. +<p><img src="/url" alt="foo" /></p> +. + +Collapsed: + +. +![foo][] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +. +![*foo* bar][] + +[*foo* bar]: /url "title" +. +<p><img src="/url" alt="<em>foo</em> bar" title="title" /></p> +. + +The labels are case-insensitive: + +. +![Foo][] + +[foo]: /url "title" +. +<p><img src="/url" alt="Foo" title="title" /></p> +. + +As with full reference links, whitespace is allowed +between the two sets of brackets: + +. +![foo] +[] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +Shortcut: + +. +![foo] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +. +![*foo* bar] + +[*foo* bar]: /url "title" +. 
+<p><img src="/url" alt="<em>foo</em> bar" title="title" /></p> +. + +. +![[foo]] + +[[foo]]: /url "title" +. +<p><img src="/url" alt="[foo]" title="title" /></p> +. + +The link labels are case-insensitive: + +. +![Foo] + +[foo]: /url "title" +. +<p><img src="/url" alt="Foo" title="title" /></p> +. + +If you just want bracketed text, you can backslash-escape the +opening `!` and `[`: + +. +\!\[foo] + +[foo]: /url "title" +. +<p>![foo]</p> +. + +If you want a link after a literal `!`, backslash-escape the +`!`: + +. +\![foo] + +[foo]: /url "title" +. +<p>!<a href="/url" title="title">foo</a></p> +. + +## Autolinks + +Autolinks are absolute URIs and email addresses inside `<` and `>`. +They are parsed as links, with the URL or email address as the link +label. + +A [URI autolink](#uri-autolink) <a id="uri-autolink"></a> +consists of `<`, followed by an [absolute +URI](#absolute-uri) not containing `<`, followed by `>`. It is parsed +as a link to the URI, with the URI as the link's label. + +An [absolute URI](#absolute-uri), <a id="absolute-uri"></a> +for these purposes, consists of a [scheme](#scheme) followed by a colon (`:`) +followed by zero or more characters other than ASCII whitespace and +control characters, `<`, and `>`. If the URI includes these characters, +you must use percent-encoding (e.g. `%20` for a space). 
+ +The following [schemes](#scheme) <a id="scheme"></a> +are recognized (case-insensitive): +`coap`, `doi`, `javascript`, `aaa`, `aaas`, `about`, `acap`, `cap`, +`cid`, `crid`, `data`, `dav`, `dict`, `dns`, `file`, `ftp`, `geo`, `go`, +`gopher`, `h323`, `http`, `https`, `iax`, `icap`, `im`, `imap`, `info`, +`ipp`, `iris`, `iris.beep`, `iris.xpc`, `iris.xpcs`, `iris.lwz`, `ldap`, +`mailto`, `mid`, `msrp`, `msrps`, `mtqp`, `mupdate`, `news`, `nfs`, +`ni`, `nih`, `nntp`, `opaquelocktoken`, `pop`, `pres`, `rtsp`, +`service`, `session`, `shttp`, `sieve`, `sip`, `sips`, `sms`, `snmp`,` +soap.beep`, `soap.beeps`, `tag`, `tel`, `telnet`, `tftp`, `thismessage`, +`tn3270`, `tip`, `tv`, `urn`, `vemmi`, `ws`, `wss`, `xcon`, +`xcon-userid`, `xmlrpc.beep`, `xmlrpc.beeps`, `xmpp`, `z39.50r`, +`z39.50s`, `adiumxtra`, `afp`, `afs`, `aim`, `apt`,` attachment`, `aw`, +`beshare`, `bitcoin`, `bolo`, `callto`, `chrome`,` chrome-extension`, +`com-eventbrite-attendee`, `content`, `cvs`,` dlna-playsingle`, +`dlna-playcontainer`, `dtn`, `dvb`, `ed2k`, `facetime`, `feed`, +`finger`, `fish`, `gg`, `git`, `gizmoproject`, `gtalk`, `hcp`, `icon`, +`ipn`, `irc`, `irc6`, `ircs`, `itms`, `jar`, `jms`, `keyparc`, `lastfm`, +`ldaps`, `magnet`, `maps`, `market`,` message`, `mms`, `ms-help`, +`msnim`, `mumble`, `mvn`, `notes`, `oid`, `palm`, `paparazzi`, +`platform`, `proxy`, `psyc`, `query`, `res`, `resource`, `rmi`, `rsync`, +`rtmp`, `secondlife`, `sftp`, `sgn`, `skype`, `smb`, `soldat`, +`spotify`, `ssh`, `steam`, `svn`, `teamspeak`, `things`, `udp`, +`unreal`, `ut2004`, `ventrilo`, `view-source`, `webcal`, `wtai`, +`wyciwyg`, `xfire`, `xri`, `ymsgr`. + +Here are some valid autolinks: + +. +<http://foo.bar.baz> +. +<p><a href="http://foo.bar.baz">http://foo.bar.baz</a></p> +. + +. +<http://foo.bar.baz?q=hello&id=22&boolean> +. +<p><a href="http://foo.bar.baz?q=hello&id=22&boolean">http://foo.bar.baz?q=hello&id=22&boolean</a></p> +. + +. +<irc://foo.bar:2233/baz> +. 
+<p><a href="irc://foo.bar:2233/baz">irc://foo.bar:2233/baz</a></p> +. + +Uppercase is also fine: + +. +<MAILTO:FOO@BAR.BAZ> +. +<p><a href="MAILTO:FOO@BAR.BAZ">MAILTO:FOO@BAR.BAZ</a></p> +. + +Spaces are not allowed in autolinks: + +. +<http://foo.bar/baz bim> +. +<p><http://foo.bar/baz bim></p> +. + +An [email autolink](#email-autolink) <a id="email-autolink"></a> +consists of `<`, followed by an [email address](#email-address), +followed by `>`. The link's label is the email address, +and the URL is `mailto:` followed by the email address. + +An [email address](#email-address), <a id="email-address"></a> +for these purposes, is anything that matches +the [non-normative regex from the HTML5 +spec](http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#e-mail-state-%28type=email%29): + + /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? + (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ + +Examples of email autolinks: + +. +<foo@bar.baz.com> +. +<p><a href="mailto:foo@bar.baz.com">foo@bar.baz.com</a></p> +. + +. +<foo+special@Bar.baz-bar0.com> +. +<p><a href="mailto:foo+special@Bar.baz-bar0.com">foo+special@Bar.baz-bar0.com</a></p> +. + +These are not autolinks: + +. +<> +. +<p><></p> +. + +. +<heck://bing.bong> +. +<p><heck://bing.bong></p> +. + +. +< http://foo.bar > +. +<p>< http://foo.bar ></p> +. + +. +<foo.bar.baz> +. +<p><foo.bar.baz></p> +. + +. +<localhost:5001/foo> +. +<p><localhost:5001/foo></p> +. + +. +http://google.com +. +<p>http://google.com</p> +. + +. +foo@bar.baz.com +. +<p>foo@bar.baz.com</p> +. + +## Raw HTML + +Text between `<` and `>` that looks like an HTML tag is parsed as a +raw HTML tag and will be rendered in HTML without escaping. +Tag and attribute names are not limited to current HTML tags, +so custom tags (and even, say, DocBook tags) may be used. 
+ +Here is the grammar for tags: + +A [tag name](#tag-name) <a id="tag-name"></a> consists of an ASCII letter +followed by zero or more ASCII letters or digits. + +An [attribute](#attribute) <a id="attribute"></a> consists of whitespace, +an **attribute name**, and an optional **attribute value +specification**. + +An [attribute name](#attribute-name) <a id="attribute-name"></a> +consists of an ASCII letter, `_`, or `:`, followed by zero or more ASCII +letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML +specification restricted to ASCII. HTML5 is laxer.) + +An [attribute value specification](#attribute-value-specification) +<a id="attribute-value-specification"></a> consists of optional whitespace, +a `=` character, optional whitespace, and an [attribute +value](#attribute-value). + +An [attribute value](#attribute-value) <a id="attribute-value"></a> +consists of an [unquoted attribute value](#unquoted-attribute-value), +a [single-quoted attribute value](#single-quoted-attribute-value), +or a [double-quoted attribute value](#double-quoted-attribute-value). + +An [unquoted attribute value](#unquoted-attribute-value) +<a id="unquoted-attribute-value"></a> is a nonempty string of characters not +including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. + +A [single-quoted attribute value](#single-quoted-attribute-value) +<a id="single-quoted-attribute-value"></a> consists of `'`, zero or more +characters not including `'`, and a final `'`. + +A [double-quoted attribute value](#double-quoted-attribute-value) +<a id="double-quoted-attribute-value"></a> consists of `"`, zero or more +characters not including `"`, and a final `"`. + +An [open tag](#open-tag) <a id="open-tag"></a> consists of a `<` character, +a [tag name](#tag-name), zero or more [attributes](#attribute), +optional whitespace, an optional `/` character, and a `>` character. 
+ +A [closing tag](#closing-tag) <a id="closing-tag"></a> consists of the +string `</`, a [tag name](#tag-name), optional whitespace, and the +character `>`. + +An [HTML comment](#html-comment) <a id="html-comment"></a> consists of the +string `<!--`, a string of characters not including the string `--`, and +the string `-->`. + +A [processing instruction](#processing-instruction) +<a id="processing-instruction"></a> consists of the string `<?`, a string +of characters not including the string `?>`, and the string +`?>`. + +A [declaration](#declaration) <a id="declaration"></a> consists of the +string `<!`, a name consisting of one or more uppercase ASCII letters, +whitespace, a string of characters not including the character `>`, and +the character `>`. + +A [CDATA section](#cdata-section) <a id="cdata-section"></a> consists of +the string `<![CDATA[`, a string of characters not including the string +`]]>`, and the string `]]>`. + +An [HTML tag](#html-tag) <a id="html-tag"></a> consists of an [open +tag](#open-tag), a [closing tag](#closing-tag), an [HTML +comment](#html-comment), a [processing +instruction](#processing-instruction), a +[declaration](#declaration), or a [CDATA +section](#cdata-section). + +Here are some simple open tags: + +. +<a><bab><c2c> +. +<p><a><bab><c2c></p> +. + +Empty elements: + +. +<a/><b2/> +. +<p><a/><b2/></p> +. + +Whitespace is allowed: + +. +<a /><b2 +data="foo" > +. +<p><a /><b2 +data="foo" ></p> +. + +With attributes: + +. +<a foo="bar" bam = 'baz <em>"</em>' +_boolean zoop:33=zoop:33 /> +. +<p><a foo="bar" bam = 'baz <em>"</em>' +_boolean zoop:33=zoop:33 /></p> +. + +Illegal tag names, not parsed as HTML: + +. +<33> <__> +. +<p><33> <__></p> +. + +Illegal attribute names: + +. +<a h*#ref="hi"> +. +<p><a h*#ref="hi"></p> +. + +Illegal attribute values: + +. +<a href="hi'> <a href=hi'> +. +<p><a href="hi'> <a href=hi'></p> +. + +Illegal whitespace: + +. +< a>< +foo><bar/ > +.
+<p>< a>< +foo><bar/ ></p> +. + +Missing whitespace: + +. +<a href='bar'title=title> +. +<p><a href='bar'title=title></p> +. + +Closing tags: + +. +</a> +</foo > +. +<p></a> +</foo ></p> +. + +Illegal attributes in closing tag: + +. +</a href="foo"> +. +<p></a href="foo"></p> +. + +Comments: + +. +foo <!-- this is a +comment - with hyphen --> +. +<p>foo <!-- this is a +comment - with hyphen --></p> +. + +. +foo <!-- not a comment -- two hyphens --> +. +<p>foo <!-- not a comment -- two hyphens --></p> +. + +Processing instructions: + +. +foo <?php echo $a; ?> +. +<p>foo <?php echo $a; ?></p> +. + +Declarations: + +. +foo <!ELEMENT br EMPTY> +. +<p>foo <!ELEMENT br EMPTY></p> +. + +CDATA sections: + +. +foo <![CDATA[>&<]]> +. +<p>foo <![CDATA[>&<]]></p> +. + +Entities are preserved in HTML attributes: + +. +<a href="ö"> +. +<p><a href="ö"></p> +. + +Backslash escapes do not work in HTML attributes: + +. +<a href="\*"> +. +<p><a href="\*"></p> +. + +. +<a href="\""> +. +<p><a href="""></p> +. + +## Hard line breaks + +A line break (not in a code span or HTML tag) that is preceded +by two or more spaces is parsed as a linebreak (rendered +in HTML as a `<br />` tag): + +. +foo +baz +. +<p>foo<br /> +baz</p> +. + +For a more visible alternative, a backslash before the newline may be +used instead of two spaces: + +. +foo\ +baz +. +<p>foo<br /> +baz</p> +. + +More than two spaces can be used: + +. +foo +baz +. +<p>foo<br /> +baz</p> +. + +Leading spaces at the beginning of the next line are ignored: + +. +foo + bar +. +<p>foo<br /> +bar</p> +. + +. +foo\ + bar +. +<p>foo<br /> +bar</p> +. + +Line breaks can occur inside emphasis, links, and other constructs +that allow inline content: + +. +*foo +bar* +. +<p><em>foo<br /> +bar</em></p> +. + +. +*foo\ +bar* +. +<p><em>foo<br /> +bar</em></p> +. + +Line breaks do not occur inside code spans + +. +`code +span` +. +<p><code>code span</code></p> +. + +. +`code\ +span` +. +<p><code>code\ span</code></p> +. + +or HTML tags: + +. 
+<a href="foo +bar"> +. +<p><a href="foo +bar"></p> +. + +. +<a href="foo\ +bar"> +. +<p><a href="foo\ +bar"></p> +. + +## Soft line breaks + +A regular line break (not in a code span or HTML tag) that is not +preceded by two or more spaces is parsed as a softbreak. (A +softbreak may be rendered in HTML either as a newline or as a space. +The result will be the same in browsers. In the examples here, a +newline will be used.) + +. +foo +baz +. +<p>foo +baz</p> +. + +Spaces at the end of the line and beginning of the next line are +removed: + +. +foo + baz +. +<p>foo +baz</p> +. + +A conforming parser may render a soft line break in HTML either as a +line break or as a space. + +A renderer may also provide an option to render soft line breaks +as hard line breaks. + +## Strings + +Any characters not given an interpretation by the above rules will +be parsed as string content. + +. +hello $.;'there +. +<p>hello $.;'there</p> +. + +. +Foo χρῆν +. +<p>Foo χρῆν</p> +. + +Internal spaces are preserved verbatim: + +. +Multiple spaces +. +<p>Multiple spaces</p> +. + +<!-- END TESTS --> + +# Appendix A: A parsing strategy {-} + +## Overview {-} + +Parsing has two phases: + +1. In the first phase, lines of input are consumed and the block +structure of the document---its division into paragraphs, block quotes, +list items, and so on---is constructed. Text is assigned to these +blocks but not parsed. Link reference definitions are parsed and a +map of links is constructed. + +2. In the second phase, the raw text contents of paragraphs and headers +are parsed into sequences of Markdown inline elements (strings, +code spans, links, emphasis, and so on), using the map of link +references constructed in phase 1. + +## The document tree {-} + +At each point in processing, the document is represented as a tree of +**blocks**. The root of the tree is a `document` block. The `document` +may have any number of other blocks as **children**. 
These children +may, in turn, have other blocks as children. The last child of a block +is normally considered **open**, meaning that subsequent lines of input +can alter its contents. (Blocks that are not open are **closed**.) +Here, for example, is a possible document tree, with the open blocks +marked by arrows: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## How source lines alter the document tree {-} + +Each line that is processed has an effect on this tree. The line is +analyzed and, depending on its contents, the document may be altered +in one or more of the following ways: + +1. One or more open blocks may be closed. +2. One or more new blocks may be created as children of the + last open block. +3. Text may be added to the last (deepest) open block remaining + on the tree. + +Once a line has been incorporated into the tree in this way, +it can be discarded, so input can be read in a stream. + +We can see how this works by considering how the tree above is +generated by four lines of Markdown: + +``` markdown +> Lorem ipsum dolor +sit amet. +> - Qui *quodsi iracundia* +> - aliquando id +``` + +At the outset, our document model is just + +``` tree +-> document +``` + +The first line of our text, + +``` markdown +> Lorem ipsum dolor +``` + +causes a `block_quote` block to be created as a child of our +open `document` block, and a `paragraph` block as a child of +the `block_quote`. Then the text is added to the last open +block, the `paragraph`: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor" +``` + +The next line, + +``` markdown +sit amet. +``` + +is a "lazy continuation" of the open `paragraph`, so it gets added +to the paragraph's text: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor\nsit amet." 
+``` + +The third line, + +``` markdown +> - Qui *quodsi iracundia* +``` + +causes the `paragraph` block to be closed, and a new `list` block +opened as a child of the `block_quote`. A `list_item` is also +added as a child of the `list`, and a `paragraph` as a child of +the `list_item`. The text is then added to the new `paragraph`: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + -> list_item + -> paragraph + "Qui *quodsi iracundia*" +``` + +The fourth line, + +``` markdown +> - aliquando id +``` + +causes the `list_item` (and its child the `paragraph`) to be closed, +and a new `list_item` opened up as child of the `list`. A `paragraph` +is added as a child of the new `list_item`, to contain the text. +We thus obtain the final tree: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## From block structure to the final document {-} + +Once all of the input has been parsed, all open blocks are closed. + +We then "walk the tree," visiting every node, and parse raw +string contents of paragraphs and headers as inlines. At this +point we have seen all the link reference definitions, so we can +resolve reference links as we go. + +``` tree +document + block_quote + paragraph + str "Lorem ipsum dolor" + softbreak + str "sit amet." + list (type=bullet tight=true bullet_char=-) + list_item + paragraph + str "Qui " + emph + str "quodsi iracundia" + list_item + paragraph + str "aliquando id" +``` + +Notice how the newline in the first paragraph has been parsed as +a `softbreak`, and the asterisks in the first list item have become +an `emph`. + +The document can be rendered as HTML, or in any other format, given +an appropriate renderer. 
+
+
diff --git a/test/src/markdown/MarkdownTestRunner.kt b/test/src/markdown/MarkdownTestRunner.kt
new file mode 100644
index 00000000..bf1d9516
--- /dev/null
+++ b/test/src/markdown/MarkdownTestRunner.kt
@@ -0,0 +1,130 @@
+package org.jetbrains.kmark.test
+
+import org.junit.runner.*
+import org.junit.runner.notification.*
+import java.io.File
+import org.junit.runners.ParentRunner
+import java.io.Serializable
+import kotlin.properties.Delegates
+import org.junit.ComparisonFailure
+
+/**
+ * Serializable value passed as the unique-id argument of
+ * [Description.createSuiteDescription]. Ids come from a shared counter, so
+ * every call to [next] yields a new value.
+ */
+data class MarkdownTestUniqueId(val id: Int) : Serializable {
+    class object {
+        var id = 0
+        fun next() = MarkdownTestUniqueId(id++)
+    }
+}
+
+/**
+ * Describes a specification to run: [path] is the file read by [MarkdownTestRunner],
+ * [processor] converts the input of one example into the text compared with the
+ * expected output.
+ */
+public open class MarkdownSpecification(val path: String, val processor: (String) -> String)
+
+
+/** Common view of test cases and sections: each can produce its JUnit [Description]. */
+trait MarkdownTest {
+    fun description(): Description
+}
+
+/**
+ * A single example: runs [input] through the processor of [spec] and compares the
+ * result with [expected].
+ */
+public open class MarkdownTestCase(val spec: MarkdownSpecification, val input: String, val expected: String) : MarkdownTest, Runner() {
+    // Created lazily; the example input doubles as the display name.
+    val _description by Delegates.lazy {
+        Description.createSuiteDescription(input, MarkdownTestUniqueId.next())!!
+    }
+
+    override fun description(): Description = _description
+
+    override fun getDescription(): Description? = description()
+
+    /**
+     * Executes the example and reports the outcome to [notifier].
+     *
+     * A mismatch is reported as a [ComparisonFailure]; an exception thrown by the
+     * processor is reported as a failure of this example instead of escaping.
+     * `fireTestFinished` is always fired, so every started test is also finished.
+     */
+    override fun run(notifier: RunNotifier?) {
+        notifier!!
+
+        notifier.fireTestStarted(_description)
+        try {
+            val result = spec.processor(input)
+            if (result != expected) {
+                notifier.fireTestFailure(Failure(_description, ComparisonFailure("Output mismatch", expected, result)))
+            }
+        } catch (e: Throwable) {
+            notifier.fireTestFailure(Failure(_description, e))
+        } finally {
+            notifier.fireTestFinished(_description)
+        }
+    }
+}
+
+/**
+ * A titled group of tests (examples and nested sections) that JUnit runs through
+ * [ParentRunner].
+ */
+public open class MarkdownTestSection(val spec: MarkdownSpecification, val title: String) : MarkdownTest, ParentRunner<MarkdownTest>(spec.javaClass) {
+    val children = arrayListOf<MarkdownTest>()
+
+    // Built on first access, so it contains the children registered up to that moment.
+    val _description by Delegates.lazy {
+        val desc = Description.createSuiteDescription(title, MarkdownTestUniqueId.next())!!
+        for (item in getChildren()!!) {
+            desc.addChild(describeChild(item))
+        }
+        desc
+    }
+
+    override fun description(): Description = _description
+
+    override fun getChildren(): MutableList<MarkdownTest>? = children
+
+    override fun describeChild(child: MarkdownTest?): Description? = child!!.description()
+
+    /**
+     * Runs one child. A nested section without children is reported as started and
+     * immediately finished; any other child is run with the same [notifier].
+     */
+    override fun runChild(child: MarkdownTest?, notifier: RunNotifier?) {
+        notifier!!
+        when (child) {
+            is MarkdownTestCase -> child.run(notifier)
+            is MarkdownTestSection -> {
+                if (child.children.size == 0) {
+                    notifier.fireTestStarted(child.description())
+                    notifier.fireTestFinished(child.description())
+                } else {
+                    child.run(notifier)
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Root section named "Tests". Instantiates [specificationClass], reads the file at
+ * its `path` and turns `#`-style headers into nested sections and "."-delimited
+ * blocks into test cases.
+ */
+public class MarkdownTestRunner(specificationClass: Class<MarkdownSpecification>) : MarkdownTestSection(specificationClass.newInstance(), "Tests") {
+    {
+        val lines = File(spec.path).readLines()
+        createSections(this, lines, 1)
+    }
+
+    /**
+     * Creates one test case from [lines], which must start right after the opening
+     * "." line: input lines, a "." line, expected-output lines, a closing "." line.
+     * The case is added to [parent].
+     *
+     * Returns the number of lines consumed, both "." separators included, so that
+     * the caller resumes on the line directly after the closing ".".
+     */
+    private fun createTests(parent: MarkdownTestSection, lines: List<String>): Int {
+        val testMark = lines.takeWhile { it.trim() != "." }
+        val testHtml = lines.drop(testMark.size).drop(1).takeWhile { it.trim() != "." }
+        parent.children.add(MarkdownTestCase(spec, testMark.join("\n", postfix = "\n"), testHtml.join("\n", postfix = "\n")))
+        return testMark.size + testHtml.size + 2
+    }
+
+    /**
+     * Scans [lines] and attaches to [parent] the examples and the sections whose
+     * header has exactly [level] leading '#' characters; deeper sections are handled
+     * recursively. Sections that end up without children are not attached.
+     *
+     * Returns the index in [lines] of the header that ended this level (fewer than
+     * [level] '#' characters), or `lines.size` when the input is exhausted.
+     */
+    private fun createSections(parent: MarkdownTestSection, lines: List<String>, level: Int): Int {
+        var index = 0
+        while (index < lines.size) {
+            val line = lines[index]
+
+            if (line.trim() == ".") {
+                // subList's end index is exclusive, so lines.size keeps the last line.
+                index = createTests(parent, lines.subList(index + 1, lines.size)) + index + 1
+                continue
+            }
+
+            val head = line.takeWhile { it == '#' }.length
+            if (head == 0) {
+                index++
+                continue
+            }
+
+            if (head < level) {
+                return index
+            }
+
+            if (head == level) {
+                val title = line.dropWhile { it == '#' }.dropWhile { it.isWhitespace() }
+                val section = MarkdownTestSection(spec, title)
+                val lastIndex = createSections(section, lines.subList(index + 1, lines.size), level + 1) + index + 1
+                if (section.children.size > 0)
+                    parent.children.add(section)
+                // The nested scan consumed everything: there is no following header to inspect.
+                if (lastIndex >= lines.size) {
+                    return lines.size
+                }
+                val nextHead = lines[lastIndex].takeWhile { it == '#' }.length
+                if (nextHead < level) {
+                    return lastIndex
+                }
+                index = lastIndex
+                continue
+            }
+            // Header deeper than this level without an enclosing section: skip it.
+            index++
+        }
+        return lines.size
+    }
+}
\ No newline at end of file
diff --git a/test/src/markdown/ParserTest.kt b/test/src/markdown/ParserTest.kt
new file mode 100644
index 00000000..b4538b07
--- /dev/null
+++ b/test/src/markdown/ParserTest.kt
@@ -0,0 +1,52 @@
+package org.jetbrains.dokka.tests
+
+import org.junit.Test
+import org.jetbrains.dokka.*
+
+/**
+ * Smoke tests for [MarkdownProcessor]: every case parses a snippet and prints the
+ * dump of the result. Nothing is asserted, so a case fails only when parsing or
+ * dumping throws.
+ */
+public class ParserTest {
+    /** Parses [source] with a fresh [MarkdownProcessor] and prints the dump of the result. */
+    private fun printDump(source: String) {
+        val parsed = MarkdownProcessor().parse(source)
+        println(parsed.dump())
+    }
+
+    /** A single word. */
+    Test fun text() {
+        printDump("text")
+    }
+
+    /** Several words separated by spaces. */
+    Test fun textWithSpaces() {
+        printDump("text and string")
+    }
+
+    /** Words on consecutive lines, with a leading and a trailing line break. */
+    Test fun multiline() {
+        printDump(
+"""
+text
+and
+string
+""")
+    }
+
+    /** Two blocks of text separated by a blank line. */
+    Test fun para() {
+        printDump(
+"""paragraph number
+one
+
+paragraph
+number two
+""")
+    }
+
+    /** Two lines that start with a `*` bullet marker. */
+    Test fun bulletList() {
+        printDump(
+"""
+* list item 1
+* list item 2
+""")
+    }
+
+    /** A word wrapped in asterisks. */
+    Test fun emph() {
+        printDump("*text*")
+    }
+}
+
diff --git a/test/src/markdown/Specification.kt b/test/src/markdown/Specification.kt
new file mode 100644
index 00000000..e0cda024
--- /dev/null
+++ b/test/src/markdown/Specification.kt
@@ -0,0 +1,10 @@
+package org.jetbrains.kmark.test
+
+import org.junit.runner.*
+import org.jetbrains.kmark.test.*
+import org.jetbrains.dokka.*
+
+/**
+ * Runs the examples of `test/data/markdown/spec.txt` through `markdownToHtml`.
+ * Each "→" in an example input is replaced with a tab character before conversion.
+ */
+[RunWith(javaClass<MarkdownTestRunner>())]
+class Specification : MarkdownSpecification("test/data/markdown/spec.txt", { example ->
+    val withTabs = example.replace("→", "\t")
+    markdownToHtml(withTabs)
+})
\ No newline at end of file diff --git a/test/src/model/CommentTest.kt b/test/src/model/CommentTest.kt index 7f56f688..28f717db 100644 --- a/test/src/model/CommentTest.kt +++ b/test/src/model/CommentTest.kt @@ -77,8 +77,8 @@ public class CommentTest { with(model.members.single().members.single()) { assertEquals(NormalStyle, NormalStyle) assertEquals("Summary".toRichString(), doc.summary) - assertEquals(1, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(2, doc.sections.count()) + with (doc.sections["one"]!!) { assertEquals("one", label) assertEquals(RichString.empty, text) } @@ -90,8 +90,8 @@ public class CommentTest { verifyModel("test/data/comments/section1.kt") { model -> with(model.members.single().members.single()) { assertEquals("Summary".toRichString(), doc.summary) - assertEquals(1, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(2, doc.sections.count()) + with (doc.sections["one"]!!) { assertEquals("one", label) assertEquals("section one".toRichString(), text) } @@ -103,12 +103,12 @@ public class CommentTest { verifyModel("test/data/comments/section2.kt") { model -> with(model.members.single().members.single()) { assertEquals("Summary".toRichString(), doc.summary) - assertEquals(2, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(3, doc.sections.count()) + with (doc.sections["one"]!!) { assertEquals("one", label) assertEquals("section one".toRichString(), text) } - with (doc.sections.elementAt(1)) { + with (doc.sections["two"]!!) { assertEquals("two", label) assertEquals("section two".toRichString(), text) } @@ -120,8 +120,8 @@ public class CommentTest { verifyModel("test/data/comments/sectionOnOneLine.kt") { model -> with(model.members.single().members.single()) { assertEquals("Summary".toRichString(), doc.summary) - assertEquals(1, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(2, doc.sections.count()) + with (doc.sections["one"]!!) 
{ assertEquals("one", label) assertEquals("same line".toRichString(), text) } @@ -133,8 +133,8 @@ public class CommentTest { verifyModel("test/data/comments/emptySectionOnOneLine.kt") { model -> with(model.members.single().members.single()) { assertEquals("Summary".toRichString(), doc.summary) - assertEquals(1, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(2, doc.sections.count()) + with (doc.sections["one"]!!) { assertEquals("one", label) assertEquals(RichString.empty, text) } @@ -146,8 +146,8 @@ public class CommentTest { verifyModel("test/data/comments/multilineSection.kt") { model -> with(model.members.single().members.single()) { assertEquals("Summary".toRichString(), doc.summary) - assertEquals(1, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(2, doc.sections.count()) + with (doc.sections["one"]!!) { assertEquals("one", label) assertEquals("""line one line two""".toRichString(), text) @@ -160,8 +160,8 @@ line two""".toRichString(), text) verifyModel("test/data/comments/sectionWithBracedLabel.kt") { model -> with(model.members.single().members.single()) { assertEquals("Summary".toRichString(), doc.summary) - assertEquals(1, doc.sections.count()) - with (doc.sections.elementAt(0)) { + assertEquals(2, doc.sections.count()) + with (doc.sections["this.label.is.really.long"]!!) { assertEquals("this.label.is.really.long", label) assertEquals("section one".toRichString(), text) } |