1 #include "MarkdownTokenizer.h"
// Placeholder character written over text that has already been tokenized
// (or escaped) so that later matching passes do not match it again.
7 static const QChar DUMMY_CHAR(
'$');
// Caps the number of leading characters scanned when counting '#' markers in
// tokenizeAtxHeading().
9 static const int MAX_MARKDOWN_HEADING_LEVEL = 6;
// Constructs the tokenizer with m_state set to TokenState::Unknown and
// configures the QRegExp members used by the tokenize* methods.
// Patterns followed by setMinimal(true) use minimal (non-greedy) matching.
// Other member initializers, if any, are not visible in this excerpt.
11 MarkdownTokenizer::MarkdownTokenizer():
12 m_state(TokenState::Unknown),
// --- Block-level patterns (anchored at the start of the line) ---
// Empty or whitespace-only line.
15 paragraphBreakRegex.setPattern(
"^\\s*$");
// Setext underlines: at least three '=' (level 1) or '-' (level 2), then
// optional trailing whitespace.
16 heading1SetextRegex.setPattern(
"^===+\\s*$");
17 heading2SetextRegex.setPattern(
"^---+\\s*$");
// Up to three leading spaces followed by '>'.
18 blockquoteRegex.setPattern(
"^ {0,3}>.*$");
// Code fences: three or more backticks (GitHub) or tildes (Pandoc). A start
// fence may carry trailing text; an end fence allows only trailing whitespace.
19 githubCodeFenceStartRegex.setPattern(
"^```+.*$");
20 githubCodeFenceEndRegex.setPattern(
"^```+\\s*$");
21 pandocCodeFenceStartRegex.setPattern(
"^~~~+.*$");
22 pandocCodeFenceEndRegex.setPattern(
"^~~~+\\s*$");
// Numbered list items: digits, then '.' or ')', then whitespace. The nested
// variant accepts any amount of leading whitespace instead of at most three
// spaces.
23 numberedListRegex.setPattern(
"^ {0,3}[0-9]+[.)]\\s+.*$");
24 numberedNestedListRegex.setPattern(
"^\\s*[0-9]+[.)]\\s+.*$");
// Horizontal rule: three or more '*', '_' or '-', optionally separated by
// whitespace.
25 hruleRegex.setPattern(
"\\s*(\\*\\s*){3,}|(\\s*(_\\s*){3,})|((\\s*(-\\s*){3,}))");
// --- Inline patterns ---
// Emphasis: text wrapped in single '*' or '_' with no whitespace directly
// inside the markers.
26 emphasisRegex.setPattern(
"(\\*(?![\\s*]).*[^\\s*]\\*)|_(?![\\s_]).*[^\\s_]_");
27 emphasisRegex.setMinimal(
true);
// Strong: text wrapped in '**' or '__'.
28 strongRegex.setPattern(
"\\*\\*(?=\\S).*\\S\\*\\*(?!\\*)|__(?=\\S).*\\S__(?!_)");
29 strongRegex.setMinimal(
true);
// Strikethrough: text wrapped in '~~'.
30 strikethroughRegex.setPattern(
"~~[^\\s]+.*[^\\s]+~~");
31 strikethroughRegex.setMinimal(
true);
// A run of one or more backticks (used to locate verbatim delimiters).
32 verbatimRegex.setPattern(
"`+");
// An HTML tag: '<' ... '>' with no nested angle brackets.
33 htmlTagRegex.setPattern(
"<[^<>]+>");
34 htmlTagRegex.setMinimal(
true);
// Named or numeric HTML entity.
35 htmlEntityRegex.setPattern(
"&[a-zA-Z]+;|&#x?[0-9]+;");
// Automatic links: <scheme:...> or <...@...>.
36 automaticLinkRegex.setPattern(
"(<[a-zA-Z]+\\:.+>)|(<.+@.+>)");
37 automaticLinkRegex.setMinimal(
true);
// Inline link: [text](target).
38 inlineLinkRegex.setPattern(
"\\[.+\\]\\(.+\\)");
39 inlineLinkRegex.setMinimal(
true);
// Reference link: [label].
40 referenceLinkRegex.setPattern(
"\\[(.+)\\]");
41 referenceLinkRegex.setMinimal(
true);
// Start of a reference definition: optional whitespace, [label], ':'.
42 referenceDefinitionRegex.setPattern(
"^\\s*\\[.+\\]:");
// Image: ![alt](target); the alt text may be empty.
43 imageRegex.setPattern(
"!\\[.*\\]\\(.+\\)");
44 imageRegex.setMinimal(
true);
// HTML comment that opens and closes on the same line.
45 htmlInlineCommentRegex.setPattern(
"<!--.*-->");
46 htmlInlineCommentRegex.setMinimal(
true);
// @mention, optionally hyphenated and optionally followed by /name.
47 mentionRegex.setPattern(
"\\B@\\w+(\\-\\w+)*(/\\w+(\\-\\w+)*)?");
// Pipe-table divider row: dash groups (three or more '-') separated by '|',
// with optional ':' or space next to the pipes.
48 pipeTableDividerRegex.setPattern(
"^ {0,3}(\\|[ :]?)?-{3,}([ :]?\\|[ :]?-{3,}([ :]?\\|)?)+\\s*$");
// Destructor. Its body is not visible in this excerpt.
51 MarkdownTokenizer::~MarkdownTokenizer()
// Tokenizes a single line of Markdown.
//
// text     - the line to tokenize.
// current  - state currently associated with this line.
// previous - state of the preceding line; stored in m_previous.
// next     - state of the following line.
//
// Only the assignment of `previous` is visible here; how `current` and `next`
// are stored is not shown (other methods read m_state and m_next — presumably
// they are assigned next to m_previous; confirm).
//
// The line is offered to the block-level tokenizers in a fixed order; the
// order of the else-if chain below is significant because each tokenizer may
// change the state as a side effect.
56 void MarkdownTokenizer::tokenize(
const QString &text, MarkdownTokenizer::TokenState current, MarkdownTokenizer::TokenState previous, MarkdownTokenizer::TokenState next)
59 m_previous = previous;
// Inside a GFM code fence the line is handled by tokenizeCodeBlock() only.
62 if (m_previous == TokenState::GFMCodeFence && tokenizeCodeBlock(text)) {
64 }
// Whitespace-only line that is not inside a multi-line HTML comment.
else if (m_previous != TokenState::Comment && paragraphBreakRegex.exactMatch(text)) {
// Within a list, a blank line is a list line break rather than a paragraph
// break.
66 if (m_previous == TokenState::ListLineBreak ||
67 m_previous == TokenState::NumList ||
68 m_previous == TokenState::BulletList) {
69 setState(TokenState::ListLineBreak);
70 }
// After a code block, a blank line only becomes a paragraph break when it
// neither starts with a tab nor ends with the space literal below.
// NOTE(review): the space literal may have had its whitespace collapsed by
// extraction — confirm the intended indent width.
else if (m_previous != TokenState::CodeBlock ||
71 (!text.startsWith(QChar(
'\t')) && !text.endsWith(
" "))) {
72 setState(TokenState::ParagraphBreak);
74 }
// Block constructs tried first; no statement is visible in this branch.
else if (tokenizeSetextHeadingLine2(text)
75 || tokenizeCodeBlock(text)
76 || tokenizeMultilineComment(text)
77 || tokenizeHorizontalRule(text)
78 || tokenizeTableDivider(text)) {
80 }
// Headings, blockquotes and list items.
else if (tokenizeAtxHeading(text)
81 || tokenizeSetextHeadingLine1(text)
82 || tokenizeBlockquote(text)
83 || tokenizeNumberedList(text)
84 || tokenizeBulletPointList(text)) {
// Fallback handling when the previous line belonged to a list. The branch
// structure between the following lines is only partly visible.
87 if (m_previous == TokenState::ListLineBreak ||
88 m_previous == TokenState::NumList ||
89 m_previous == TokenState::BulletList) {
// The list tokenizers are invoked again here; an indented line that is not
// itself a list item is treated specially (statement not visible).
91 if (!tokenizeNumberedList(text) &&
92 !tokenizeBulletPointList(text) &&
93 (text.startsWith(QChar(
'\t')) || text.startsWith(
" "))) {
96 setState(TokenState::Paragraph);
100 setState(TokenState::Paragraph);
// Inline markup (emphasis, links, ...) is tokenized on the line.
102 tokenizeInline(text);
// If the previous line was the first line of a setext heading but this line
// did not turn out to be the matching underline, request that tokenizing
// start earlier so the previous line is processed again.
105 const bool reprocess1 = (m_previous == TokenState::SetextHead1Line1 && m_state != TokenState::SetextHead1Line2);
106 const bool reprocess2 = (m_previous == TokenState::SetextHead2Line1 && m_state != TokenState::SetextHead2Line2);
107 if (reprocess1 || reprocess2) {
108 setStartEarlier(
true);
// Presumably the else-branch of the check above (the `else` itself is not
// visible) — confirm.
110 setStartEarlier(
false);
// Const accessor returning a TokenPositionMap. The body is not visible in
// this excerpt (presumably it returns m_tokens — confirm).
114 MarkdownTokenizer::TokenPositionMap MarkdownTokenizer::tokens()
const
// Const accessor returning a TokenState. The body is not visible in this
// excerpt (presumably it returns m_state — confirm).
119 MarkdownTokenizer::TokenState MarkdownTokenizer::state()
const
// Returns the m_startEarlier flag, which the tokenize* methods set via
// setStartEarlier() when an earlier line needs to be processed again.
124 bool MarkdownTokenizer::shouldBackTrack()
const
126 return m_startEarlier;
// Resets the tokenizer: clears the m_startEarlier flag and returns m_state to
// TokenState::Unknown. Any further resets (e.g. of the collected tokens) are
// not visible in this excerpt.
129 void MarkdownTokenizer::clear()
132 m_startEarlier =
false;
133 m_state = TokenState::Unknown;
// Finds matches of `regex` in `text` and records each one as a token.
//
// tokenType          - type assigned to every token produced.
// text               - line being tokenized; modified in place when matched
//                      characters are overwritten with DUMMY_CHAR.
// regex              - pattern to search for (non-const because QRegExp
//                      stores match state such as matchedLength()).
// markupStartCount   - when > 0, stored as the token's opening markup length.
// markupEndCount     - when > 0, stored as the token's closing markup length.
// replaceMarkupChars - overwrite only the opening and closing markup
//                      characters of the match.
// replaceAllChars    - presumably selects overwriting the whole match; the
//                      condition guarding that loop is not visible here.
136 void MarkdownTokenizer::tokenizeMatches(MarkdownToken::TokenType tokenType, QString &text, QRegExp &regex,
const int markupStartCount,
const int markupEndCount,
const bool replaceMarkupChars,
const bool replaceAllChars)
// Position of the first match and its length.
138 int index = text.indexOf(regex);
142 int length = regex.matchedLength();
145 token.setType(tokenType);
146 token.setPosition(index);
147 token.setLength(length);
149 if (markupStartCount > 0)
151 token.setOpeningLenth(markupStartCount);
154 if (markupEndCount > 0)
156 token.setClosingLength(markupEndCount);
// Blank out the entire match so later patterns cannot match inside it.
161 for (
int i = index; i < (index + length); i++)
163 text[i] = DUMMY_CHAR;
// Otherwise blank out only the leading and trailing markup characters,
// leaving the inner text available to later patterns.
166 else if (replaceMarkupChars)
168 for (
int i = index; i < (index + markupStartCount); i++)
170 text[i] = DUMMY_CHAR;
173 for (
int i = (index + length - markupEndCount); i < (index + length); i++)
175 text[i] = DUMMY_CHAR;
// Continue searching after the current match.
180 index = text.indexOf(regex, index + length);
186 m_tokens.insertMulti(token.position(), token);
// Sets the tokenizer state for the current line. The body is not visible in
// this excerpt (presumably it assigns m_state — confirm).
189 void MarkdownTokenizer::setState(MarkdownTokenizer::TokenState state)
// Stores the flag reported by shouldBackTrack().
// startEarlier - true when an earlier line has to be tokenized again.
194 void MarkdownTokenizer::setStartEarlier(
const bool startEarlier)
196 m_startEarlier = startEarlier;
// Tries to tokenize `text` as the first (title) line of a setext heading.
// The decision is based on the state of the following line (m_next): if that
// line is a level-1 or level-2 setext underline, this line gets the matching
// SetextHead*Line1 state and token type. The token spans the whole line.
// Return statements are not visible in this excerpt.
199 bool MarkdownTokenizer::tokenizeSetextHeadingLine1(
const QString &text)
// Next line is a level-1 underline.
204 if (m_next == TokenState::SetextHead1Line2) {
206 setState(TokenState::SetextHead1Line1);
207 token.setType(MarkdownToken::SetextHead1Line1);
// Next line is a level-2 underline.
209 else if (m_next == TokenState::SetextHead2Line2) {
211 setState(TokenState::SetextHead2Line1);
212 token.setType(MarkdownToken::SetextHead2Line1);
// The heading token covers the entire line.
217 token.setLength(text.length());
218 token.setPosition(0);
// Tries to tokenize `text` as the underline (second line) of a setext
// heading. Three cases are distinguished by the previous line's state:
// already a level-1 first line, already a level-2 first line, or a plain
// paragraph that this underline would turn into a heading.
// Return statements and some branch boundaries are not visible here.
226 bool MarkdownTokenizer::tokenizeSetextHeadingLine2(
const QString &text)
229 bool isMatch =
false;
// Previous line is a level-1 heading title: expect an '=' underline.
232 if (m_previous == TokenState::SetextHead1Line1) {
234 isMatch = heading1SetextRegex.exactMatch(text);
235 setState(TokenState::SetextHead1Line2);
236 token.setType(MarkdownToken::SetextHead1Line2);
237 }
// Previous line is a level-2 heading title: expect a '-' underline.
else if (m_previous == TokenState::SetextHead2Line1) {
239 isMatch = heading2SetextRegex.exactMatch(text);
240 setState(TokenState::SetextHead2Line2);
241 token.setType(MarkdownToken::SetextHead2Line2);
242 }
// Previous line is a paragraph: an underline here means that paragraph must
// be tokenized again as a heading title, so starting earlier is requested.
else if (m_previous == TokenState::Paragraph) {
243 bool h1Line2 = heading1SetextRegex.exactMatch(text);
244 bool h2Line2 = heading2SetextRegex.exactMatch(text);
246 if (h1Line2 || h2Line2) {
247 setStartEarlier(
true);
248 token.setLength(text.length());
249 token.setPosition(0);
// Level chosen from which pattern matched (the selecting condition is not
// visible here).
252 setState(TokenState::SetextHead1Line2);
253 token.setType(MarkdownToken::SetextHead1Line2);
255 setState(TokenState::SetextHead2Line2);
256 token.setType(MarkdownToken::SetextHead2Line2);
// Token covers the whole underline.
265 token.setLength(text.length());
266 token.setPosition(0);
// Another request to start earlier; its guarding condition is not visible.
270 setStartEarlier(
true);
// Free helper that returns a copy of `text` in which some characters are
// replaced with DUMMY_CHAR. The loop walks every character and tests for a
// backslash; the exact replacement rule is only partly visible (presumably a
// backslash and the character it escapes are blanked out — confirm).
// The returned string has the same length as `text`, because characters are
// overwritten in place rather than removed.
279 QString dummyOutEscapeCharacters(
const QString& text)
282 QString escapedText = text;
284 for (
int i = 0; i < text.length(); i++)
288 escapedText[i] = DUMMY_CHAR;
291 else if (QChar(
'\\') == text[i])
// Tries to tokenize `text` as an ATX heading ("# Title", "## Title", ...).
// Works on a copy with escape sequences dummied out so that escaped '#'
// characters are not counted. On success the token spans the whole line, the
// opening length is the heading level and the closing length is the number of
// trailing '#' characters counted.
// Return statements and the declaration of `level` are not visible here.
302 bool MarkdownTokenizer::tokenizeAtxHeading(
const QString &text)
304 QString escapedText = dummyOutEscapeCharacters(text);
305 int trailingPoundCount = 0;
// Loop condition of the leading-'#' scan: stop at the end of the line or
// after MAX_MARKDOWN_HEADING_LEVEL characters.
315 ((i < escapedText.length()) && (i < MAX_MARKDOWN_HEADING_LEVEL));
319 if (QChar(
'#') == escapedText[i])
// A heading needs at least one '#' and some text after the markers.
330 if ((level > 0) && (level < text.length()))
// Count '#' characters from the end of the line backwards.
333 for (
int i = escapedText.length() - 1; i > level; i--)
335 if (QChar(
'#') == escapedText[i])
337 trailingPoundCount++;
347 token.setPosition(0);
348 token.setLength(text.length());
// Token types and states for heading levels are consecutive enum values, so
// the level is used as an offset from AtxHeading1.
349 token.setType((MarkdownToken::TokenType) (MarkdownToken::AtxHeading1 + level - 1));
350 token.setOpeningLenth(level);
351 token.setClosingLength(trailingPoundCount);
353 setState((TokenState)(TokenState::AtxHeading1 + level - 1));
// Tries to tokenize `text` as a numbered-list item.
//
// Two situations are accepted:
//  - the previous state is ParagraphBreak, Unknown, CodeBlock or
//    CodeFenceEnd AND the line matches numberedListRegex;
//  - the previous state is ListLineBreak, NumList or BulletList AND the line
//    matches numberedNestedListRegex (which allows any leading whitespace).
//
// The state groups are parenthesised explicitly: && binds tighter than ||, so
// without the inner parentheses each regex would guard only the last state of
// its group and every other listed state would satisfy the condition without
// any regex match.
//
// On the visible success path a NumberedList token spanning the whole line is
// filled in with opening length index + 1 and the state becomes NumList.
// Return statements are not visible in this excerpt.
360 bool MarkdownTokenizer::tokenizeNumberedList(
const QString &text)
362 if (((m_previous == TokenState::ParagraphBreak ||
363 m_previous == TokenState::Unknown ||
364 m_previous == TokenState::CodeBlock ||
365 m_previous == TokenState::CodeFenceEnd) &&
366 numberedListRegex.exactMatch(text)) ||
367 ((m_previous == TokenState::ListLineBreak ||
368 m_previous == TokenState::NumList ||
369 m_previous == TokenState::BulletList) &&
370 numberedNestedListRegex.exactMatch(text))) {
// Locate the list delimiter: the first '.' and the first ')'.
372 int periodIndex = text.indexOf(QChar(
'.'));
373 int parenthIndex = text.indexOf(QChar(
')'));
// `index` is chosen between the two positions; some branches of this
// selection are not visible in this excerpt.
378 index = parenthIndex;
380 else if (parenthIndex < 0)
384 else if (parenthIndex > periodIndex)
390 index = parenthIndex;
396 token.setType(MarkdownToken::NumberedList);
397 token.setPosition(0);
398 token.setLength(text.length());
// Opening markup runs from the start of the line through the delimiter.
399 token.setOpeningLenth(index + 1);
401 setState(TokenState::NumList);
// Tries to tokenize `text` as a bullet-list item: a '+', '-' or '*' bullet
// character followed by whitespace. On success a BulletList token spanning
// the whole line is filled in with opening length bulletCharIndex + 1 and the
// state becomes BulletList.
// Large parts of the scanning loop and all return statements are not visible
// in this excerpt; comments below describe only the visible lines.
410 bool MarkdownTokenizer::tokenizeBulletPointList(
const QString &text)
412 bool foundBulletChar =
false;
413 int bulletCharIndex = -1;
415 bool whitespaceFoundAfterBulletChar =
false;
// Check on the previous state: this branch is taken when m_previous is none
// of the states listed (its body is not visible — presumably an early
// rejection; confirm).
417 if (m_previous != TokenState::Unknown &&
418 m_previous != TokenState::ParagraphBreak &&
419 m_previous != TokenState::ListLineBreak &&
420 m_previous != TokenState::NumList &&
421 m_previous != TokenState::BulletList &&
422 m_previous != TokenState::CodeBlock &&
423 m_previous != TokenState::CodeFenceEnd) {
// Scan the line character by character, treating spaces, tabs and bullet
// characters separately.
430 for (
int i = 0; i < text.length(); i++)
// --- Space ---
432 if (QChar(
' ') == text[i])
440 whitespaceFoundAfterBulletChar =
true;
// Fragments of previous-state conditions inside the space branch; the
// surrounding expressions are not fully visible.
455 && (m_previous != TokenState::NumList)
456 && (m_previous != TokenState::BulletList)
457 && (m_previous != TokenState::ListLineBreak)
460 (m_previous == TokenState::ParagraphBreak)
461 || (m_previous == TokenState::Unknown)
462 || (m_previous == TokenState::CodeBlock)
463 || (m_previous == TokenState::CodeFenceEnd)
// --- Tab ---
471 else if (QChar(
'\t') == text[i])
479 whitespaceFoundAfterBulletChar =
true;
484 (m_previous == TokenState::ParagraphBreak)
485 || (m_previous == TokenState::Unknown)
// --- Bullet character ---
497 (QChar(
'+') == text[i])
498 || (QChar(
'-') == text[i])
499 || (QChar(
'*') == text[i])
502 foundBulletChar =
true;
// A list item needs both a bullet character and whitespace after it.
511 if ((bulletCharIndex >= 0) && whitespaceFoundAfterBulletChar)
514 token.setType(MarkdownToken::BulletList);
515 token.setPosition(0);
516 token.setLength(text.length());
// Opening markup runs from the start of the line through the bullet.
517 token.setOpeningLenth(bulletCharIndex + 1);
519 setState(TokenState::BulletList);
// Tries to tokenize `text` as a horizontal rule. When the whole line matches
// hruleRegex, a HorizontalRule token spanning the line is filled in and the
// state becomes HorizontalRule. Return statements are not visible here.
525 bool MarkdownTokenizer::tokenizeHorizontalRule(
const QString &text)
527 if (hruleRegex.exactMatch(text))
530 token.setType(MarkdownToken::HorizontalRule);
531 token.setPosition(0);
532 token.setLength(text.length());
534 setState(TokenState::HorizontalRule);
// Tries to tokenize `text` as a blockquote line. A line qualifies when it
// matches blockquoteRegex or when the previous line was already a blockquote
// (so a quote continues on lines without their own '>').
// The token spans the whole line; its opening length covers the leading run
// of '>' markers and whitespace. Return statements are not visible here.
540 bool MarkdownTokenizer::tokenizeBlockquote(
const QString &text)
542 if (m_previous == TokenState::Blockquote || blockquoteRegex.exactMatch(text)) {
544 int markupLength = 0;
// Scan the leading markup: each '>' extends the markup length up to and
// including itself; the first non-whitespace character other than '>' is
// handled by the else-branch (its body is not visible — presumably it ends
// the scan; confirm).
546 for (
int i = 0; i < text.length(); i++)
548 if (QChar(
'>') == text[i])
550 markupLength = i + 1;
552 else if (!text[i].isSpace())
562 token.setType(MarkdownToken::Blockquote);
563 token.setPosition(0);
564 token.setLength(text.length());
// A continuation line without '>' has no opening markup to record.
566 if (markupLength > 0)
568 token.setOpeningLenth(markupLength);
572 setState(TokenState::Blockquote);
// Tries to tokenize `text` as part of a code block. Three cases are handled:
//  1. the previous line is inside a GFM code fence: this line is either the
//     closing fence or more fenced code;
//  2. an indented code block: the previous state is CodeBlock, ParagraphBreak
//     or Unknown and the line starts with a tab or the space literal;
//  3. the opening line of a GFM code fence.
// Every token produced spans the whole line. Return statements and some
// branch boundaries are not visible in this excerpt.
578 bool MarkdownTokenizer::tokenizeCodeBlock(
const QString &text)
// Case 1: inside a fence. The state is carried over first and then changed
// to CodeFenceEnd if this line closes the fence.
580 if (m_previous == TokenState::GFMCodeFence) {
581 setState(m_previous);
583 if (m_previous == TokenState::GFMCodeFence && githubCodeFenceEndRegex.exactMatch(text)) {
585 token.setType(MarkdownToken::CodeFenceEnd);
586 token.setPosition(0);
587 token.setLength(text.length());
589 setState(TokenState::CodeFenceEnd);
// Otherwise the line is fenced code.
594 token.setType(MarkdownToken::CodeBlock);
595 token.setPosition(0);
596 token.setLength(text.length());
// Case 2: indented code block. The operator joining the state group and the
// indentation test is on a line that is not visible (presumably && — confirm).
// NOTE(review): the space literal may have had its whitespace collapsed by
// extraction — confirm the intended indent width.
602 else if ((m_previous == TokenState::CodeBlock ||
603 m_previous == TokenState::ParagraphBreak ||
604 m_previous == TokenState::Unknown)
606 (text.startsWith(QChar(
'\t')) || text.startsWith(
" "))) {
609 token.setType(MarkdownToken::CodeBlock);
610 token.setPosition(0);
611 token.setLength(text.length());
613 setState(TokenState::CodeBlock);
// Case 3: a fence may open after a paragraph break, a paragraph, at Unknown,
// or after a list line break.
616 else if (m_previous == TokenState::ParagraphBreak ||
617 m_previous == TokenState::Paragraph ||
618 m_previous == TokenState::Unknown ||
619 m_previous == TokenState::ListLineBreak) {
620 bool foundCodeFenceStart =
false;
623 if (githubCodeFenceStartRegex.exactMatch(text))
625 foundCodeFenceStart =
true;
626 token.setType(MarkdownToken::GFMCodeFence);
627 setState(TokenState::GFMCodeFence);
630 if (foundCodeFenceStart)
632 token.setPosition(0);
633 token.setLength(text.length());
// Handles a line that continues a multi-line HTML comment opened on an
// earlier line (previous state is Comment). The HtmlComment token starts at
// column 0 and ends either just after the closing "-->" or, when the comment
// is still open, at the end of the line — in which case the state stays
// Comment. The conditions separating the two lengths and the return
// statements are not visible in this excerpt.
641 bool MarkdownTokenizer::tokenizeMultilineComment(
const QString &text)
643 if (m_previous == TokenState::Comment) {
// Position of the comment terminator, or -1 if it is not on this line.
645 int index = text.indexOf(
"-->");
647 token.setType(MarkdownToken::HtmlComment);
648 token.setPosition(0);
// Comment closes on this line: include the three characters of "-->".
652 token.setLength(index + 3);
// Comment continues past this line.
663 token.setLength(text.length());
665 setState(TokenState::Comment);
// Tokenizes inline markup on one line. Works on a copy of the line with
// escape sequences dummied out; each pass below may overwrite what it matched
// with DUMMY_CHAR, so the ORDER of the passes matters: earlier passes hide
// their matches from later ones.
// The return statement is not visible in this excerpt.
672 bool MarkdownTokenizer::tokenizeInline(
const QString &text)
674 QString escapedText = dummyOutEscapeCharacters(text);
// Reference definition ("[label]: ..."): the token covers the text up to and
// including the first ':'.
// NOTE(review): QRegExp::exactMatch() requires the ENTIRE string to match,
// while referenceDefinitionRegex ends at ':' — confirm that definitions
// followed by a URL are actually recognized here.
677 if (referenceDefinitionRegex.exactMatch(escapedText))
679 int colonIndex = escapedText.indexOf(
':');
681 token.setType(MarkdownToken::ReferenceDefinition);
682 token.setPosition(0);
683 token.setLength(colonIndex + 1);
// Blank out the first '[' so the "[label]" part cannot match again as a
// reference link further down.
689 int firstBracketIndex = escapedText.indexOf(QChar(
'['));
691 if (firstBracketIndex >= 0)
693 escapedText[firstBracketIndex] = DUMMY_CHAR;
// Passes with dedicated handlers.
697 tokenizeVerbatim(escapedText);
698 tokenizeHtmlComments(escapedText);
699 tokenizeTableHeaderRow(escapedText);
700 tokenizeTableRow(escapedText);
// Regex-driven passes. Trailing arguments are: opening markup length,
// closing markup length, replaceMarkupChars, replaceAllChars; omitted
// arguments use defaults declared outside this excerpt.
701 tokenizeMatches(MarkdownToken::Image, escapedText, imageRegex, 0, 0,
false,
true);
702 tokenizeMatches(MarkdownToken::InlineLink, escapedText, inlineLinkRegex, 0, 0,
false,
true);
703 tokenizeMatches(MarkdownToken::ReferenceLink, escapedText, referenceLinkRegex, 0, 0,
false,
true);
704 tokenizeMatches(MarkdownToken::HtmlEntity, escapedText, htmlEntityRegex);
705 tokenizeMatches(MarkdownToken::AutomaticLink, escapedText, automaticLinkRegex, 0, 0,
false,
true);
// Strikethrough, strong and emphasis carry 2/2, 2/2 and 1/1 markup
// characters respectively.
706 tokenizeMatches(MarkdownToken::Strikethrough, escapedText, strikethroughRegex, 2, 2);
707 tokenizeMatches(MarkdownToken::Strong, escapedText, strongRegex, 2, 2,
true);
708 tokenizeMatches(MarkdownToken::Emphasis, escapedText, emphasisRegex, 1, 1,
true);
709 tokenizeMatches(MarkdownToken::HtmlTag, escapedText, htmlTagRegex);
710 tokenizeMatches(MarkdownToken::Mention, escapedText, mentionRegex, 0, 0,
false,
true);
// Tokenizes verbatim (inline code) spans delimited by runs of backticks.
// For each opening run found by verbatimRegex, a closing delimiter `end` is
// searched for after it; the token covers the opening run through the closing
// run, with equal opening and closing lengths. Matched text is overwritten
// with DUMMY_CHAR in `text` so later inline passes ignore it.
// Loop headers, the construction of `end` and the handling of a missing
// closing delimiter are not visible in this excerpt.
714 void MarkdownTokenizer::tokenizeVerbatim(QString &text)
716 int index = verbatimRegex.indexIn(text);
// Number of backticks in the opening run.
721 int count = verbatimRegex.matchedLength();
// Loop over `count` iterations; its body is not visible (presumably it
// builds `end` as a string of `count` backticks — confirm).
726 for (
int i = 0; i < count; i++)
731 int endIndex = text.indexOf(end, index + count);
738 token.setType(MarkdownToken::Verbatim);
739 token.setPosition(index);
// From the first opening backtick through the last closing backtick.
740 token.setLength(endIndex + count - index);
741 token.setOpeningLenth(count);
742 token.setClosingLength(count);
749 for (
int i = index; i < (index + token.length()); i++)
751 text[i] = DUMMY_CHAR;
// Skip past the span just tokenized.
754 index += token.length();
// Look for the next run of backticks.
762 index = verbatimRegex.indexIn(text, index);
// Tokenizes HTML comments on one line and overwrites them with DUMMY_CHAR in
// `text` so later inline passes ignore them. Three steps are visible:
//  1. if the previous line left a comment open, blank out everything up to
//     and including the closing "-->";
//  2. tokenize every comment that opens and closes on this line;
//  3. if a "<!--" remains, tokenize from there to the end of the line and set
//     the state to Comment so the next line continues it.
766 void MarkdownTokenizer::tokenizeHtmlComments(QString &text)
// Step 1: tail of a comment carried over from the previous line.
// NOTE(review): no guard for commentEnd == -1 is visible; with -1 the loop
// would still overwrite the first two characters — confirm that callers only
// reach this path when "-->" is present.
772 if (m_previous == TokenState::Comment) {
773 int commentEnd = text.indexOf(
"-->");
775 for (
int i = 0; i < commentEnd + 3; i++) {
776 text[i] = DUMMY_CHAR;
// Step 2: comments fully contained in this line.
781 int commentStart = text.indexOf(htmlInlineCommentRegex);
783 while (commentStart >= 0)
785 int commentLength = htmlInlineCommentRegex.matchedLength();
788 token.setType(MarkdownToken::HtmlComment);
789 token.setPosition(commentStart);
790 token.setLength(commentLength);
796 for (
int i = commentStart; i < (commentStart + commentLength); i++) {
797 text[i] = DUMMY_CHAR;
800 commentStart = text.indexOf(htmlInlineCommentRegex, commentStart + commentLength);
// Step 3: a comment that opens here and is not closed on this line.
804 commentStart = text.indexOf(
"<!--");
806 if (commentStart >= 0)
810 token.setType(MarkdownToken::HtmlComment);
811 token.setPosition(commentStart);
812 token.setLength(text.length() - commentStart);
814 setState(TokenState::Comment);
819 for (
int i = commentStart; i < text.length(); i++)
821 text[i] = DUMMY_CHAR;
// Tokenizes a pipe-table header row. The visible condition involves three
// groups: the previous state is one of the listed block boundaries, the
// current state is Paragraph or Unknown, and the NEXT line is a table divider
// (m_next == TableDiv). The operators joining the groups are on lines not
// visible here (presumably && — confirm).
// When it applies, the state becomes TableHeader; each '|' yields a TablePipe
// token and the text between pipes yields TableHeader tokens.
826 void MarkdownTokenizer::tokenizeTableHeaderRow(QString &text)
828 if ((m_previous == TokenState::ParagraphBreak ||
829 m_previous == TokenState::ListLineBreak ||
830 m_previous == TokenState::SetextHead1Line2 ||
831 m_previous == TokenState::SetextHead2Line2 ||
832 m_previous == TokenState::AtxHeading1 ||
833 m_previous == TokenState::AtxHeading2 ||
834 m_previous == TokenState::AtxHeading3 ||
835 m_previous == TokenState::AtxHeading4 ||
836 m_previous == TokenState::AtxHeading5 ||
837 m_previous == TokenState::AtxHeading6 ||
838 m_previous == TokenState::HorizontalRule ||
839 m_previous == TokenState::CodeFenceEnd ||
840 m_previous == TokenState::Unknown)
842 (m_state == TokenState::Paragraph ||
843 m_state == TokenState::Unknown)
845 (m_next == TokenState::TableDiv)) {
847 setState(TokenState::TableHeader);
// Walk the row and split it at every pipe character.
851 for (
int i = 0; i < text.length(); i++)
853 if (QChar(
'|') == text[i])
// Header cell ending just before this pipe.
867 token.setType(MarkdownToken::TableHeader);
868 token.setPosition(headerStart);
869 token.setLength(i - headerStart);
// The pipe itself.
873 token.setType(MarkdownToken::TablePipe);
874 token.setPosition(i);
// Trailing header cell after the last pipe, if any text remains.
881 if (headerStart < text.length())
884 token.setType(MarkdownToken::TableHeader);
885 token.setPosition(headerStart);
886 token.setLength(text.length() - headerStart);
// Tries to tokenize `text` as a pipe-table divider row (e.g. "|---|---|").
// Two situations are visible:
//  - the previous line is already a table header: a matching divider becomes
//    a TableDiv token spanning the line;
//  - the previous line is a plain paragraph: a matching divider means that
//    paragraph is really a header row, so starting earlier is requested and
//    the divider is tokenized.
// Return statements and some branch boundaries are not visible here.
892 bool MarkdownTokenizer::tokenizeTableDivider(
const QString &text)
894 if (m_previous == TokenState::TableHeader)
896 if (pipeTableDividerRegex.exactMatch(text))
898 setState(TokenState::TableDiv);
901 token.setType(MarkdownToken::TableDiv);
902 token.setLength(text.length());
903 token.setPosition(0);
// Request to start earlier in a branch whose condition is not visible
// (presumably when the line after a header is NOT a divider — confirm).
911 setStartEarlier(
true);
914 else if (m_previous == TokenState::Paragraph)
916 if (pipeTableDividerRegex.exactMatch(text))
// The preceding paragraph must be tokenized again as a table header.
919 setStartEarlier(
true);
921 setState(TokenState::TableDiv);
924 token.setLength(text.length());
925 token.setPosition(0);
926 token.setType(MarkdownToken::TableDiv);
// Tokenizes a pipe-table body row: applies when the previous line is a table
// divider or another table row. The state becomes TableRow and each '|' in
// the line yields a TablePipe token at its position.
// The remainder of this function lies outside the visible excerpt.
934 void MarkdownTokenizer::tokenizeTableRow(QString &text)
936 if (m_previous == TokenState::TableDiv || m_previous == TokenState::TableRow) {
938 setState(TokenState::TableRow);
940 for (
int i = 0; i < text.length(); i++)
942 if (QChar(
'|') == text[i])
953 token.setType(MarkdownToken::TablePipe);
954 token.setPosition(i);