Dekko
MarkdownTokenizer.cpp
1 #include "MarkdownTokenizer.h"
2 
3 // This character is used to replace escape characters and other characters
4 // with special meaning in a dummy copy of the current line being parsed,
5 // for ease of parsing.
6 //
7 static const QChar DUMMY_CHAR('$');
8 
9 static const int MAX_MARKDOWN_HEADING_LEVEL = 6;
10 
11 MarkdownTokenizer::MarkdownTokenizer():
12  m_state(TokenState::Unknown),
13  m_startEarlier(false)
14 {
15  paragraphBreakRegex.setPattern("^\\s*$");
16  heading1SetextRegex.setPattern("^===+\\s*$");
17  heading2SetextRegex.setPattern("^---+\\s*$");
18  blockquoteRegex.setPattern("^ {0,3}>.*$");
19  githubCodeFenceStartRegex.setPattern("^```+.*$");
20  githubCodeFenceEndRegex.setPattern("^```+\\s*$");
21  pandocCodeFenceStartRegex.setPattern("^~~~+.*$");
22  pandocCodeFenceEndRegex.setPattern("^~~~+\\s*$");
23  numberedListRegex.setPattern("^ {0,3}[0-9]+[.)]\\s+.*$");
24  numberedNestedListRegex.setPattern("^\\s*[0-9]+[.)]\\s+.*$");
25  hruleRegex.setPattern("\\s*(\\*\\s*){3,}|(\\s*(_\\s*){3,})|((\\s*(-\\s*){3,}))");
26  emphasisRegex.setPattern("(\\*(?![\\s*]).*[^\\s*]\\*)|_(?![\\s_]).*[^\\s_]_");
27  emphasisRegex.setMinimal(true);
28  strongRegex.setPattern("\\*\\*(?=\\S).*\\S\\*\\*(?!\\*)|__(?=\\S).*\\S__(?!_)");
29  strongRegex.setMinimal(true);
30  strikethroughRegex.setPattern("~~[^\\s]+.*[^\\s]+~~");
31  strikethroughRegex.setMinimal(true);
32  verbatimRegex.setPattern("`+");
33  htmlTagRegex.setPattern("<[^<>]+>");
34  htmlTagRegex.setMinimal(true);
35  htmlEntityRegex.setPattern("&[a-zA-Z]+;|&#x?[0-9]+;");
36  automaticLinkRegex.setPattern("(<[a-zA-Z]+\\:.+>)|(<.+@.+>)");
37  automaticLinkRegex.setMinimal(true);
38  inlineLinkRegex.setPattern("\\[.+\\]\\(.+\\)");
39  inlineLinkRegex.setMinimal(true);
40  referenceLinkRegex.setPattern("\\[(.+)\\]");
41  referenceLinkRegex.setMinimal(true);
42  referenceDefinitionRegex.setPattern("^\\s*\\[.+\\]:");
43  imageRegex.setPattern("!\\[.*\\]\\(.+\\)");
44  imageRegex.setMinimal(true);
45  htmlInlineCommentRegex.setPattern("<!--.*-->");
46  htmlInlineCommentRegex.setMinimal(true);
47  mentionRegex.setPattern("\\B@\\w+(\\-\\w+)*(/\\w+(\\-\\w+)*)?");
48  pipeTableDividerRegex.setPattern("^ {0,3}(\\|[ :]?)?-{3,}([ :]?\\|[ :]?-{3,}([ :]?\\|)?)+\\s*$");
49 }
50 
51 MarkdownTokenizer::~MarkdownTokenizer()
52 {
53 
54 }
55 
56 void MarkdownTokenizer::tokenize(const QString &text, MarkdownTokenizer::TokenState current, MarkdownTokenizer::TokenState previous, MarkdownTokenizer::TokenState next)
57 {
58  m_current = current;
59  m_previous = previous;
60  m_next = next;
61 
62  if (m_previous == TokenState::GFMCodeFence && tokenizeCodeBlock(text)) {
63  // Nothing to do.
64  } else if (m_previous != TokenState::Comment && paragraphBreakRegex.exactMatch(text)) {
65  // Let's look for a list first
66  if (m_previous == TokenState::ListLineBreak ||
67  m_previous == TokenState::NumList ||
68  m_previous == TokenState::BulletList) {
69  setState(TokenState::ListLineBreak);
70  } else if (m_previous != TokenState::CodeBlock ||
71  (!text.startsWith(QChar('\t')) && !text.endsWith(" "))) {
72  setState(TokenState::ParagraphBreak);
73  }
74  } else if (tokenizeSetextHeadingLine2(text)
75  || tokenizeCodeBlock(text)
76  || tokenizeMultilineComment(text)
77  || tokenizeHorizontalRule(text)
78  || tokenizeTableDivider(text)) {
79  // DO NOTHING
80  } else if (tokenizeAtxHeading(text)
81  || tokenizeSetextHeadingLine1(text)
82  || tokenizeBlockquote(text)
83  || tokenizeNumberedList(text)
84  || tokenizeBulletPointList(text)) {
85  tokenizeInline(text);
86  } else {
87  if (m_previous == TokenState::ListLineBreak ||
88  m_previous == TokenState::NumList ||
89  m_previous == TokenState::BulletList) {
90 
91  if (!tokenizeNumberedList(text) &&
92  !tokenizeBulletPointList(text) &&
93  (text.startsWith(QChar('\t')) || text.startsWith(" "))) {
94  setState(m_previous);
95  } else {
96  setState(TokenState::Paragraph);
97  }
98 
99  } else {
100  setState(TokenState::Paragraph);
101  }
102  tokenizeInline(text);
103  }
104 
105  const bool reprocess1 = (m_previous == TokenState::SetextHead1Line1 && m_state != TokenState::SetextHead1Line2);
106  const bool reprocess2 = (m_previous == TokenState::SetextHead2Line1 && m_state != TokenState::SetextHead2Line2);
107  if (reprocess1 || reprocess2) {
108  setStartEarlier(true);
109  } else {
110  setStartEarlier(false);
111  }
112 }
113 
114 MarkdownTokenizer::TokenPositionMap MarkdownTokenizer::tokens() const
115 {
116  return m_tokens;
117 }
118 
119 MarkdownTokenizer::TokenState MarkdownTokenizer::state() const
120 {
121  return m_state;
122 }
123 
124 bool MarkdownTokenizer::shouldBackTrack() const
125 {
126  return m_startEarlier;
127 }
128 
129 void MarkdownTokenizer::clear()
130 {
131  m_tokens.clear();
132  m_startEarlier = false;
133  m_state = TokenState::Unknown;
134 }
135 
136 void MarkdownTokenizer::tokenizeMatches(MarkdownToken::TokenType tokenType, QString &text, QRegExp &regex, const int markupStartCount, const int markupEndCount, const bool replaceMarkupChars, const bool replaceAllChars)
137 {
138  int index = text.indexOf(regex);
139 
140  while (index >= 0)
141  {
142  int length = regex.matchedLength();
143  MarkdownToken token;
144 
145  token.setType(tokenType);
146  token.setPosition(index);
147  token.setLength(length);
148 
149  if (markupStartCount > 0)
150  {
151  token.setOpeningLenth(markupStartCount);
152  }
153 
154  if (markupEndCount > 0)
155  {
156  token.setClosingLength(markupEndCount);
157  }
158 
159  if (replaceAllChars)
160  {
161  for (int i = index; i < (index + length); i++)
162  {
163  text[i] = DUMMY_CHAR;
164  }
165  }
166  else if (replaceMarkupChars)
167  {
168  for (int i = index; i < (index + markupStartCount); i++)
169  {
170  text[i] = DUMMY_CHAR;
171  }
172 
173  for (int i = (index + length - markupEndCount); i < (index + length); i++)
174  {
175  text[i] = DUMMY_CHAR;
176  }
177  }
178 
179  addToken(token);
180  index = text.indexOf(regex, index + length);
181  }
182 }
183 
184 void MarkdownTokenizer::addToken(const MarkdownToken &token)
185 {
186  m_tokens.insertMulti(token.position(), token);
187 }
188 
189 void MarkdownTokenizer::setState(MarkdownTokenizer::TokenState state)
190 {
191  m_state = state;
192 }
193 
194 void MarkdownTokenizer::setStartEarlier(const bool startEarlier)
195 {
196  m_startEarlier = startEarlier;
197 }
198 
199 bool MarkdownTokenizer::tokenizeSetextHeadingLine1(const QString &text)
200 {
201  int level = 0;
202  MarkdownToken token;
203 
204  if (m_next == TokenState::SetextHead1Line2) {
205  level = 1;
206  setState(TokenState::SetextHead1Line1);
207  token.setType(MarkdownToken::SetextHead1Line1);
208  }
209  else if (m_next == TokenState::SetextHead2Line2) {
210  level = 2;
211  setState(TokenState::SetextHead2Line1);
212  token.setType(MarkdownToken::SetextHead2Line1);
213  }
214 
215  if (level > 0)
216  {
217  token.setLength(text.length());
218  token.setPosition(0);
219  addToken(token);
220  return true;
221  }
222 
223  return false;
224 }
225 
226 bool MarkdownTokenizer::tokenizeSetextHeadingLine2(const QString &text)
227 {
228  int level = 0;
229  bool isMatch = false;
230  MarkdownToken token;
231 
232  if (m_previous == TokenState::SetextHead1Line1) {
233  level = 1;
234  isMatch = heading1SetextRegex.exactMatch(text);
235  setState(TokenState::SetextHead1Line2);
236  token.setType(MarkdownToken::SetextHead1Line2);
237  } else if (m_previous == TokenState::SetextHead2Line1) {
238  level = 2;
239  isMatch = heading2SetextRegex.exactMatch(text);
240  setState(TokenState::SetextHead2Line2);
241  token.setType(MarkdownToken::SetextHead2Line2);
242  } else if (m_previous == TokenState::Paragraph) {
243  bool h1Line2 = heading1SetextRegex.exactMatch(text);
244  bool h2Line2 = heading2SetextRegex.exactMatch(text);
245 
246  if (h1Line2 || h2Line2) {
247  setStartEarlier(true);
248  token.setLength(text.length());
249  token.setPosition(0);
250 
251  if (h1Line2) {
252  setState(TokenState::SetextHead1Line2);
253  token.setType(MarkdownToken::SetextHead1Line2);
254  } else {
255  setState(TokenState::SetextHead2Line2);
256  token.setType(MarkdownToken::SetextHead2Line2);
257  }
258  addToken(token);
259  return true;
260  }
261  }
262 
263  if (level > 0) {
264  if (isMatch) {
265  token.setLength(text.length());
266  token.setPosition(0);
267  addToken(token);
268  return true;
269  } else {
270  setStartEarlier(true);
271  return false;
272  }
273  }
274  return false;
275 }
276 
277 namespace {
278 
279 QString dummyOutEscapeCharacters(const QString& text)
280 {
281  bool escape = false;
282  QString escapedText = text;
283 
284  for (int i = 0; i < text.length(); i++)
285  {
286  if (escape)
287  {
288  escapedText[i] = DUMMY_CHAR; // Use a dummy character.
289  escape = false;
290  }
291  else if (QChar('\\') == text[i])
292  {
293  escape = true;
294  }
295  }
296 
297  return escapedText;
298 }
299 
300 }
301 
302 bool MarkdownTokenizer::tokenizeAtxHeading(const QString &text)
303 {
304  QString escapedText = dummyOutEscapeCharacters(text);
305  int trailingPoundCount = 0;
306 
307  int level = 0;
308 
309  // Count the number of pound signs at the front of the string,
310  // up to the maximum allowed, to determine the heading level.
311  //
312  for
313  (
314  int i = 0;
315  ((i < escapedText.length()) && (i < MAX_MARKDOWN_HEADING_LEVEL));
316  i++
317  )
318  {
319  if (QChar('#') == escapedText[i])
320  {
321  level++;
322  }
323  else
324  {
325  // We're done counting, as no more pound signs are available.
326  break;
327  }
328  }
329 
330  if ((level > 0) && (level < text.length()))
331  {
332  // Count how many pound signs are at the end of the text.
333  for (int i = escapedText.length() - 1; i > level; i--)
334  {
335  if (QChar('#') == escapedText[i])
336  {
337  trailingPoundCount++;
338  }
339  else
340  {
341  // We're done counting, as no more pound signs are available.
342  break;
343  }
344  }
345 
346  MarkdownToken token;
347  token.setPosition(0);
348  token.setLength(text.length());
349  token.setType((MarkdownToken::TokenType) (MarkdownToken::AtxHeading1 + level - 1));
350  token.setOpeningLenth(level);
351  token.setClosingLength(trailingPoundCount);
352  addToken(token);
353  setState((TokenState)(TokenState::AtxHeading1 + level - 1));
354  return true;
355  }
356 
357  return false;
358 }
359 
360 bool MarkdownTokenizer::tokenizeNumberedList(const QString &text)
361 {
362  if ((m_previous == TokenState::ParagraphBreak ||
363  m_previous == TokenState::Unknown ||
364  m_previous == TokenState::CodeBlock ||
365  m_previous == TokenState::CodeFenceEnd &&
366  numberedListRegex.exactMatch(text)) ||
367  (m_previous == TokenState::ListLineBreak ||
368  m_previous == TokenState::NumList ||
369  m_previous == TokenState::BulletList &&
370  numberedNestedListRegex.exactMatch(text))) {
371 
372  int periodIndex = text.indexOf(QChar('.'));
373  int parenthIndex = text.indexOf(QChar(')'));
374  int index = -1;
375 
376  if (periodIndex < 0)
377  {
378  index = parenthIndex;
379  }
380  else if (parenthIndex < 0)
381  {
382  index = periodIndex;
383  }
384  else if (parenthIndex > periodIndex)
385  {
386  index = periodIndex;
387  }
388  else
389  {
390  index = parenthIndex;
391  }
392 
393  if (index >= 0)
394  {
395  MarkdownToken token;
396  token.setType(MarkdownToken::NumberedList);
397  token.setPosition(0);
398  token.setLength(text.length());
399  token.setOpeningLenth(index + 1);
400  addToken(token);
401  setState(TokenState::NumList);
402  return true;
403  }
404 
405  return false;
406  }
407  return false;
408 }
409 
410 bool MarkdownTokenizer::tokenizeBulletPointList(const QString &text)
411 {
412  bool foundBulletChar = false;
413  int bulletCharIndex = -1;
414  int spaceCount = 0;
415  bool whitespaceFoundAfterBulletChar = false;
416 
417  if (m_previous != TokenState::Unknown &&
418  m_previous != TokenState::ParagraphBreak &&
419  m_previous != TokenState::ListLineBreak &&
420  m_previous != TokenState::NumList &&
421  m_previous != TokenState::BulletList &&
422  m_previous != TokenState::CodeBlock &&
423  m_previous != TokenState::CodeFenceEnd) {
424  return false;
425  }
426 
427  // Search for the bullet point character, which can
428  // be either a '+', '-', or '*'.
429  //
430  for (int i = 0; i < text.length(); i++)
431  {
432  if (QChar(' ') == text[i])
433  {
434  if (foundBulletChar)
435  {
436  // We've confirmed it's a bullet point by the whitespace that
437  // follows the bullet point character, and can now exit the
438  // loop.
439  //
440  whitespaceFoundAfterBulletChar = true;
441  break;
442  }
443  else
444  {
445  spaceCount++;
446 
447  // If this list item is the first in the list, ensure the
448  // number of spaces preceeding the bullet point does not
449  // exceed three, as that would indicate a code block rather
450  // than a bullet point list.
451  //
452  if
453  (
454  (spaceCount > 3)
455  && (m_previous != TokenState::NumList)
456  && (m_previous != TokenState::BulletList)
457  && (m_previous != TokenState::ListLineBreak)
458  &&
459  (
460  (m_previous == TokenState::ParagraphBreak)
461  || (m_previous == TokenState::Unknown)
462  || (m_previous == TokenState::CodeBlock)
463  || (m_previous == TokenState::CodeFenceEnd)
464  )
465  )
466  {
467  return false;
468  }
469  }
470  }
471  else if (QChar('\t') == text[i])
472  {
473  if (foundBulletChar)
474  {
475  // We've confirmed it's a bullet point by the whitespace that
476  // follows the bullet point character, and can now exit the
477  // loop.
478  //
479  whitespaceFoundAfterBulletChar = true;
480  break;
481  }
482  else if
483  (
484  (m_previous == TokenState::ParagraphBreak)
485  || (m_previous == TokenState::Unknown)
486  )
487  {
488  // If this list item is the first in the list, ensure that
489  // no tab character preceedes the bullet point, as that would
490  // indicate a code block rather than a bullet point list.
491  //
492  return false;
493  }
494  }
495  else if
496  (
497  (QChar('+') == text[i])
498  || (QChar('-') == text[i])
499  || (QChar('*') == text[i])
500  )
501  {
502  foundBulletChar = true;
503  bulletCharIndex = i;
504  }
505  else
506  {
507  return false;
508  }
509  }
510 
511  if ((bulletCharIndex >= 0) && whitespaceFoundAfterBulletChar)
512  {
513  MarkdownToken token;
514  token.setType(MarkdownToken::BulletList);
515  token.setPosition(0);
516  token.setLength(text.length());
517  token.setOpeningLenth(bulletCharIndex + 1);
518  addToken(token);
519  setState(TokenState::BulletList);
520  return true;
521  }
522  return false;
523 }
524 
525 bool MarkdownTokenizer::tokenizeHorizontalRule(const QString &text)
526 {
527  if (hruleRegex.exactMatch(text))
528  {
529  MarkdownToken token;
530  token.setType(MarkdownToken::HorizontalRule);
531  token.setPosition(0);
532  token.setLength(text.length());
533  addToken(token);
534  setState(TokenState::HorizontalRule);
535  return true;
536  }
537  return false;
538 }
539 
540 bool MarkdownTokenizer::tokenizeBlockquote(const QString &text)
541 {
542  if (m_previous == TokenState::Blockquote || blockquoteRegex.exactMatch(text)) {
543  // Find any '>' characters at the front of the line.
544  int markupLength = 0;
545 
546  for (int i = 0; i < text.length(); i++)
547  {
548  if (QChar('>') == text[i])
549  {
550  markupLength = i + 1;
551  }
552  else if (!text[i].isSpace())
553  {
554  // There are no more '>' characters at the front of the line,
555  // so stop processing.
556  //
557  break;
558  }
559  }
560 
561  MarkdownToken token;
562  token.setType(MarkdownToken::Blockquote);
563  token.setPosition(0);
564  token.setLength(text.length());
565 
566  if (markupLength > 0)
567  {
568  token.setOpeningLenth(markupLength);
569  }
570 
571  addToken(token);
572  setState(TokenState::Blockquote);
573  return true;
574  }
575  return false;
576 }
577 
578 bool MarkdownTokenizer::tokenizeCodeBlock(const QString &text)
579 {
580  if (m_previous == TokenState::GFMCodeFence) {
581  setState(m_previous);
582 
583  if (m_previous == TokenState::GFMCodeFence && githubCodeFenceEndRegex.exactMatch(text)) {
584  MarkdownToken token;
585  token.setType(MarkdownToken::CodeFenceEnd);
586  token.setPosition(0);
587  token.setLength(text.length());
588  addToken(token);
589  setState(TokenState::CodeFenceEnd);
590  }
591  else
592  {
593  MarkdownToken token;
594  token.setType(MarkdownToken::CodeBlock);
595  token.setPosition(0);
596  token.setLength(text.length());
597  addToken(token);
598  }
599 
600  return true;
601  }
602  else if ((m_previous == TokenState::CodeBlock ||
603  m_previous == TokenState::ParagraphBreak ||
604  m_previous == TokenState::Unknown)
605  &&
606  (text.startsWith(QChar('\t')) || text.startsWith(" "))) {
607 
608  MarkdownToken token;
609  token.setType(MarkdownToken::CodeBlock);
610  token.setPosition(0);
611  token.setLength(text.length());
612  addToken(token);
613  setState(TokenState::CodeBlock);
614  return true;
615  }
616  else if (m_previous == TokenState::ParagraphBreak ||
617  m_previous == TokenState::Paragraph ||
618  m_previous == TokenState::Unknown ||
619  m_previous == TokenState::ListLineBreak) {
620  bool foundCodeFenceStart = false;
621  MarkdownToken token;
622 
623  if (githubCodeFenceStartRegex.exactMatch(text))
624  {
625  foundCodeFenceStart = true;
626  token.setType(MarkdownToken::GFMCodeFence);
627  setState(TokenState::GFMCodeFence);
628  }
629 
630  if (foundCodeFenceStart)
631  {
632  token.setPosition(0);
633  token.setLength(text.length());
634  addToken(token);
635  return true;
636  }
637  }
638  return false;
639 }
640 
641 bool MarkdownTokenizer::tokenizeMultilineComment(const QString &text)
642 {
643  if (m_previous == TokenState::Comment) {
644  // Find the end of the comment, if any.
645  int index = text.indexOf("-->");
646  MarkdownToken token;
647  token.setType(MarkdownToken::HtmlComment);
648  token.setPosition(0);
649 
650  if (index >= 0)
651  {
652  token.setLength(index + 3);
653  addToken(token);
654 
655  // Return false so that the rest of the line that isn't within
656  // the commented segment can be highlighted as normal paragraph
657  // text.
658  //
659  return false;
660  }
661  else
662  {
663  token.setLength(text.length());
664  addToken(token);
665  setState(TokenState::Comment);
666  return true;
667  }
668  }
669  return false;
670 }
671 
672 bool MarkdownTokenizer::tokenizeInline(const QString &text)
673 {
674  QString escapedText = dummyOutEscapeCharacters(text);
675 
676  // Check if the line is a reference definition.
677  if (referenceDefinitionRegex.exactMatch(escapedText))
678  {
679  int colonIndex = escapedText.indexOf(':');
680  MarkdownToken token;
681  token.setType(MarkdownToken::ReferenceDefinition);
682  token.setPosition(0);
683  token.setLength(colonIndex + 1);
684  addToken(token);
685 
686  // Replace the first bracket so that the '[...]:' reference definition
687  // start doesn't get highlighted as a reference link.
688  //
689  int firstBracketIndex = escapedText.indexOf(QChar('['));
690 
691  if (firstBracketIndex >= 0)
692  {
693  escapedText[firstBracketIndex] = DUMMY_CHAR;
694  }
695  }
696 
697  tokenizeVerbatim(escapedText);
698  tokenizeHtmlComments(escapedText);
699  tokenizeTableHeaderRow(escapedText);
700  tokenizeTableRow(escapedText);
701  tokenizeMatches(MarkdownToken::Image, escapedText, imageRegex, 0, 0, false, true);
702  tokenizeMatches(MarkdownToken::InlineLink, escapedText, inlineLinkRegex, 0, 0, false, true);
703  tokenizeMatches(MarkdownToken::ReferenceLink, escapedText, referenceLinkRegex, 0, 0, false, true);
704  tokenizeMatches(MarkdownToken::HtmlEntity, escapedText, htmlEntityRegex);
705  tokenizeMatches(MarkdownToken::AutomaticLink, escapedText, automaticLinkRegex, 0, 0, false, true);
706  tokenizeMatches(MarkdownToken::Strikethrough, escapedText, strikethroughRegex, 2, 2);
707  tokenizeMatches(MarkdownToken::Strong, escapedText, strongRegex, 2, 2, true);
708  tokenizeMatches(MarkdownToken::Emphasis, escapedText, emphasisRegex, 1, 1, true);
709  tokenizeMatches(MarkdownToken::HtmlTag, escapedText, htmlTagRegex);
710  tokenizeMatches(MarkdownToken::Mention, escapedText, mentionRegex, 0, 0, false, true);
711  return true;
712 }
713 
714 void MarkdownTokenizer::tokenizeVerbatim(QString &text)
715 {
716  int index = verbatimRegex.indexIn(text);
717 
718  while (index >= 0)
719  {
720  QString end = "";
721  int count = verbatimRegex.matchedLength();
722 
723  // Search for the matching end, which should have the same number
724  // of back ticks as the start.
725  //
726  for (int i = 0; i < count; i++)
727  {
728  end += '`';
729  }
730 
731  int endIndex = text.indexOf(end, index + count);
732 
733  // If the end was found, add the verbatim token.
734  if (endIndex >= 0)
735  {
736  MarkdownToken token;
737 
738  token.setType(MarkdownToken::Verbatim);
739  token.setPosition(index);
740  token.setLength(endIndex + count - index);
741  token.setOpeningLenth(count);
742  token.setClosingLength(count);
743  addToken(token);
744 
745  // Fill out the token match in the string with the dummy
746  // character so that searches for other Markdown elements
747  // don't find anything within this token's range in the string.
748  //
749  for (int i = index; i < (index + token.length()); i++)
750  {
751  text[i] = DUMMY_CHAR;
752  }
753 
754  index += token.length();
755  }
756  // Else start searching again at the very next character.
757  else
758  {
759  index++;
760  }
761 
762  index = verbatimRegex.indexIn(text, index);
763  }
764 }
765 
766 void MarkdownTokenizer::tokenizeHtmlComments(QString &text)
767 {
768  // Check for the end of a multiline comment so that it doesn't get further
769  // tokenized. Don't bother formatting the comment itself, however, because
770  // it should have already been tokenized in tokenizeMultilineComment().
771  //
772  if (m_previous == TokenState::Comment) {
773  int commentEnd = text.indexOf("-->");
774 
775  for (int i = 0; i < commentEnd + 3; i++) {
776  text[i] = DUMMY_CHAR;
777  }
778  }
779 
780  // Now check for inline comments (non-multiline).
781  int commentStart = text.indexOf(htmlInlineCommentRegex);
782 
783  while (commentStart >= 0)
784  {
785  int commentLength = htmlInlineCommentRegex.matchedLength();
786  MarkdownToken token;
787 
788  token.setType(MarkdownToken::HtmlComment);
789  token.setPosition(commentStart);
790  token.setLength(commentLength);
791  addToken(token);
792 
793  // Replace comment segment with dummy characters so that it doesn't
794  // get tokenized again.
795  //
796  for (int i = commentStart; i < (commentStart + commentLength); i++) {
797  text[i] = DUMMY_CHAR;
798  }
799 
800  commentStart = text.indexOf(htmlInlineCommentRegex, commentStart + commentLength);
801  }
802 
803  // Find multiline comment start, if any.
804  commentStart = text.indexOf("<!--");
805 
806  if (commentStart >= 0)
807  {
808  MarkdownToken token;
809 
810  token.setType(MarkdownToken::HtmlComment);
811  token.setPosition(commentStart);
812  token.setLength(text.length() - commentStart);
813  addToken(token);
814  setState(TokenState::Comment);
815 
816  // Replace comment segment with dummy characters so that it doesn't
817  // get tokenized again.
818  //
819  for (int i = commentStart; i < text.length(); i++)
820  {
821  text[i] = DUMMY_CHAR;
822  }
823  }
824 }
825 
826 void MarkdownTokenizer::tokenizeTableHeaderRow(QString &text)
827 {
828  if ((m_previous == TokenState::ParagraphBreak ||
829  m_previous == TokenState::ListLineBreak ||
830  m_previous == TokenState::SetextHead1Line2 ||
831  m_previous == TokenState::SetextHead2Line2 ||
832  m_previous == TokenState::AtxHeading1 ||
833  m_previous == TokenState::AtxHeading2 ||
834  m_previous == TokenState::AtxHeading3 ||
835  m_previous == TokenState::AtxHeading4 ||
836  m_previous == TokenState::AtxHeading5 ||
837  m_previous == TokenState::AtxHeading6 ||
838  m_previous == TokenState::HorizontalRule ||
839  m_previous == TokenState::CodeFenceEnd ||
840  m_previous == TokenState::Unknown)
841  &&
842  (m_state == TokenState::Paragraph ||
843  m_state == TokenState::Unknown)
844  &&
845  (m_next == TokenState::TableDiv)) {
846 
847  setState(TokenState::TableHeader);
848 
849  int headerStart = 0;
850 
851  for (int i = 0; i < text.length(); i++)
852  {
853  if (QChar('|') == text[i])
854  {
855  // Replace pipe with space so that it doesn't get formatted
856  // again with, for example, strong or emphasis formatting.
857  // Note that we use a space rather than DUMMY_CHAR for this,
858  // to prevent formatting such as strong and emphasis from
859  // picking it up.
860  //
861  text[i] = ' ';
862 
863  MarkdownToken token;
864 
865  if (i > 0)
866  {
867  token.setType(MarkdownToken::TableHeader);
868  token.setPosition(headerStart);
869  token.setLength(i - headerStart);
870  addToken(token);
871  }
872 
873  token.setType(MarkdownToken::TablePipe);
874  token.setPosition(i);
875  token.setLength(1);
876  addToken(token);
877  headerStart = i + 1;
878  }
879  }
880 
881  if (headerStart < text.length())
882  {
883  MarkdownToken token;
884  token.setType(MarkdownToken::TableHeader);
885  token.setPosition(headerStart);
886  token.setLength(text.length() - headerStart);
887  addToken(token);
888  }
889  }
890 }
891 
892 bool MarkdownTokenizer::tokenizeTableDivider(const QString &text)
893 {
894  if (m_previous == TokenState::TableHeader)
895  {
896  if (pipeTableDividerRegex.exactMatch(text))
897  {
898  setState(TokenState::TableDiv);
899 
900  MarkdownToken token;
901  token.setType(MarkdownToken::TableDiv);
902  token.setLength(text.length());
903  token.setPosition(0);
904  addToken(token);
905 
906  return true;
907  }
908  else
909  {
910  // Restart tokenizing on the previous line.
911  setStartEarlier(true);
912  }
913  }
914  else if (m_previous == TokenState::Paragraph)
915  {
916  if (pipeTableDividerRegex.exactMatch(text))
917  {
918  // Restart tokenizing on the previous line.
919  setStartEarlier(true);
920 
921  setState(TokenState::TableDiv);
922 
923  MarkdownToken token;
924  token.setLength(text.length());
925  token.setPosition(0);
926  token.setType(MarkdownToken::TableDiv);
927  addToken(token);
928  return true;
929  }
930  }
931  return false;
932 }
933 
934 void MarkdownTokenizer::tokenizeTableRow(QString &text)
935 {
936  if (m_previous == TokenState::TableDiv || m_previous == TokenState::TableRow) {
937 
938  setState(TokenState::TableRow);
939 
940  for (int i = 0; i < text.length(); i++)
941  {
942  if (QChar('|') == text[i])
943  {
944  // Replace pipe with space so that it doesn't get formatted
945  // again with, for example, strong or emphasis formatting.
946  // Note that we use a space rather than DUMMY_CHAR for this,
947  // to prevent formatting such as strong and emphasis from
948  // picking it up.
949  //
950  text[i] = ' ';
951 
952  MarkdownToken token;
953  token.setType(MarkdownToken::TablePipe);
954  token.setPosition(i);
955  token.setLength(1);
956  addToken(token);
957  }
958  }
959  }
960 }
MarkdownToken
Definition: MarkdownToken.h:4