mirror of
https://github.com/marktext/marktext.git
synced 2025-05-02 23:43:41 +08:00
664 lines
18 KiB
JavaScript
664 lines
18 KiB
JavaScript
import { normal, gfm, pedantic } from './blockRules'
|
|
import options from './options'
|
|
import { splitCells, rtrim, getUniqueId } from './utils'
|
|
|
|
/**
 * Block Lexer
 *
 * Creates a block-level lexer. The token list starts empty and carries two
 * prototype-less lookup tables as extra properties: `links` (reference link
 * definitions) and `footnotes` (footnote metadata).
 *
 * @param {Object} [opts] Options layered on top of the module defaults.
 *   `pedantic` selects the pedantic rule set and takes precedence over
 *   `gfm`; when neither is truthy the normal rule set is used.
 */
function Lexer (opts) {
  this.tokens = []
  // Prototype-less maps, so keys taken from the document can never collide
  // with Object.prototype members.
  this.tokens.links = Object.create(null)
  this.tokens.footnotes = Object.create(null)
  this.footnoteOrder = 0
  // Merge into a fresh object so the shared defaults are never mutated.
  this.options = Object.assign({}, options, opts)
  this.rules = normal

  if (this.options.pedantic) {
    this.rules = pedantic
  } else if (this.options.gfm) {
    this.rules = gfm
  }
}
|
|
|
|
/**
 * Preprocessing
 *
 * Normalizes the raw markdown, runs the block tokenizer over it and returns
 * the token list with every footnote definition moved to the end.
 *
 * @param {string} src Raw markdown source.
 * @returns {Array} A new token array. Its `links` and `footnotes` properties
 *   are the same objects as the ones on `this.tokens`.
 */
Lexer.prototype.lex = function (src) {
  // Normalize line endings and expand tabs. A tab becomes four spaces because
  // the block lexer handles indentation in units of four spaces (` {4}`).
  src = src
    .replace(/\r\n|\r/g, '\n')
    .replace(/\t/g, '    ')
  this.checkFrontmatter = true
  this.footnoteOrder = 0
  this.token(src, true)

  // Move footnote token to the end of tokens.
  const { tokens } = this
  const hasNoFootnoteTokens = []
  const footnoteTokens = []
  let isInFootnote = false
  for (const token of tokens) {
    const { type } = token
    if (type === 'footnote_start') {
      isInFootnote = true
      footnoteTokens.push(token)
    } else if (type === 'footnote_end') {
      isInFootnote = false
      footnoteTokens.push(token)
    } else if (isInFootnote) {
      footnoteTokens.push(token)
    } else {
      hasNoFootnoteTokens.push(token)
    }
  }

  // The spread creates a plain array, so the lookup tables are re-attached.
  const result = [...hasNoFootnoteTokens, ...footnoteTokens]
  result.links = tokens.links
  result.footnotes = tokens.footnotes
  return result
}
|
|
|
|
/**
 * Escape the characters that are significant in HTML.
 * Ampersands that already start an entity (e.g. `&amp;`, `&#39;`) are kept.
 *
 * @param {string} html Raw text.
 * @returns {string} Text that is safe to show as literal characters.
 */
function escapeHtml (html) {
  return html
    .replace(/&(?!#?\w+;)/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/**
 * Translate one cell of a table delimiter row into its column alignment.
 *
 * @param {string} cell Delimiter cell such as `:--`, `:-:` or `--:`.
 * @returns {?string} `'left'`, `'center'`, `'right'`, or `null` when the cell
 *   carries no alignment marker.
 */
function parseTableAlignment (cell) {
  if (/^ *-+: *$/.test(cell)) {
    return 'right'
  }
  if (/^ *:-+: *$/.test(cell)) {
    return 'center'
  }
  if (/^ *:-+ *$/.test(cell)) {
    return 'left'
  }
  return null
}

/**
 * Lexing
 *
 * Consumes `src` from the front, trying each block rule in a fixed order, and
 * appends the resulting tokens to `this.tokens`. Container blocks (footnotes,
 * blockquotes, list items) are handled by calling this method recursively on
 * their inner text.
 *
 * @param {string} src Markdown text to tokenize.
 * @param {boolean} top Whether this is top-level content. Front matter,
 *   footnote definitions, link reference definitions and paragraphs are only
 *   recognized when it is truthy. List items recurse with `false`; footnotes
 *   and blockquotes pass the current value through.
 * @throws {Error} When no rule can consume the remaining input.
 */
Lexer.prototype.token = function (src, top) {
  const {
    footnote,
    frontMatter,
    isGitlabCompatibilityEnabled,
    math
  } = this.options
  // Lines that contain only spaces are treated as empty lines.
  src = src.replace(/^ +$/gm, '')

  let loose
  let cap
  let bull
  let b
  let item
  let space
  let i
  let tag
  let l

  // Only check front matter at the beginning of a markdown file.
  // Please see note in "blockquote" why we need "checkFrontmatter" and "top".
  if (frontMatter) {
    cap = this.rules.frontmatter.exec(src)
    if (this.checkFrontmatter && top && cap) {
      src = src.substring(cap[0].length)
      let lang
      let style
      let text
      // Which capture group matched determines language and delimiter style.
      if (cap[1]) {
        lang = 'yaml'
        style = '-'
        text = cap[1]
      } else if (cap[2]) {
        lang = 'toml'
        style = '+'
        text = cap[2]
      } else if (cap[3] || cap[4]) {
        lang = 'json'
        style = cap[3] ? ';' : '{'
        text = cap[3] || cap[4]
      }
      this.tokens.push({
        type: 'frontmatter',
        text,
        style,
        lang
      })
    }
    // Only the very first call may see front matter, matched or not.
    this.checkFrontmatter = false
  }

  while (src) {
    // newline
    // No `continue` here: the remaining rules run on what is left.
    cap = this.rules.newline.exec(src)
    if (cap) {
      src = src.substring(cap[0].length)
      if (cap[0].length > 1) {
        this.tokens.push({
          type: 'space'
        })
      }
    }

    // code
    // An indented code block cannot interrupt a paragraph.
    cap = this.rules.code.exec(src)
    if (cap) {
      const lastToken = this.tokens[this.tokens.length - 1]
      src = src.substring(cap[0].length)
      if (lastToken && lastToken.type === 'paragraph') {
        // `trimEnd` is the standard name of the legacy alias `trimRight`.
        lastToken.text += `\n${cap[0].trimEnd()}`
      } else {
        cap = cap[0].replace(/^ {4}/gm, '')
        this.tokens.push({
          type: 'code',
          codeBlockStyle: 'indented',
          text: !this.options.pedantic
            ? rtrim(cap, '\n')
            : cap
        })
      }
      continue
    }

    // multiple line math
    if (math) {
      cap = this.rules.multiplemath.exec(src)
      if (cap) {
        src = src.substring(cap[0].length)
        this.tokens.push({
          type: 'multiplemath',
          text: cap[1],
          mathStyle: ''
        })
        continue
      }

      // match GitLab display math blocks (```math)
      if (isGitlabCompatibilityEnabled) {
        cap = this.rules.multiplemathGitlab.exec(src)
        if (cap) {
          src = src.substring(cap[0].length)
          this.tokens.push({
            type: 'multiplemath',
            text: cap[2] || '',
            mathStyle: 'gitlab'
          })
          continue
        }
      }
    }

    // footnote
    if (footnote) {
      cap = this.rules.footnote.exec(src)
      if (top && cap) {
        src = src.substring(cap[0].length)
        const identifier = cap[1]
        this.tokens.push({
          type: 'footnote_start',
          identifier
        })

        // NOTE: Order is wrong if footnote identifier 1 is behind footnote identifier 2 in text.
        this.tokens.footnotes[identifier] = {
          order: ++this.footnoteOrder,
          identifier,
          footnoteId: getUniqueId()
        }

        /* eslint-disable no-useless-escape */
        // Remove the footnote identifier prefix. eg: `[^identifier]: `.
        cap = cap[0].replace(/^\[\^[^\^\[\]\s]+?(?<!\\)\]:\s+/gm, '')
        // Remove the four whitespace before each block of footnote.
        cap = cap.replace(/\n {4}(?=[^\s])/g, '\n')
        /* eslint-enable no-useless-escape */

        this.token(cap, top)

        this.tokens.push({
          type: 'footnote_end'
        })

        continue
      }
    }

    // fences
    cap = this.rules.fences.exec(src)
    if (cap) {
      src = src.substring(cap[0].length)
      const raw = cap[0]
      const text = indentCodeCompensation(raw, cap[3] || '')
      this.tokens.push({
        type: 'code',
        codeBlockStyle: 'fenced',
        lang: cap[2] ? cap[2].trim() : cap[2],
        text
      })
      continue
    }

    // heading
    cap = this.rules.heading.exec(src)
    if (cap) {
      src = src.substring(cap[0].length)
      let text = cap[2] ? cap[2].trim() : ''

      // Strip an optional closing sequence of #s.
      if (text.endsWith('#')) {
        const trimmed = rtrim(text, '#')

        if (this.options.pedantic) {
          text = trimmed.trim()
        } else if (!trimmed || trimmed.endsWith(' ')) {
          // CommonMark requires space before trailing #s
          text = trimmed.trim()
        }
      }

      this.tokens.push({
        type: 'heading',
        headingStyle: 'atx',
        depth: cap[1].length,
        text
      })
      continue
    }

    // table no leading pipe (gfm)
    cap = this.rules.nptable.exec(src)
    if (cap) {
      item = {
        type: 'table',
        header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')),
        align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
        cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : []
      }

      // Only a table when header and delimiter row have the same column
      // count; otherwise fall through to the remaining rules.
      if (item.header.length === item.align.length) {
        src = src.substring(cap[0].length)

        item.align = item.align.map(parseTableAlignment)

        for (i = 0; i < item.cells.length; i++) {
          item.cells[i] = splitCells(item.cells[i], item.header.length)
        }

        this.tokens.push(item)

        continue
      }
    }

    // hr
    cap = this.rules.hr.exec(src)
    if (cap) {
      const marker = cap[0].replace(/\n*$/, '')
      src = src.substring(cap[0].length)
      this.tokens.push({
        type: 'hr',
        marker
      })
      continue
    }

    // blockquote
    cap = this.rules.blockquote.exec(src)
    if (cap) {
      src = src.substring(cap[0].length)

      this.tokens.push({
        type: 'blockquote_start'
      })

      cap = cap[0].replace(/^ *> ?/gm, '')

      // Pass `top` to keep the current
      // "toplevel" state. This is exactly
      // how markdown.pl works.
      this.token(cap, top)

      this.tokens.push({
        type: 'blockquote_end'
      })

      continue
    }

    // NOTE: Complete list lexer part is a custom implementation based on an older marked.js version.

    // list
    cap = this.rules.list.exec(src)
    if (cap) {
      let checked
      src = src.substring(cap[0].length)
      bull = cap[2]
      // Ordered markers ("1.", "2)") are longer than one character.
      let isOrdered = bull.length > 1
      this.tokens.push({
        type: 'list_start',
        ordered: isOrdered,
        listType: bull.length > 1 ? 'order' : (/^( {0,3})([-*+]) \[[xX ]\]/.test(cap[0]) ? 'task' : 'bullet'),
        start: isOrdered ? +(bull.slice(0, -1)) : ''
      })

      let next = false
      let prevNext = true
      let listItemIndices = []
      let isTaskList = false

      // Get each top-level item.
      cap = cap[0].match(this.rules.item)
      l = cap.length
      i = 0

      for (; i < l; i++) {
        const itemWithBullet = cap[i]
        item = itemWithBullet
        let newIsTaskListItem = false

        // Remove the list item's bullet so it is seen as the next token.
        space = item.length
        let newBull
        item = item.replace(/^ *([*+-]|\d+(?:\.|\))) {0,4}/, function (m, p1) {
          // Get and remove list item bullet
          newBull = p1 || bull
          return ''
        })

        const newIsOrdered = bull.length > 1 && /\d{1,9}/.test(newBull)
        if (!newIsOrdered && this.options.gfm) {
          checked = this.rules.checkbox.exec(item)
          if (checked) {
            checked = checked[1] === 'x' || checked[1] === 'X'
            newIsTaskListItem = true

            // Remove the list item's checkbox and adjust indentation by removing checkbox length.
            item = item.replace(this.rules.checkbox, '')
            space -= 4
          } else {
            checked = undefined
          }
        }

        if (i === 0) {
          isTaskList = newIsTaskListItem
        } else if (
          // Changing the bullet or ordered list delimiter starts a new list (CommonMark 264 and 265)
          // - unordered, unordered --> bull !== newBull --> new list (e.g "-" --> "*")
          // - ordered, ordered --> lastChar !== lastChar --> new list (e.g "." --> ")")
          // - else --> new list (e.g. ordered --> unordered)
          (!isOrdered && !newIsOrdered && bull !== newBull) ||
          (isOrdered && newIsOrdered && bull.slice(-1) !== newBull.slice(-1)) ||
          (isOrdered !== newIsOrdered) ||
          // Changing to/from task list item from/to bullet, starts a new list(work for marktext issue #870)
          // Because we distinguish between task list and bullet list in Mark Text,
          // the parsing here is somewhat different from the commonmark Spec,
          // and the task list needs to be a separate list.
          (isTaskList !== newIsTaskListItem)
        ) {
          this.tokens.push({
            type: 'list_end'
          })

          // Start a new list
          bull = newBull
          isOrdered = newIsOrdered
          isTaskList = newIsTaskListItem
          this.tokens.push({
            type: 'list_start',
            ordered: isOrdered,
            listType: bull.length > 1 ? 'order' : (/^( {0,3})([-*+]) \[[xX ]\]/.test(itemWithBullet) ? 'task' : 'bullet'),
            start: isOrdered ? +(bull.slice(0, -1)) : ''
          })
        }

        // Outdent whatever the
        // list item contains. Hacky.
        if (~item.indexOf('\n ')) {
          // `space` becomes the width of the removed bullet (and checkbox).
          space -= item.length
          item = !this.options.pedantic
            ? item.replace(new RegExp('^ {1,' + space + '}', 'gm'), '')
            : item.replace(/^ {1,4}/gm, '')
        }

        // Determine whether the next list item belongs here.
        // Backpedal if it does not belong in this list.
        if (i !== l - 1) {
          b = this.rules.bullet.exec(cap[i + 1])[0]
          if (bull.length > 1 ? b.length === 1
            : (b.length > 1 || (this.options.smartLists && b !== bull))) {
            // Hand the remaining items back to the input and end this list.
            src = cap.slice(i + 1).join('\n') + src
            i = l - 1
          }
        }

        let prevItem = ''
        if (i === 0) {
          prevItem = item
        } else {
          prevItem = cap[i - 1]
        }

        // Determine whether item is loose or not. If previous item is loose
        // this item is also loose.
        // A list is loose if any of its constituent list items are separated by blank lines,
        // or if any of its constituent list items directly contain two block-level elements with a blank line between them.
        // loose = next = next || /^ *([*+-]|\d{1,9}(?:\.|\)))( +\S+\n\n(?!\s*$)|\n\n(?!\s*$))/.test(itemWithBullet)
        loose = next = next || /\n\n(?!\s*$)/.test(item)
        // Check if previous line ends with a new line.
        if (!loose && (i !== 0 || l > 1) && prevItem.length !== 0 && prevItem.charAt(prevItem.length - 1) === '\n') {
          loose = next = true
        }

        // A list is either loose or tight, so update previous list items but not nested list items.
        if (next && prevNext !== next) {
          for (const index of listItemIndices) {
            this.tokens[index].type = 'loose_item_start'
          }
          listItemIndices = []
        }
        prevNext = next

        // Remember where tight items start so they can be upgraded later.
        if (!loose) {
          listItemIndices.push(this.tokens.length)
        }

        const isOrderedListItem = /\d/.test(bull)
        this.tokens.push({
          checked,
          listItemType: bull.length > 1 ? 'order' : (isTaskList ? 'task' : 'bullet'),
          bulletMarkerOrDelimiter: isOrderedListItem ? bull.slice(-1) : bull.charAt(0),
          type: loose ? 'loose_item_start' : 'list_item_start'
        })

        if (/^\s*$/.test(item)) {
          this.tokens.push({
            type: 'text',
            text: ''
          })
        } else {
          // Recurse.
          this.token(item, false)
        }

        this.tokens.push({
          type: 'list_item_end'
        })
      }

      this.tokens.push({
        type: 'list_end'
      })
      continue
    }

    // html
    cap = this.rules.html.exec(src)
    if (cap) {
      src = src.substring(cap[0].length)
      this.tokens.push({
        type: this.options.sanitize
          ? 'paragraph'
          : 'html',
        pre: !this.options.sanitizer &&
          (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'),
        // Without a custom sanitizer the markup is HTML-escaped. The bare
        // name `escape` is not imported in this file and would resolve to the
        // deprecated global `escape()`, which percent-encodes instead.
        text: this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escapeHtml(cap[0])) : cap[0]
      })
      continue
    }

    // def
    cap = this.rules.def.exec(src)
    if (top && cap) {
      let text = ''
      // Consume consecutive definitions; a blank line ends the run.
      do {
        src = src.substring(cap[0].length)
        // Drop the delimiters surrounding the title.
        if (cap[3]) cap[3] = cap[3].substring(1, cap[3].length - 1)
        tag = cap[1].toLowerCase().replace(/\s+/g, ' ')
        // The first definition of a label wins.
        if (!this.tokens.links[tag]) {
          this.tokens.links[tag] = {
            href: cap[2],
            title: cap[3]
          }
        }

        text += cap[0]
        if (cap[0].endsWith('\n\n')) break
        cap = this.rules.def.exec(src)
      } while (cap)

      if (this.options.disableInline) {
        this.tokens.push({
          type: 'paragraph',
          text: text.replace(/\n*$/, '')
        })
      }
      continue
    }

    // table (gfm)
    cap = this.rules.table.exec(src)
    if (cap) {
      item = {
        type: 'table',
        header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')),
        align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
        cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : []
      }

      if (item.header.length === item.align.length) {
        src = src.substring(cap[0].length)

        item.align = item.align.map(parseTableAlignment)

        for (i = 0; i < item.cells.length; i++) {
          item.cells[i] = splitCells(
            item.cells[i].replace(/^ *\| *| *\| *$/g, ''),
            item.header.length)
        }

        this.tokens.push(item)

        continue
      }
    }

    // lheading
    cap = this.rules.lheading.exec(src)
    if (cap) {
      const precededToken = this.tokens[this.tokens.length - 1]
      const chops = cap[0].trim().split(/\n/)
      // The last line of the match is the underline.
      const marker = chops[chops.length - 1]
      src = src.substring(cap[0].length)

      if (precededToken && precededToken.type === 'paragraph') {
        // A directly preceding paragraph is merged into the heading text.
        this.tokens.pop()
        this.tokens.push({
          type: 'heading',
          headingStyle: 'setext',
          depth: cap[2].charAt(0) === '=' ? 1 : 2,
          text: precededToken.text + '\n' + cap[1],
          marker
        })
      } else {
        this.tokens.push({
          type: 'heading',
          headingStyle: 'setext',
          depth: cap[2].charAt(0) === '=' ? 1 : 2,
          text: cap[1],
          marker
        })
      }
      continue
    }

    // top-level paragraph
    cap = this.rules.paragraph.exec(src)
    if (top && cap) {
      src = src.substring(cap[0].length)
      this.tokens.push({
        type: 'paragraph',
        text: cap[1].charAt(cap[1].length - 1) === '\n'
          ? cap[1].slice(0, -1)
          : cap[1]
      })
      continue
    }

    // text
    cap = this.rules.text.exec(src)
    if (cap) {
      // Top-level should never reach here.
      src = src.substring(cap[0].length)
      this.tokens.push({
        type: 'text',
        text: cap[0]
      })
      continue
    }

    // No rule consumed anything: fail instead of spinning forever.
    if (src) {
      throw new Error('Infinite loop on byte: ' + src.charCodeAt(0))
    }
  }
}
|
|
|
|
/**
 * Remove the indentation of an indented fenced code block from its content.
 *
 * When the opening fence is indented by N whitespace characters, up to N
 * leading whitespace characters are removed from every content line. Both
 * backtick and tilde fences are recognized.
 *
 * @param {string} raw Complete matched fenced block, starting at the fence line.
 * @param {string} text Content between the fences.
 * @returns {string} `text` with the fence indentation removed, or `text`
 *   unchanged when the opening fence is not indented.
 */
function indentCodeCompensation (raw, text) {
  const matchIndentToCode = raw.match(/^(\s+)(?:```|~~~)/)

  if (matchIndentToCode === null) {
    return text
  }

  const indentToCode = matchIndentToCode[1]

  return text
    .split('\n')
    .map(node => {
      const matchIndentInNode = node.match(/^\s+/)
      if (matchIndentInNode === null) {
        return node
      }

      const [indentInNode] = matchIndentInNode

      // A line indented less than the fence loses all of its indentation;
      // any other line loses exactly the fence indentation.
      return node.slice(Math.min(indentInNode.length, indentToCode.length))
    })
    .join('\n')
}
|
|
|
|
export default Lexer
|