import { normal, gfm, tables } from './blockRules' import { options } from './utils' /** * Block Lexer */ function Lexer (opts) { this.tokens = [] this.tokens.links = {} this.options = Object.assign({}, options, opts) this.rules = normal if (this.options.gfm) { if (this.options.tables) { this.rules = tables } else { this.rules = gfm } } } /** * Preprocessing */ Lexer.prototype.lex = function (src) { src = src .replace(/\r\n|\r/g, '\n') .replace(/\t/g, ' ') .replace(/\u00a0/g, ' ') .replace(/\u2424/g, '\n') return this.token(src, true) } /** * Lexing */ Lexer.prototype.token = function (src, top, bq) { src = src.replace(/^ +$/gm, '') let loose let cap let bull let b let item let space let i let l let checked // Only check front matter at the begining of markdown file cap = this.rules.frontmatter.exec(src) if (!bq && top && cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'frontmatter', text: cap[1] }) } while (src) { // newline cap = this.rules.newline.exec(src) if (cap) { src = src.substring(cap[0].length) if (cap[0].length > 1) { this.tokens.push({ type: 'space' }) } } // code cap = this.rules.code.exec(src) if (cap) { src = src.substring(cap[0].length) cap = cap[0].replace(/^ {4}/gm, '') this.tokens.push({ type: 'code', codeBlockStyle: 'indented', text: !this.options.pedantic ? cap.replace(/\n+$/, '') : cap }) continue } // multiple line math cap = this.rules.multiplemath.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'multiplemath', text: cap[1] }) continue } // fences (gfm) cap = this.rules.fences.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'code', codeBlockStyle: 'fenced', lang: cap[2] ? cap[2].trim() : cap[2], text: cap[3] || '' }) continue } // heading cap = this.rules.heading.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'heading', headingStyle: 'atx', depth: cap[1].length, text: cap[2] }) continue } // table no leading pipe (gfm) cap = this.rules.nptable.exec(src) if (top && cap) { item = { type: 'table', header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')), align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */), cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : [] } if (item.header.length === item.align.length) { src = src.substring(cap[0].length) for (i = 0; i < item.align.length; i++) { if (/^ *-+: *$/.test(item.align[i])) { item.align[i] = 'right' } else if (/^ *:-+: *$/.test(item.align[i])) { item.align[i] = 'center' } else if (/^ *:-+ *$/.test(item.align[i])) { item.align[i] = 'left' } else { item.align[i] = null } } for (i = 0; i < item.cells.length; i++) { item.cells[i] = splitCells(item.cells[i], item.header.length) } this.tokens.push(item) continue } } // hr cap = this.rules.hr.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'hr' }) continue } // blockquote cap = this.rules.blockquote.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'blockquote_start' }) cap = cap[0].replace(/^ *> ?/gm, '') // Pass `top` to keep the current // "toplevel" state. This is exactly // how markdown.pl works. this.token(cap, top, true) this.tokens.push({ type: 'blockquote_end' }) continue } // list cap = this.rules.tasklist.exec(src) || this.rules.orderlist.exec(src) || this.rules.bulletlist.exec(src) if (cap) { src = src.substring(cap[0].length) bull = cap[2] const ordered = bull.length > 1 && /\d/.test(bull) this.tokens.push({ type: 'list_start', ordered, listType: bull.length > 1 ? (/\d/.test(bull) ? 'order' : 'task') : 'bullet', start: ordered ? +bull : '' }) let next = false let prevNext = true let listItemIndices = [] // Get each top-level item. cap = cap[0].match(this.rules.item) l = cap.length i = 0 for (; i < l; i++) { const itemWithBullet = cap[i] item = itemWithBullet // Remove the list item's bullet // so it is seen as the next token. space = item.length item = item.replace(/^ *([*+-]|\d+\.) +/, '') if (this.options.gfm) { checked = this.rules.checkbox.exec(item) if (checked) { checked = checked[1] === 'x' item = item.replace(this.rules.checkbox, '') } else { checked = undefined } } // Outdent whatever the // list item contains. Hacky. if (~item.indexOf('\n ')) { space -= item.length item = !this.options.pedantic ? item.replace(new RegExp('^ {1,' + space + '}', 'gm'), '') : item.replace(/^ {1,4}/gm, '') } // Determine whether the next list item belongs here. // Backpedal if it does not belong in this list. if (this.options.smartLists && i !== l - 1) { b = this.rules.bullet.exec(cap[i + 1])[0] if (bull !== b && !(bull.length > 1 && b.length > 1)) { src = cap.slice(i + 1).join('\n') + src i = l - 1 } } let prevItem = '' if (i === 0) { prevItem = item } else { prevItem = cap[i - 1] } // Determine whether item is loose or not. If previous item is loose // this item is also loose. loose = next = next || /^ *([*+-]|\d+\.) +\S+\n\n(?!\s*$)/.test(itemWithBullet) // Check if previous line ends with a new line. if (!loose && (i !== 0 || l > 1) && prevItem.length !== 0 && prevItem.charAt(prevItem.length - 1) === '\n') { loose = next = true } // A list is either loose or tight, so update previous list items. if (next && prevNext !== next) { for (const index of listItemIndices) { this.tokens[index].type = 'loose_item_start' } listItemIndices = [] } prevNext = next if (!loose) { listItemIndices.push(this.tokens.length) } this.tokens.push({ checked: checked, listItemType: bull.length > 1 ? (/\d/.test(bull) ? 'order' : 'task') : 'bullet', bulletListItemMarker: /\d/.test(bull) ? '' : bull.charAt(0), type: loose ? 'loose_item_start' : 'list_item_start' }) if (/^\s*$/.test(item)) { this.tokens.push({ type: 'text', text: '' }) } else { // Recurse. this.token(item, false, bq) } this.tokens.push({ type: 'list_item_end' }) } this.tokens.push({ type: 'list_end' }) continue } // html cap = this.rules.html.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: this.options.sanitize ? 'paragraph' : 'html', pre: !this.options.sanitizer && (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'), text: cap[0] }) continue } // def cap = this.rules.def.exec(src) if (!bq && top && cap) { let text = '' do { src = src.substring(cap[0].length) this.tokens.links[cap[1].toLowerCase()] = { href: cap[2], title: cap[3] } text += cap[0] if (cap[0].endsWith('\n\n')) break cap = this.rules.def.exec(src) } while (cap) if (this.options.disableInline) { this.tokens.push({ type: 'paragraph', text: text.replace(/\n*$/, '') }) } continue } // table (gfm) cap = this.rules.table.exec(src) if (cap) { item = { type: 'table', header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')), align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */), cells: cap[3] ? cap[3].replace(/(?: *\| *)?\n$/, '').split('\n') : [] } if (item.header.length === item.align.length) { src = src.substring(cap[0].length) for (i = 0; i < item.align.length; i++) { if (/^ *-+: *$/.test(item.align[i])) { item.align[i] = 'right' } else if (/^ *:-+: *$/.test(item.align[i])) { item.align[i] = 'center' } else if (/^ *:-+ *$/.test(item.align[i])) { item.align[i] = 'left' } else { item.align[i] = null } } for (i = 0; i < item.cells.length; i++) { item.cells[i] = splitCells( item.cells[i].replace(/^ *\| *| *\| *$/g, ''), item.header.length) } this.tokens.push(item) continue } } // lheading cap = this.rules.lheading.exec(src) if (cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'heading', headingStyle: 'setext', depth: cap[2] === '=' ? 1 : 2, text: cap[1] }) continue } // top-level paragraph cap = this.rules.paragraph.exec(src) if (top && cap) { src = src.substring(cap[0].length) this.tokens.push({ type: 'paragraph', text: cap[1].charAt(cap[1].length - 1) === '\n' ? cap[1].slice(0, -1) : cap[1] }) continue } // text cap = this.rules.text.exec(src) if (cap) { // Top-level should never reach here. src = src.substring(cap[0].length) this.tokens.push({ type: 'text', text: cap[0] }) continue } if (src) { throw new Error('Infinite loop on byte: ' + src.charCodeAt(0)) } } return this.tokens } function splitCells (tableRow, count) { // ensure that every cell-delimiting pipe has a space // before it to distinguish it from an escaped pipe let row = tableRow.replace(/\|/g, function (match, offset, str) { let escaped = false let curr = offset while (--curr >= 0 && str[curr] === '\\') escaped = !escaped if (escaped) { // odd number of slashes means | is escaped // so we leave it alone return '|' } else { // add space before unescaped | return ' |' } }) let cells = row.split(/ \|/) if (cells.length > count) { cells.splice(count) } else { while (cells.length < count) cells.push('') } for (let i = 0; i < cells.length; i++) { // leading or trailing whitespace is ignored per the gfm spec cells[i] = cells[i].trim().replace(/\\\|/g, '|') } return cells } export default Lexer