import { beginRules, inlineRules } from './rules'
import { isLengthEven, union } from '../utils'
import { punctuation, WHITELIST_ATTRIBUTES } from '../config'

// `image` cannot nest, but it still carries children.
const CAN_NEST_RULES = ['strong', 'em', 'link', 'del', 'image', 'a_link']
// Disallowed HTML tags, per https://github.github.com/gfm/#raw-html
const disallowedHtmlTag = /(?:title|textarea|style|xmp|iframe|noembed|noframes|script|plaintext)/i

// Rules used by `lowerPriority` to detect a competing, longer match.
const validateRules = Object.assign({}, inlineRules)
delete validateRules.em
delete validateRules.strong
delete validateRules['tail_header']
delete validateRules['backlash']

const validWidthAndHeight = value => {
  if (!/^\d+$/.test(value)) return ''
  value = parseInt(value, 10)
  return value >= 0 ? value : ''
}

const getAttributes = html => {
  const parser = new DOMParser()
  const doc = parser.parseFromString(html, 'text/html')
  const target = doc.querySelector('body').firstElementChild
  if (!target) return null

  const attrs = {}
  for (const attr of target.getAttributeNames()) {
    if (!WHITELIST_ATTRIBUTES.includes(attr)) continue
    if (/width|height/.test(attr)) {
      attrs[attr] = validWidthAndHeight(target.getAttribute(attr))
    } else {
      attrs[attr] = target.getAttribute(attr)
    }
  }

  return attrs
}

// Returns false if any rule in `validateRules` matches past `offset`,
// i.e. some higher-priority construct would swallow the candidate span.
const lowerPriority = (src, offset) => {
  for (let i = 0; i < offset; i++) {
    const text = src.substring(i)
    for (const rule of Object.keys(validateRules)) {
      const to = validateRules[rule].exec(text)
      if (to && to[0].length > offset - i) {
        return false
      }
    }
  }
  return true
}

const validateEmphasize = (src, offset, marker, pending) => {
  /**
   * Intraword emphasis is disallowed for _.
   */
  const lastChar = pending.charAt(pending.length - 1)
  const followedChar = src[offset]
  const ALPHA_REG = /[a-zA-Z]/

  if (/_/.test(marker)) {
    if (ALPHA_REG.test(lastChar)) return false
    if (followedChar && ALPHA_REG.test(followedChar)) return false
  }
  /**
   * 1. This is not emphasis, because the second * is preceded by punctuation
   *    and followed by an alphanumeric (hence it is not part of a
   *    right-flanking delimiter run).
   * 2. This is not emphasis, because the opening * is preceded by an
   *    alphanumeric and followed by punctuation (hence it is not part of a
   *    left-flanking delimiter run).
   */
  if (ALPHA_REG.test(lastChar) && punctuation.indexOf(src[marker.length]) > -1) {
    return false
  }
  if (followedChar && ALPHA_REG.test(followedChar) && punctuation.indexOf(src[offset - marker.length - 1]) > -1) {
    return false
  }
  /**
   * When there are two potential emphasis or strong emphasis spans with the
   * same closing delimiter, the shorter one (the one that opens later) takes
   * precedence. Thus, for example, **foo **bar baz** is parsed as
   * **foo <strong>bar baz</strong> rather than <strong>foo **bar baz</strong>.
   */
  const mLen = marker.length
  const emphasizeText = src.substring(mLen, offset - mLen)
  const index = emphasizeText.indexOf(marker)
  if (index > -1 && /\S/.test(emphasizeText[index + mLen])) {
    return false
  }
  /**
   * Inline code spans, links, images, and HTML tags group more tightly than
   * emphasis. So, when there is a choice between an interpretation that
   * contains one of these elements and one that does not, the former always
   * wins. Thus, for example, *[foo*](bar) is parsed as *foo* rather than as
   * [foo](bar).
   */
  return lowerPriority(src, offset)
}
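// A rough sketch of how the checks above fire (illustrative only; the exact
// offsets and markers come from whatever the strong/em rules in './rules'
// actually capture):
//
//   validateEmphasize('**foo **bar baz**', 17, '**', '')
//   // -> false: the inner '**bar baz**' opens later and takes precedence.
//
//   validateEmphasize('*[foo*](bar)', 6, '*', '')
//   // -> false: lowerPriority sees the link rule matching past the span.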
const tokenizerFac = (src, beginRules, inlineRules, pos = 0, top) => {
  const tokens = []
  let pending = ''
  let pendingStartPos = pos

  // Flush any accumulated plain text as a `text` token.
  const pushPending = () => {
    if (pending) {
      tokens.push({
        type: 'text',
        raw: pending,
        content: pending,
        range: {
          start: pendingStartPos,
          end: pos
        }
      })
    }
    pendingStartPos = pos
    pending = ''
  }

  if (beginRules && pos === 0) {
    const beginRuleList = ['header', 'hr', 'code_fense', 'multiple_math'] // 'code_fense' matches the key spelling in './rules'

    for (const ruleName of beginRuleList) {
      const to = beginRules[ruleName].exec(src)
      if (to) {
        const token = {
          type: ruleName,
          raw: to[0],
          parent: tokens,
          marker: to[1],
          content: to[2] || '',
          backlash: to[3] || '',
          range: {
            start: pos,
            end: pos + to[0].length
          }
        }
        tokens.push(token)
        src = src.substring(to[0].length)
        pos = pos + to[0].length
        break
      }
    }

    const def = beginRules['reference_definition'].exec(src)
    if (def && isLengthEven(def[3])) {
      const token = {
        type: 'reference_definition',
        parent: tokens,
        leftBracket: def[1],
        label: def[2],
        backlash: def[3] || '',
        rightBracket: def[4],
        leftHrefMarker: def[5] || '',
        href: def[6],
        rightHrefMarker: def[7] || '',
        leftTitlespace: def[8],
        titleMarker: def[9] || '',
        title: def[10] || '',
        rightTitleSpace: def[11] || '',
        raw: def[0],
        range: {
          start: pos,
          end: pos + def[0].length
        }
      }
      tokens.push(token)
      src = src.substring(def[0].length)
      pos = pos + def[0].length
    }
  }

  while (src.length) {
    // backlash
    const backTo = inlineRules.backlash.exec(src)
    if (backTo) {
      pushPending()
      tokens.push({
        type: 'backlash',
        raw: backTo[1],
        marker: backTo[1],
        parent: tokens,
        content: '',
        range: {
          start: pos,
          end: pos + backTo[1].length
        }
      })
      pending += backTo[2] // pending is empty after pushPending(); the escaped character starts the next text run
      pendingStartPos = pos + backTo[1].length
      src = src.substring(backTo[0].length)
      pos = pos + backTo[0].length
      continue
    }
    // strong | em
    const emRules = ['strong', 'em']
    let inChunk

    for (const rule of emRules) {
      const to = inlineRules[rule].exec(src)
      if (to && isLengthEven(to[3])) {
        const isValid = validateEmphasize(src, to[0].length, to[1], pending)
        if (isValid) {
          inChunk = true
          pushPending()
          const range = {
            start: pos,
            end: pos + to[0].length
          }
          const marker = to[1]
          tokens.push({
            type: rule,
            raw: to[0],
            range,
            marker,
            parent: tokens,
            children: tokenizerFac(to[2], undefined, inlineRules, pos + to[1].length, false),
            backlash: to[3]
          })
          src = src.substring(to[0].length)
          pos = pos + to[0].length
        }
        break
      }
    }

    if (inChunk) continue
    // inline_code | del | emoji | inline_math
    const chunks = ['inline_code', 'del', 'emoji', 'inline_math']

    for (const rule of chunks) {
      const to = inlineRules[rule].exec(src)
      if (to && isLengthEven(to[3])) {
        if (rule === 'emoji' && !lowerPriority(src, to[0].length)) break
        inChunk = true
        pushPending()
        const range = {
          start: pos,
          end: pos + to[0].length
        }
        const marker = to[1]
        if (rule === 'inline_code' || rule === 'emoji' || rule === 'inline_math') {
          tokens.push({
            type: rule,
            raw: to[0],
            range,
            marker,
            parent: tokens,
            content: to[2],
            backlash: to[3]
          })
        } else {
          // only `del` reaches this branch; it can contain nested inlines
          tokens.push({
            type: rule,
            raw: to[0],
            range,
            marker,
            parent: tokens,
            children: tokenizerFac(to[2], undefined, inlineRules, pos + to[1].length, false),
            backlash: to[3]
          })
        }
        src = src.substring(to[0].length)
        pos = pos + to[0].length
        break
      }
    }

    if (inChunk) continue
    // image
    const imageTo = inlineRules.image.exec(src)
    if (imageTo && isLengthEven(imageTo[3]) && isLengthEven(imageTo[5])) {
      pushPending()
      tokens.push({
        type: 'image',
        raw: imageTo[0],
        marker: imageTo[1],
        src: imageTo[4],
        parent: tokens,
        range: {
          start: pos,
          end: pos + imageTo[0].length
        },
        alt: imageTo[2],
        backlash: {
          first: imageTo[3],
          second: imageTo[5]
        }
      })
      src = src.substring(imageTo[0].length)
      pos = pos + imageTo[0].length
      continue
    }
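    // Every branch in this loop follows the same pattern: run a rule, check
    // that any captured backslash run has even length (so the delimiter
    // itself is not escaped), flush pending text, push a token whose `range`
    // indexes into the original string, then consume the match and continue.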
    // link
    const linkTo = inlineRules.link.exec(src)
    if (linkTo && isLengthEven(linkTo[3]) && isLengthEven(linkTo[5])) {
      pushPending()
      tokens.push({
        type: 'link',
        raw: linkTo[0],
        marker: linkTo[1],
        href: linkTo[4],
        parent: tokens,
        anchor: linkTo[2],
        range: {
          start: pos,
          end: pos + linkTo[0].length
        },
        children: tokenizerFac(linkTo[2], undefined, inlineRules, pos + linkTo[1].length, false),
        backlash: {
          first: linkTo[3],
          second: linkTo[5]
        }
      })
      src = src.substring(linkTo[0].length)
      pos = pos + linkTo[0].length
      continue
    }
    // reference link
    const rLinkTo = inlineRules['reference_link'].exec(src)
    if (rLinkTo && isLengthEven(rLinkTo[2]) && isLengthEven(rLinkTo[4])) {
      pushPending()
      tokens.push({
        type: 'reference_link',
        raw: rLinkTo[0],
        isFullLink: !!rLinkTo[3],
        parent: tokens,
        anchor: rLinkTo[1],
        backlash: {
          first: rLinkTo[2],
          second: rLinkTo[4] || ''
        },
        label: rLinkTo[3] || rLinkTo[1],
        range: {
          start: pos,
          end: pos + rLinkTo[0].length
        },
        children: tokenizerFac(rLinkTo[1], undefined, inlineRules, pos + 1, false)
      })
      src = src.substring(rLinkTo[0].length)
      pos = pos + rLinkTo[0].length
      continue
    }
    // reference image
    const rImageTo = inlineRules['reference_image'].exec(src)
    if (rImageTo && isLengthEven(rImageTo[2]) && isLengthEven(rImageTo[4])) {
      pushPending()
      tokens.push({
        type: 'reference_image',
        raw: rImageTo[0],
        isFullLink: !!rImageTo[3],
        parent: tokens,
        alt: rImageTo[1],
        backlash: {
          first: rImageTo[2],
          second: rImageTo[4] || ''
        },
        label: rImageTo[3] || rImageTo[1],
        range: {
          start: pos,
          end: pos + rImageTo[0].length
        }
      })
      src = src.substring(rImageTo[0].length)
      pos = pos + rImageTo[0].length
      continue
    }
    // html escape
    const htmlEscapeTo = inlineRules['html_escape'].exec(src)
    if (htmlEscapeTo) {
      const len = htmlEscapeTo[0].length
      pushPending()
      tokens.push({
        type: 'html_escape',
        raw: htmlEscapeTo[0],
        escapeCharacter: htmlEscapeTo[1],
        parent: tokens,
        range: {
          start: pos,
          end: pos + len
        }
      })
      src = src.substring(len)
      pos = pos + len
      continue
    }
    // html tag
    const htmlTo = inlineRules['html_tag'].exec(src)
    let attrs
    // handle HTML comments
    if (htmlTo && htmlTo[1] && !htmlTo[3]) {
      const len = htmlTo[0].length
      pushPending()
      tokens.push({
        type: 'html_tag',
        raw: htmlTo[0],
        tag: '',
        openTag: htmlTo[1],
        parent: tokens,
        attrs: {},
        range: {
          start: pos,
          end: pos + len
        }
      })
      src = src.substring(len)
      pos = pos + len
      continue
    }
    if (htmlTo && !(disallowedHtmlTag.test(htmlTo[3])) && (attrs = getAttributes(htmlTo[0]))) {
      const tag = htmlTo[3]
      const html = htmlTo[0]
      const len = htmlTo[0].length
      pushPending()
      tokens.push({
        type: 'html_tag',
        raw: html,
        tag,
        openTag: htmlTo[2],
        closeTag: htmlTo[5],
        parent: tokens,
        attrs,
        content: htmlTo[4],
        children: htmlTo[4] ? tokenizerFac(htmlTo[4], undefined, inlineRules, pos + htmlTo[2].length, false) : '',
        range: {
          start: pos,
          end: pos + len
        }
      })
      src = src.substring(len)
      pos = pos + len
      continue
    }
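    // Raw HTML that fails the checks above (a disallowed tag name, or markup
    // DOMParser cannot turn into an element) is not discarded: it falls
    // through to the character-by-character accumulator at the bottom of the
    // loop and is kept as plain text.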
    // auto link
    const autoLTo = inlineRules['auto_link'].exec(src)
    if (autoLTo) {
      pushPending()
      tokens.push({
        type: 'auto_link',
        raw: autoLTo[0],
        href: autoLTo[0],
        parent: tokens,
        range: {
          start: pos,
          end: pos + autoLTo[0].length
        }
      })
      src = src.substring(autoLTo[0].length)
      pos = pos + autoLTo[0].length
      continue
    }
    // hard line break
    const hardTo = inlineRules['hard_line_break'].exec(src)
    if (hardTo && top) {
      const len = hardTo[0].length
      pushPending()
      tokens.push({
        type: 'hard_line_break',
        raw: hardTo[0],
        spaces: hardTo[1],
        parent: tokens,
        range: {
          start: pos,
          end: pos + len
        }
      })
      src = src.substring(len)
      pos += len
      continue
    }
    // tail header
    const tailTo = inlineRules['tail_header'].exec(src)
    if (tailTo && top) {
      pushPending()
      tokens.push({
        type: 'tail_header',
        raw: tailTo[1],
        marker: tailTo[1],
        parent: tokens,
        range: {
          start: pos,
          end: pos + tailTo[1].length
        }
      })
      src = src.substring(tailTo[1].length)
      pos += tailTo[1].length
      continue
    }

    // No rule matched: take one character as plain text and keep scanning.
    if (!pending) pendingStartPos = pos
    pending += src[0]
    src = src.substring(1)
    pos++
  }

  pushPending()

  return tokens
}

export const tokenizer = (src, highlights = [], hasBeginRules = true) => {
  const tokens = tokenizerFac(src, hasBeginRules ? beginRules : null, inlineRules, 0, true)

  // Attach highlight ranges to every token (and, for nestable types, every
  // child token) whose range intersects one of the requested highlights.
  const postTokenizer = tokens => {
    for (const token of tokens) {
      for (const light of highlights) {
        const highlight = union(token.range, light)
        if (highlight) {
          if (token.highlights && Array.isArray(token.highlights)) {
            token.highlights.push(highlight)
          } else {
            token.highlights = [highlight]
          }
        }
      }
      if (CAN_NEST_RULES.indexOf(token.type) > -1) {
        postTokenizer(token.children)
      }
    }
  }

  if (highlights.length) {
    postTokenizer(tokens)
  }

  return tokens
}

// Transform `tokens` back to text, ignoring each token's range.
// The opposite of `tokenizer`.
export const generator = tokens => {
  let result = ''
  for (const token of tokens) {
    result += token.raw
  }
  return result
}
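// Usage sketch (the import path and the exact token fields are assumptions;
// they depend on where this module lives and on the captures in './rules'):
//
//   import { tokenizer, generator } from './tokenizer'
//
//   const tokens = tokenizer('**bold** and `code`')
//   // -> roughly [ { type: 'strong', children: [...] },
//   //             { type: 'text', content: ' and ' },
//   //             { type: 'inline_code', content: 'code' } ]
//
//   generator(tokens) // -> '**bold** and `code`' (every token keeps its raw text)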