/*
 * to-markdown - an HTML to Markdown converter
 *
 * Copyright 2011, Dom Christie
 * Licenced under the MIT licence
 *
 */
---|
| 8 | |
---|
/**
 * Converts a string of HTML into Markdown using regular-expression rewrites.
 *
 * Handles paragraphs, line breaks, headings, horizontal rules, links,
 * bold/italic, inline code, images, `<pre>` code blocks, (nested) lists and
 * (nested) blockquotes. Markup that is not recognised is left in the output
 * unchanged.
 *
 * @param {string} string - HTML source. `null`/`undefined` yield an empty
 *   string; any other non-string value is coerced with `String()`.
 * @returns {string} The Markdown text, with leading/trailing line breaks
 *   removed and runs of blank lines collapsed.
 */
var toMarkdown = function(string) {

  // Nothing to convert; also avoids calling string methods on null/undefined.
  if (string === null || string === undefined) {
    return '';
  }
  string = String(string);

  // Markdown only treats a line as part of a code block when it is indented
  // by (at least) four spaces.
  var CODE_INDENT = '    ';

  /**
   * Reads one attribute value out of the raw attribute text of a tag.
   *
   * The name must be preceded by whitespace (or the start of the text) so
   * that e.g. `href` is not found inside `data-href`. Double-quoted,
   * single-quoted and unquoted values are supported; a quoted value may
   * contain the other quote character.
   *
   * @param {string} attrs - Everything between the tag name and `>`.
   * @param {string} name - Attribute name (matched case-insensitively).
   * @returns {?string} The value, or null when the attribute is absent.
   */
  function getAttr(attrs, name) {
    var match = attrs.match(new RegExp(
      '(?:^|\\s)' + name + '\\s*=\\s*(?:"([^"]*)"|\'([^\']*)\'|([^\\s"\'>]+))',
      'i'
    ));
    if (!match) {
      return null;
    }
    if (match[1] !== undefined) {
      return match[1];
    }
    return match[2] !== undefined ? match[2] : match[3];
  }

  /**
   * Builds the optional ` "title"` part of a Markdown link or image.
   *
   * @param {string} attrs - Raw attribute text of the tag.
   * @returns {string} ` "title"` when a non-empty title exists, else ''.
   */
  function titleSuffix(attrs) {
    var title = getAttr(attrs, 'title');
    return title ? ' "' + title + '"' : '';
  }

  // Simple element rewrites, applied in this order. `patterns` is a tag name
  // (or list of names) that is spliced into a regular expression;
  // `type: 'void'` marks elements without a closing tag; `replacement` is a
  // replacement string or a String#replace callback.
  // `code` must be converted before the <pre> step further down, which looks
  // for the backticks produced here.
  var ELEMENTS = [
    {
      patterns: 'p',
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '\n\n' + innerHTML + '\n' : '';
      }
    },
    {
      patterns: 'br',
      type: 'void',
      replacement: '\n'
    },
    {
      // The capture group in the tag name is passed on as `hLevel`.
      patterns: 'h([1-6])',
      replacement: function(str, hLevel, attrs, innerHTML) {
        var hPrefix = new Array(Number(hLevel) + 1).join('#');
        return '\n\n' + hPrefix + ' ' + innerHTML + '\n';
      }
    },
    {
      patterns: 'hr',
      type: 'void',
      replacement: '\n\n* * *\n'
    },
    {
      patterns: 'a',
      replacement: function(str, attrs, innerHTML) {
        var href = getAttr(attrs, 'href');
        // Anchors without an href have no Markdown form; keep the HTML.
        if (href === null) {
          return str;
        }
        return '[' + innerHTML + '](' + href + titleSuffix(attrs) + ')';
      }
    },
    {
      patterns: ['b', 'strong'],
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '**' + innerHTML + '**' : '';
      }
    },
    {
      patterns: ['i', 'em'],
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '_' + innerHTML + '_' : '';
      }
    },
    {
      patterns: 'code',
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '`' + innerHTML + '`' : '';
      }
    },
    {
      patterns: 'img',
      type: 'void',
      replacement: function(str, attrs) {
        var src = getAttr(attrs, 'src');
        var alt = getAttr(attrs, 'alt');
        // Same policy as anchors without href: an image without a source
        // cannot be expressed in Markdown, so the tag is kept as-is
        // (previously this case threw a TypeError).
        if (src === null) {
          return str;
        }
        return '![' + (alt || '') + '](' + src + titleSuffix(attrs) + ')';
      }
    }
  ];

  /**
   * Replaces every occurrence of one element in `html`.
   *
   * @param {string} html - Text to rewrite.
   * @param {{tag: string, replacement: (string|Function), type: (string|undefined)}} elProperties
   *   Tag pattern, replacement and optional 'void' marker.
   * @returns {string} The rewritten text.
   */
  function replaceEls(html, elProperties) {
    var tag = elProperties.tag;
    var replacement = elProperties.replacement;
    var pattern = elProperties.type === 'void' ?
      '<' + tag + '\\b([^>]*)\\/?>' :
      '<' + tag + '\\b([^>]*)>([\\s\\S]*?)<\\/' + tag + '>';
    var regex = new RegExp(pattern, 'gi');

    if (typeof replacement === 'string') {
      return html.replace(regex, replacement);
    }
    return html.replace(regex, function(str, p1, p2, p3) {
      return replacement(str, p1, p2, p3);
    });
  }

  /**
   * Converts the <ul>/<ol> elements in `html` (which must not contain
   * unconverted child lists) into Markdown list items.
   *
   * @param {string} html - One or more flat list elements.
   * @returns {string} The Markdown list, preceded by a blank line.
   */
  function replaceLists(html) {
    html = html.replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/gi, function(str, listType, innerHTML) {
      // The list regexes are case-insensitive, so the tag comparison and the
      // item split have to be as well.
      var isOrdered = listType.toLowerCase() === 'ol';
      var lis = innerHTML.split(/<\/li>/i);
      // Whatever follows the last </li> is not a list item.
      lis.splice(lis.length - 1, 1);

      for (var i = 0, len = lis.length; i < len; i++) {
        if (lis[i]) {
          lis[i] = convertListItem(lis[i], isOrdered ? (i + 1) + '. ' : '* ');
        }
      }
      return lis.join('\n');
    });
    return '\n\n' + html.replace(/[ \t]+\n|\s+$/g, '');
  }

  /**
   * Converts a single `<li>…` fragment (closing tag already removed).
   *
   * @param {string} li - The fragment, starting at or before its <li> tag.
   * @param {string} prefix - List marker to put in front of the item.
   * @returns {string} The Markdown list item.
   */
  function convertListItem(li, prefix) {
    return li.replace(/\s*<li\b[^>]*>([\s\S]*)/i, function(str, innerHTML) {
      innerHTML = innerHTML.replace(/^\s+/, '');
      innerHTML = innerHTML.replace(/\n\n/g, '\n\n ');
      // Indent nested lists. A single quantifier is used for the leading
      // spaces; nesting quantifiers here can backtrack catastrophically.
      innerHTML = innerHTML.replace(/\n( *)(\*|\d+\.) /g, '\n$1 $2 ');
      return prefix + innerHTML;
    });
  }

  /**
   * Trims surrounding line breaks/tabs and collapses blank-line runs.
   *
   * @param {string} text - Markdown text to tidy.
   * @returns {string} The tidied text.
   */
  function cleanUp(text) {
    text = text.replace(/^[\t\r\n]+|[\t\r\n]+$/g, ''); // trim leading/trailing line breaks and tabs
    text = text.replace(/\n\s+\n/g, '\n\n');
    text = text.replace(/\n{3,}/g, '\n\n'); // limit consecutive linebreaks to 2
    return text;
  }

  /**
   * Converts the <blockquote> elements in `html` (which must not contain
   * unconverted child blockquotes) by prefixing every line with `> `.
   *
   * @param {string} html - One or more flat blockquote elements.
   * @returns {string} The quoted Markdown.
   */
  function replaceBlockquotes(html) {
    return html.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, function(str, inner) {
      inner = inner.replace(/^\s+|\s+$/g, '');
      inner = cleanUp(inner);
      inner = inner.replace(/^/gm, '> ');
      inner = inner.replace(/^(>([ \t]{2,}>)+)/gm, '> >');
      return inner;
    });
  }

  // Simple elements
  for (var i = 0, len = ELEMENTS.length; i < len; i++) {
    // `patterns` may be a single tag name or a list of them.
    var patterns = [].concat(ELEMENTS[i].patterns);
    for (var j = 0, pLen = patterns.length; j < pLen; j++) {
      string = replaceEls(string, {
        tag: patterns[j],
        replacement: ELEMENTS[i].replacement,
        type: ELEMENTS[i].type
      });
    }
  }

  // Pre code blocks. By now <code> has become backticks. The content is
  // matched lazily so that each <pre> is converted on its own instead of
  // everything between the first and the last one being swallowed.
  string = string.replace(/<pre\b[^>]*>`([\s\S]*?)`<\/pre>/gi, function(str, innerHTML) {
    // convert leading tabs to spaces on every line (you know it makes sense)
    innerHTML = innerHTML.replace(/^\t+/gm, function(tabs) {
      return tabs.replace(/\t/g, CODE_INDENT);
    });
    innerHTML = innerHTML.replace(/\n/g, '\n' + CODE_INDENT);
    return '\n\n' + CODE_INDENT + innerHTML + '\n';
  });

  // Lists

  // Escape numbers that could trigger an ol.
  // Lines indented by more than three spaces are code, so they are skipped.
  // The `m` flag makes this apply to every line, not just the first one, and
  // the period is escaped so it is matched literally.
  string = string.replace(/^( {0,3}\d+)\. /gm, '$1\\. ');

  // Converts lists that have no child lists first, then works its way up
  var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi;
  while (string.match(noChildrenRegex)) {
    string = string.replace(noChildrenRegex, replaceLists);
  }

  // Blockquotes, innermost first
  var deepest = /<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)<\/blockquote>/gi;
  while (string.match(deepest)) {
    string = string.replace(deepest, replaceBlockquotes);
  }

  return cleanUp(string);
};
---|
| 181 | |
---|
// CommonJS interop: when an `exports` object is available (e.g. under Node),
// publish the converter on it. In a plain browser script `exports` does not
// exist and `toMarkdown` stays available as a global instead.
if (typeof exports === 'object') {
  exports['toMarkdown'] = toMarkdown;
}
---|