/*
 * to-markdown - an HTML to Markdown converter
 *
 * Copyright 2011, Dom Christie
 * Licenced under the MIT licence
 *
 */

9 | var toMarkdown = function(string) { |
---|
10 | |
---|
11 | var ELEMENTS = [ |
---|
12 | { |
---|
13 | patterns: 'p', |
---|
14 | replacement: function(str, attrs, innerHTML) { |
---|
15 | return innerHTML ? '\n\n' + innerHTML + '\n' : ''; |
---|
16 | } |
---|
17 | }, |
---|
18 | { |
---|
19 | patterns: 'br', |
---|
20 | type: 'void', |
---|
21 | replacement: '\n' |
---|
22 | }, |
---|
23 | { |
---|
24 | patterns: 'h([1-6])', |
---|
25 | replacement: function(str, hLevel, attrs, innerHTML) { |
---|
26 | var hPrefix = ''; |
---|
27 | for(var i = 0; i < hLevel; i++) { |
---|
28 | hPrefix += '#'; |
---|
29 | } |
---|
30 | return '\n\n' + hPrefix + ' ' + innerHTML + '\n'; |
---|
31 | } |
---|
32 | }, |
---|
33 | { |
---|
34 | patterns: 'hr', |
---|
35 | type: 'void', |
---|
36 | replacement: '\n\n* * *\n' |
---|
37 | }, |
---|
38 | { |
---|
39 | patterns: 'a', |
---|
40 | replacement: function(str, attrs, innerHTML) { |
---|
41 | var href = attrs.match(attrRegExp('href')), |
---|
42 | title = attrs.match(attrRegExp('title')); |
---|
43 | return href ? '[' + innerHTML + ']' + '(' + href[1] + (title && title[1] ? ' "' + title[1] + '"' : '') + ')' : str; |
---|
44 | } |
---|
45 | }, |
---|
46 | { |
---|
47 | patterns: ['b', 'strong'], |
---|
48 | replacement: function(str, attrs, innerHTML) { |
---|
49 | return innerHTML ? '**' + innerHTML + '**' : ''; |
---|
50 | } |
---|
51 | }, |
---|
52 | { |
---|
53 | patterns: ['i', 'em'], |
---|
54 | replacement: function(str, attrs, innerHTML) { |
---|
55 | return innerHTML ? '_' + innerHTML + '_' : ''; |
---|
56 | } |
---|
57 | }, |
---|
58 | { |
---|
59 | patterns: 'code', |
---|
60 | replacement: function(str, attrs, innerHTML) { |
---|
61 | return innerHTML ? '`' + innerHTML + '`' : ''; |
---|
62 | } |
---|
63 | }, |
---|
64 | { |
---|
65 | patterns: 'img', |
---|
66 | type: 'void', |
---|
67 | replacement: function(str, attrs, innerHTML) { |
---|
68 | var src = attrs.match(attrRegExp('src')), |
---|
69 | alt = attrs.match(attrRegExp('alt')), |
---|
70 | title = attrs.match(attrRegExp('title')); |
---|
71 | return '![' + (alt && alt[1] ? alt[1] : '') + ']' + '(' + src[1] + (title && title[1] ? ' "' + title[1] + '"' : '') + ')'; |
---|
72 | } |
---|
73 | } |
---|
74 | ]; |
---|
75 | |
---|
76 | for(var i = 0, len = ELEMENTS.length; i < len; i++) { |
---|
77 | if(typeof ELEMENTS[i].patterns === 'string') { |
---|
78 | string = replaceEls(string, { tag: ELEMENTS[i].patterns, replacement: ELEMENTS[i].replacement, type: ELEMENTS[i].type }); |
---|
79 | } |
---|
80 | else { |
---|
81 | for(var j = 0, pLen = ELEMENTS[i].patterns.length; j < pLen; j++) { |
---|
82 | string = replaceEls(string, { tag: ELEMENTS[i].patterns[j], replacement: ELEMENTS[i].replacement, type: ELEMENTS[i].type }); |
---|
83 | } |
---|
84 | } |
---|
85 | } |
---|
86 | |
---|
87 | function replaceEls(html, elProperties) { |
---|
88 | var pattern = elProperties.type === 'void' ? '<' + elProperties.tag + '\\b([^>]*)\\/?>' : '<' + elProperties.tag + '\\b([^>]*)>([\\s\\S]*?)<\\/' + elProperties.tag + '>', |
---|
89 | regex = new RegExp(pattern, 'gi'), |
---|
90 | markdown = ''; |
---|
91 | if(typeof elProperties.replacement === 'string') { |
---|
92 | markdown = html.replace(regex, elProperties.replacement); |
---|
93 | } |
---|
94 | else { |
---|
95 | markdown = html.replace(regex, function(str, p1, p2, p3) { |
---|
96 | return elProperties.replacement.call(this, str, p1, p2, p3); |
---|
97 | }); |
---|
98 | } |
---|
99 | return markdown; |
---|
100 | } |
---|
101 | |
---|
102 | function attrRegExp(attr) { |
---|
103 | return new RegExp(attr + '\\s*=\\s*["\']?([^"\']*)["\']?', 'i'); |
---|
104 | } |
---|
105 | |
---|
106 | // Pre code blocks |
---|
107 | |
---|
108 | string = string.replace(/<pre\b[^>]*>`([\s\S]*)`<\/pre>/gi, function(str, innerHTML) { |
---|
109 | innerHTML = innerHTML.replace(/^\t+/g, ' '); // convert tabs to spaces (you know it makes sense) |
---|
110 | innerHTML = innerHTML.replace(/\n/g, '\n '); |
---|
111 | return '\n\n ' + innerHTML + '\n'; |
---|
112 | }); |
---|
113 | |
---|
114 | // Lists |
---|
115 | |
---|
116 | // Escape numbers that could trigger an ol |
---|
117 | // If there are more than three spaces before the code, it would be in a pre tag |
---|
118 | // Make sure we are escaping the period not matching any character |
---|
119 | string = string.replace(/^(\s{0,3}\d+)\. /g, '$1\\. '); |
---|
120 | |
---|
121 | // Converts lists that have no child lists (of same type) first, then works it's way up |
---|
122 | var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi; |
---|
123 | while(string.match(noChildrenRegex)) { |
---|
124 | string = string.replace(noChildrenRegex, function(str) { |
---|
125 | return replaceLists(str); |
---|
126 | }); |
---|
127 | } |
---|
128 | |
---|
129 | function replaceLists(html) { |
---|
130 | |
---|
131 | html = html.replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/gi, function(str, listType, innerHTML) { |
---|
132 | var lis = innerHTML.split('</li>'); |
---|
133 | lis.splice(lis.length - 1, 1); |
---|
134 | |
---|
135 | for(i = 0, len = lis.length; i < len; i++) { |
---|
136 | if(lis[i]) { |
---|
137 | var prefix = (listType === 'ol') ? (i + 1) + ". " : "* "; |
---|
138 | lis[i] = lis[i].replace(/\s*<li[^>]*>([\s\S]*)/i, function(str, innerHTML) { |
---|
139 | |
---|
140 | innerHTML = innerHTML.replace(/^\s+/, ''); |
---|
141 | innerHTML = innerHTML.replace(/\n\n/g, '\n\n '); |
---|
142 | // indent nested lists |
---|
143 | innerHTML = innerHTML.replace(/\n([ ]*)+(\*|\d+\.) /g, '\n$1 $2 '); |
---|
144 | return prefix + innerHTML; |
---|
145 | }); |
---|
146 | } |
---|
147 | } |
---|
148 | return lis.join('\n'); |
---|
149 | }); |
---|
150 | return '\n\n' + html.replace(/[ \t]+\n|\s+$/g, ''); |
---|
151 | } |
---|
152 | |
---|
153 | // Blockquotes |
---|
154 | var deepest = /<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)<\/blockquote>/gi; |
---|
155 | while(string.match(deepest)) { |
---|
156 | string = string.replace(deepest, function(str) { |
---|
157 | return replaceBlockquotes(str); |
---|
158 | }); |
---|
159 | } |
---|
160 | |
---|
161 | function replaceBlockquotes(html) { |
---|
162 | html = html.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, function(str, inner) { |
---|
163 | inner = inner.replace(/^\s+|\s+$/g, ''); |
---|
164 | inner = cleanUp(inner); |
---|
165 | inner = inner.replace(/^/gm, '> '); |
---|
166 | inner = inner.replace(/^(>([ \t]{2,}>)+)/gm, '> >'); |
---|
167 | return inner; |
---|
168 | }); |
---|
169 | return html; |
---|
170 | } |
---|
171 | |
---|
172 | function cleanUp(string) { |
---|
173 | string = string.replace(/^[\t\r\n]+|[\t\r\n]+$/g, ''); // trim leading/trailing whitespace |
---|
174 | string = string.replace(/\n\s+\n/g, '\n\n'); |
---|
175 | string = string.replace(/\n{3,}/g, '\n\n'); // limit consecutive linebreaks to 2 |
---|
176 | return string; |
---|
177 | } |
---|
178 | |
---|
179 | return cleanUp(string); |
---|
180 | }; |
---|
181 | |
---|
// Publish the converter as `exports.toMarkdown` when an `exports` object is
// in scope; otherwise this statement does nothing.
if (typeof exports === 'object') {
  exports.toMarkdown = toMarkdown;
}