2 """html2text: Turn HTML into equivalent Markdown-structured text."""
4 __author__ = "Aaron Swartz (me@aaronsw.com)"
5 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
9 # Support decoded entities with unifiable.
14 setattr(__builtins__, 'True', 1)
15 setattr(__builtins__, 'False', 0)
18 if hasattr(x, 'has_key'): return x.has_key(y)
25 except ImportError: #Python3
26 import html.entities as htmlentitydefs
27 import urllib.parse as urlparse
28 import html.parser as HTMLParser
30 import urllib.request as urllib
33 import re, sys, codecs, types
35 try: from textwrap import wrap
38 # Use Unicode characters instead of their ASCII pseudo-replacements
41 # Put the links after each paragraph instead of at the end.
42 LINKS_EACH_PARAGRAPH = 0
44 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
47 # Don't show internal links (href="#local-anchor") -- corresponding link targets
48 # won't be visible in the plain text file anyway.
49 SKIP_INTERNAL_LINKS = False
51 ### Entity Nonsense ###
54 if k == 'apos': return ord("'")
55 if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
56 return htmlentitydefs.name2codepoint[k]
58 k = htmlentitydefs.entitydefs[k]
59 if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
60 return ord(codecs.latin_1_decode(k)[0])
# Entity names mapped to plain-text stand-ins. The entity helpers consult
# this table only when UNICODE_SNOB is false.
unifiable = {
    # quotation marks
    'rsquo': "'",
    'lsquo': "'",
    'rdquo': '"',
    'ldquo': '"',
    # symbols, spacing and arrows
    'copy': '(C)',
    'mdash': '--',
    'nbsp': ' ',
    'rarr': '->',
    'larr': '<-',
    'middot': '*',
    'ndash': '-',
    # ligatures
    'oelig': 'oe',
    'aelig': 'ae',
    # accented "a"
    'agrave': 'a',
    'aacute': 'a',
    'acirc': 'a',
    'atilde': 'a',
    'auml': 'a',
    'aring': 'a',
    # accented "e"
    'egrave': 'e',
    'eacute': 'e',
    'ecirc': 'e',
    'euml': 'e',
    # accented "i"
    'igrave': 'i',
    'iacute': 'i',
    'icirc': 'i',
    'iuml': 'i',
    # accented "o"
    'ograve': 'o',
    'oacute': 'o',
    'ocirc': 'o',
    'otilde': 'o',
    'ouml': 'o',
    # accented "u"
    'ugrave': 'u',
    'uacute': 'u',
    'ucirc': 'u',
    'uuml': 'u',
}
# Re-key the replacement table by numeric code point (via name2cp) so that
# numeric character references can be looked up as well.
for k, replacement in unifiable.items():
    unifiable_n[name2cp(k)] = replacement
77 if name[0] in ['x','X']:
82 if not UNICODE_SNOB and c in unifiable_n.keys():
87 except NameError: #Python3
91 if not UNICODE_SNOB and c in unifiable.keys():
95 except KeyError: return "&" + c + ';'
98 return unichr(name2cp(c))
99 except NameError: #Python3
100 return chr(name2cp(c))
102 def replaceEntities(s):
105 return charref(s[1:])
106 else: return entityref(s)
# Matches one entity or character reference: "&", an optional "#", an
# optional "x"/"X", then either hex digits or 1-8 word characters, and a
# closing ";". Group 1 holds everything between the "&" and the ";".
r_unescape = re.compile(
    r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));"
)
110 return r_unescape.sub(replaceEntities, s)
112 ### End Entity Nonsense ###
115 """Return true if the line does only consist of whitespace characters."""
117 if c is not ' ' and c is not ' ':
122 """Wrap all paragraphs in the provided text."""
126 assert wrap, "Requires Python 2.3."
129 for para in text.split("\n"):
131 if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
132 for line in wrap(para, BODY_WIDTH):
133 result += line + "\n"
137 if not onlywhite(para):
138 result += para + "\n"
147 if tag[0] == 'h' and len(tag) == 2:
150 if n in range(1, 10): return n
151 except ValueError: return 0
153 class _html2text(HTMLParser.HTMLParser):
154 def __init__(self, out=None, baseurl=''):
155 HTMLParser.HTMLParser.__init__(self)
157 if out is None: self.out = self.outtextf
160 self.outtext = unicode()
161 except NameError: # Python3
176 self.abbr_title = None # current abbreviation definition
177 self.abbr_data = None # last inner HTML (for abbr being defined)
178 self.abbr_list = {} # stack of abbreviations to write later
179 self.baseurl = baseurl
181 def outtextf(self, s):
185 HTMLParser.HTMLParser.close(self)
192 def handle_charref(self, c):
195 def handle_entityref(self, c):
def handle_starttag(self, tag, attrs):
    """Parser callback for an opening tag.

    Forwards the tag name and its attributes to ``handle_tag`` with the
    ``start`` flag set to 1.
    """
    opening = 1
    self.handle_tag(tag, attrs, opening)
def handle_endtag(self, tag):
    """Parser callback for a closing tag.

    Forwards the tag name to ``handle_tag`` with no attributes (``None``)
    and the ``start`` flag set to 0.
    """
    closing = 0
    self.handle_tag(tag, None, closing)
204 def previousIndex(self, attrs):
205 """ returns the index of certain set of attributes (of a link) in the
208 If the set of attributes is not found, returns None
210 if not has_key(attrs, 'href'): return None
217 if has_key(a, 'href') and a['href'] == attrs['href']:
218 if has_key(a, 'title') or has_key(attrs, 'title'):
219 if (has_key(a, 'title') and has_key(attrs, 'title') and
220 a['title'] == attrs['title']):
227 def handle_tag(self, tag, attrs, start):
228 #attrs = fixattrs(attrs)
232 if start: self.o(hn(tag)*"#" + ' ')
234 if tag in ['p', 'div']: self.p()
236 if tag == "br" and start: self.o(" \n")
238 if tag == "hr" and start:
243 if tag in ["head", "style", 'script']:
244 if start: self.quiet += 1
245 else: self.quiet -= 1
248 self.quiet = 0 # sites like 9rules.com never close <head>
250 if tag == "blockquote":
252 self.p(); self.o('> ', 0, 1); self.start = 1
258 if tag in ['em', 'i', 'u']: self.o("_")
259 if tag in ['strong', 'b']: self.o("**")
260 if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
264 for (x, y) in attrs: attrsD[x] = y
267 self.abbr_title = None
269 if has_key(attrs, 'title'):
270 self.abbr_title = attrs['title']
272 if self.abbr_title != None:
273 self.abbr_list[self.abbr_data] = self.abbr_title
274 self.abbr_title = None
280 for (x, y) in attrs: attrsD[x] = y
282 if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
283 self.astack.append(attrs)
286 self.astack.append(None)
289 a = self.astack.pop()
291 i = self.previousIndex(a)
296 a['count'] = self.acount
297 a['outcount'] = self.outcount
299 self.o("][" + str(a['count']) + "]")
301 if tag == "img" and start:
303 for (x, y) in attrs: attrsD[x] = y
305 if has_key(attrs, 'src'):
306 attrs['href'] = attrs['src']
307 alt = attrs.get('alt', '')
308 alt = re.sub('\n', ' ', alt)
309 i = self.previousIndex(attrs)
314 attrs['count'] = self.acount
315 attrs['outcount'] = self.outcount
319 self.o("]["+ str(attrs['count']) +"]")
321 if tag == 'dl' and start: self.p()
322 if tag == 'dt' and not start: self.pbr()
323 if tag == 'dd' and start: self.o(' ')
324 if tag == 'dd' and not start: self.pbr()
326 if tag in ["ol", "ul"]:
328 self.list.append({'name':tag, 'num':0})
330 if self.list: self.list.pop()
337 if self.list: li = self.list[-1]
338 else: li = {'name':'ul', 'num':0}
339 self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
340 if li['name'] == "ul": self.o("* ")
341 elif li['name'] == "ol":
343 self.o(str(li['num'])+". ")
348 if tag in ["table", "tr"] and start: self.p()
349 if tag == 'td': self.pbr()
360 if self.p_p == 0: self.p_p = 1
def p(self):
    """Request a paragraph break before the next output.

    Sets ``p_p`` to 2; ``o`` multiplies its newline prefix by ``p_p`` when
    it next writes text.
    """
    self.p_p = 2
364 def o(self, data, puredata=0, force=0):
365 if self.abbr_data is not None: self.abbr_data += data
368 if puredata and not self.pre:
369 data = re.sub('\s+', ' ', data)
370 if data and data[0] == ' ':
373 if not data and not force: return
376 #self.out(" :") #TODO: not output when already one there
379 bq = (">" * self.blockquote)
380 if not (force and data and data[0] == ">") and self.blockquote: bq += " "
384 data = data.replace("\n", "\n"+bq)
399 self.out(('\n'+bq)*self.p_p)
403 if not self.lastWasNL: self.out(' ')
406 if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
407 if force == "end": self.out("\n")
411 if self.outcount > link['outcount']:
412 self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
413 if has_key(link, 'title'): self.out(" ("+link['title']+")")
418 if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
422 if self.abbr_list and force == "end":
423 for abbr, definition in self.abbr_list.items():
424 self.out(" *[" + abbr + "]: " + definition + "\n")
428 self.lastWasNL = data and data[-1] == '\n'
431 def handle_data(self, data):
432 if r'\/script>' in data: self.quiet -= 1
def unknown_decl(self, data):
    """Ignore an unrecognised declaration; deliberately a no-op."""
    return None
def wrapwrite(text):
    """Write ``text`` to standard output exactly as given (no newline added)."""
    stream = sys.stdout
    stream.write(text)
439 def html2text_file(html, out=wrapwrite, baseurl=''):
440 h = _html2text(out, baseurl)
def html2text(html, baseurl=''):
    """Convert ``html`` to Markdown-structured text and return it.

    Calls ``html2text_file`` with ``out=None`` and passes its result
    through ``optwrap``. ``baseurl`` is forwarded to ``html2text_file``
    unchanged.
    """
    converted = html2text_file(html, None, baseurl)
    return optwrap(converted)
448 if __name__ == "__main__":
452 if arg.startswith('http://') or arg.startswith('https://'):
454 j = urllib.urlopen(baseurl)
456 from feedparser import _getCharacterEncoding as enc
458 enc = lambda x, y: ('utf-8', 1)
460 encoding = enc(j.headers, text)[0]
461 if encoding == 'us-ascii': encoding = 'utf-8'
462 data = text.decode(encoding)
466 if len(sys.argv) > 2:
467 encoding = sys.argv[2]
469 data = open(arg, 'r', encoding=encoding).read()
471 data = open(arg, 'r').read().decode(encoding)
473 data = sys.stdin.read()
474 wrapwrite(html2text(data, baseurl))