#! /usr/bin/env python """ html2text.py convert an html doc to text """ # system libraries import os, sys, string, time, getopt import re WIDTH = 80 def tag_replace (data,center,indent, use_ansi = 0): data = re.sub ("\s+", " ", data) data = re.sub ("(?s)", "", data) data = string.replace (data, "\n", " ") output = [] # modified 6/17/99 splits on all cases of "img" tags # imgs = re.split ("(?s)()", data) imgs = re.split ("(?si)()", data) for img in imgs: if string.lower(img[:4]) == "", "\n", data) data = re.sub ("(?i)]*>", "\n" + "-"*50 + "\n", data) data = re.sub ("(?i)
  • ", "\n* ", data) if use_ansi: data = re.sub ("(?i)", "\n", data) else: data = re.sub ("(?i)", "\n", data) if use_ansi: data = re.sub ("(?i)", "\n", data) else: data = re.sub ("(?i)", "\n", data) data = re.sub ("(?i)
      ", "\n
        \n", data) data = re.sub ("(?i)
      ", "\n
    \n", data) data = re.sub ("(?i)
    ", "\n
    \n", data) data = re.sub ("(?i)
    ", "\n
    \n", data) if use_ansi: data = re.sub ("(?i)", "", data) data = re.sub ("(?i)", "", data) data = re.sub ("(?i)", "", data) data = re.sub ("(?i)", "", data) data = re.sub ("(?i)", "\n<CENTER>\n", data) data = re.sub ("(?i)", "\n\n", data) else: data = re.sub ("(?i)", "\n<CENTER>\n", data) data = re.sub ("(?i)", "\n\n", data) data = re.sub ("(?i)

    ", "\n", data) data = re.sub ("(?i)]*>", "\n", data) data = re.sub ("(?i)", "\n", data) data = re.sub ("(?i)]*>", "\t", data) data = re.sub ("(?i)]*>", "\t", data) data = re.sub (" *\n", "\n", data) lines = string.split (data, "\n") output = [] for line in lines: if line == "

      ": indent = indent + 1 elif line == "
    ": indent = indent - 1 if indent < 0: indent = 0 elif line == "
    ": center = center + 1 elif line == "
    ": center = center - 1 if center < 0: center = 0 else: if center: line = " "*indent + string.strip(line) nline = re.sub("\[.*?m", "", line) nline = re.sub ("<[^>]*>", "", nline) c = WIDTH/2 - (len (nline) / 2) output.append (" "*c + line) else: output.append (" "*indent + line) data = string.join (output, "\n") data = re.sub (" *\n", "\n", data) data = re.sub ("\n\n\n*", "\n\n", data) data = re.sub ("<[^>]*>", "", data) return (data, center, indent) def html2text (data, use_ansi = 0, is_latin1 = 0): pre = re.split("(?s)(
    [^<]*
    )", data) out = [] indent = 0 center = 0 for part in pre: if part[:5] != "
    ":
          (res, center, indent) = tag_replace (part,center,indent, use_ansi)
          out.append (res)
        else:
          part = re.sub("(?i)", "", part)
          out.append (part)
      data = string.join (out)
      data = re.sub (">", ">", data)
      data = re.sub ("<", "<", data)
      data = re.sub (" ", " ", data)
      if is_latin1:
        data = re.sub ("©", "©", data)
        data = re.sub ("é", "ι", data)
        data = re.sub ("è", "θ", data)
    
      return data
    
    
    def usage(progname):
      """Print a one-line usage summary followed by the module docstring.

      progname -- the name the script was invoked as (main passes argv[0]).
      """
      # main() treats the first non-option argument as the file to convert,
      # so the summary names it alongside the --help switch.
      # A parenthesised single-argument print emits the same text under the
      # Python 2 print statement and the Python 3 print function.
      print("usage: %s [--help] <htmlfile>" % progname)
      print(__doc__)
    
    def main(argc, argv):
      """Command-line entry point: convert one HTML file to text on stdout.

      argc -- number of entries in argv; kept for the existing call
              signature and not otherwise used.
      argv -- the argument vector; argv[0] is the program name and the
              first non-option argument is the path of the file to convert.

      Prints the usage text and returns when --help is given or when no
      input file is named. Errors from getopt.getopt() and open() are not
      caught here and propagate to the caller.
      """
      progname = argv[0]
      alist, args = getopt.getopt(argv[1:], "", ["help"])

      for (field, val) in alist:
        if field == "--help":
          usage(progname)
          return

      if not args:
        # Without a filename there is nothing to open; show the usage text
        # rather than failing later on an unbound name.
        usage(progname)
        return

      path = args[0]

      # The with-block closes the file even if read() raises.
      with open(path) as fp:
        data = fp.read()

      if data:
        print(html2text(data))
      else:
        print("Document contained no data")
    
    
    
    # Script entry point: when run directly (not imported), pass the
    # interpreter's argument count and argument vector to main().
    if __name__ == "__main__":
      arguments = sys.argv
      main(len(arguments), arguments)