#
# textindex.py - Functions to count the number of words in text or html for
# search indexing
#

import re
import string
import sys

# Lowercased tokens that count_words_html() refuses to index when they
# appear between '<' and '>': tag names, attribute names and a few common
# attribute values.  Only key membership is ever tested; the value 1 is a
# placeholder.
htmltags = dict.fromkeys((
  # structural tags
  'html', 'div', 'center', 'table', 'tr', 'td', 'br', 'hr', 'font', 'img',
  # attribute names
  'align', 'valign', 'size', 'color', 'width', 'height', 'border',
  'colspan', 'href', 'src',
  # font-family values
  'arial', 'helvetica', 'sans-serif', 'verdana',
  # miscellaneous
  '&nbsp;', 'gifs', 'gif',
), 1)

# HTML character entities that replace_amp() substitutes, mapped to the
# character each one becomes.  The upper-case entities listed here map to the
# lower-case letter.
amp_escapes = {
  # acute accents
  "&aacute;": "á",
  "&eacute;": "é",
  "&Eacute;": "é",
  "&iacute;": "í",
  "&oacute;": "ó",
  "&uacute;": "ú",
  # grave accents
  "&agrave;": "à",
  "&egrave;": "è",
  "&Egrave;": "è",
  "&igrave;": "ì",
  "&ograve;": "ò",
  "&ugrave;": "ù",
  # circumflexes
  "&acirc;": "â",
  "&ecirc;": "ê",
  "&icirc;": "î",
  "&ocirc;": "ô",
  "&ucirc;": "û",
  # umlauts
  "&auml;": "ä",
  "&Auml;": "ä",
  "&ouml;": "ö",
  "&Ouml;": "ö",
  "&uuml;": "ü",
  "&Uuml;": "ü",
  # other letters
  "&ntilde;": "ñ",
  "&ccedil;": "ç",
  "&szlig;": "ß",
  # spacing and quotes ("&quote;" is accepted alongside "&quot;")
  "&nbsp;": " ",
  "&quot;": '"',
  "&quote;": '"',
  # "&amp;" is deliberately absent: replace_amp() handles it specifically,
  # after every entry in this table.
  # "&lt;" and "&gt;" are also absent; replace_amp() leaves them untouched
  # and does not warn about them.  The original note here read: "We don't
  # want to translate these, because we strip html".
}

# Translation table used by un8bit_word(); built on first use.
trans8bit = None

def un8bit_word(word):
  """Return *word* with accented letters folded to unaccented ASCII letters.

  Each of the characters "áàâäéèêëíìîïóòôöúùûüçñß" is replaced by the
  character at the same position in "aaaaeeeeiiiioooouuuucnb"; every other
  character is left alone.  The mapping is strictly one character to one
  character, which is why "ß" becomes "b" rather than "ss".

  NOTE(review): on Python 2 the table is byte-based, so both this source
  file and *word* presumably use a single-byte encoding such as Latin-1 --
  confirm against callers.
  """
  global trans8bit
  if not trans8bit:
    # string.maketrans() exists only on Python 2; Python 3 provides
    # str.maketrans() instead.  Use whichever this interpreter has.
    maketrans = getattr(str, "maketrans", None) or string.maketrans
    trans8bit = maketrans("áàâäéèêëíìîïóòôöúùûüçñß",
                          "aaaaeeeeiiiioooouuuucnb")
  return word.translate(trans8bit)

def replace_amp(line, ignore_gtlt=0):
  """Return *line* with the HTML entities in amp_escapes decoded.

  Every key of amp_escapes is replaced by its value, then "&amp;" is
  replaced by "&".  "&amp;" goes last so that the "&" it produces is not
  itself treated as the start of one of the table's entities.

  Afterwards every remaining "&...;" sequence in the result, other than
  "&gt;" and "&lt;", is reported on stderr (one line per occurrence).  Those
  sequences are left in the returned text unchanged.

  *ignore_gtlt* is accepted for compatibility with existing callers but is
  not used.
  """
  for escape, replacement in amp_escapes.items():
    line = line.replace(escape, replacement)

  line = line.replace("&amp;", "&")

  # findall() rather than search(): search() only ever saw the first
  # escape, so a leading "&gt;"/"&lt;" hid every unhandled escape after it.
  for amp_es in re.findall(r"&[^; ]+;", line):
    if amp_es not in ('&gt;', '&lt;'):
      sys.stderr.write("Unhandled & escape: %s\n" % amp_es)

  return line

# Translation table used by convert_words(); built on first use.
_ConvertTrans = None

def convert_words(line, html):
  """Return *line* normalised for word splitting.

  If *html* is true the text is first passed through replace_amp() to
  decode HTML entities.  Then each of the punctuation and control
  characters listed below is replaced by a single space and the ASCII
  letters A-Z are lower-cased.  The characters '<', '>', '-', '+' and '_'
  are not in that list and pass through unchanged.  The length of the text
  is not changed by this second step.
  """
  global _ConvertTrans

  if html:
    line = replace_amp(line)
  if not _ConvertTrans:
    blanked = ".{}$%&*,;!`^[]/:?@\"'#=()|~\\\t\n\r"
    # string.maketrans() exists only on Python 2; Python 3 provides
    # str.maketrans() instead.  Use whichever this interpreter has.
    maketrans = getattr(str, "maketrans", None) or string.maketrans
    # ascii_uppercase/ascii_lowercase are always the same 26 letters,
    # unlike the locale-dependent string.uppercase/string.lowercase.
    _ConvertTrans = maketrans(blanked + string.ascii_uppercase,
                              " " * len(blanked) + string.ascii_lowercase)
  return line.translate(_ConvertTrans)

# Translation table used by split_words(); built on first use.
_SplitTrans = None

def split_words(word):
  """Split a word containing '-', '+' or '_' into its distinct parts.

  Returns a list of the unique pieces obtained by treating those three
  characters as separators.  For each piece whose un8bit_word() form
  differs from the piece itself, that form is included as well.  A word
  containing none of the separators yields an empty list, as does a word
  made only of separators.  The order of the returned list is not
  significant.
  """
  global _SplitTrans
  if not _SplitTrans:
    separators = "-+_"
    # string.maketrans() exists only on Python 2; Python 3 provides
    # str.maketrans() instead.  Use whichever this interpreter has.
    maketrans = getattr(str, "maketrans", None) or string.maketrans
    _SplitTrans = maketrans(separators, " " * len(separators))
  nword = word.translate(_SplitTrans)
  if word == nword:
    # No separator present: nothing to split.
    return []
  pieces = {}
  for sword in nword.split():
    pieces[sword] = 1
    word7 = un8bit_word(sword)
    if word7 != sword:
      pieces[word7] = 1
  # Always hand back a real list (dict.keys() is a view on Python 3).
  return list(pieces)

def count_words(data):
  """Return a dictionary mapping each word in plain text *data* to a count.

  The text is normalised with convert_words() (no HTML entity decoding)
  and split on whitespace.  Words of one character are ignored.  For every
  remaining word the following keys are each incremented by one:

    * the word itself;
    * every entry returned by split_words() for it (its parts around
      '-', '+' and '_');
    * its un8bit_word() form, when that differs from the word.
  """
  counts = {}

  def bump(token):
    counts[token] = counts.get(token, 0) + 1

  for word in convert_words(data, 0).split():
    if len(word) <= 1:
      continue
    bump(word)

    for sword in split_words(word):
      bump(sword)

    word7 = un8bit_word(word)
    if word7 != word:
      bump(word7)
  return counts

def count_words_html(data, info):
  """Return a dictionary mapping each word in HTML text *data* to a count.

  The text is normalised with convert_words() (with HTML entity decoding),
  then '<' and '>' are isolated as separate tokens so that tag nesting can
  be tracked.  Words of one character are ignored.  A word that appears
  between '<' and '>' and is listed in htmltags is skipped; every other
  word is counted, and its un8bit_word() form is counted too when that
  differs from the word.

  *info* is only used to identify the document in the warning written to
  stderr when a '>' is seen with no '<' open.
  """
  line = convert_words(data, 1)
  line = line.replace('<', " < ").replace('>', " > ")

  in_html = 0
  counts = {}
  for word in line.split():
    if word == '<':
      # A '<' while already inside a tag just deepens the nesting; it is
      # not reported.
      in_html += 1
    elif word == '>':
      if in_html:
        in_html -= 1
      else:
        sys.stderr.write("Confused in %s, > received when we're not in html\n" % info)
    elif len(word) > 1:
      if in_html and word in htmltags:
        continue
      counts[word] = counts.get(word, 0) + 1
      word7 = un8bit_word(word)
      if word7 != word:
        counts[word7] = counts.get(word7, 0) + 1
  return counts
