# # textindex.py - Functions to count the number of words in text or html for # search indexing # import string, re htmltags = {} htmltags['div'] = 1 htmltags['align'] = 1 htmltags['font'] = 1 htmltags['arial'] = 1 htmltags['helvetica'] = 1 htmltags['sans-serif'] = 1 htmltags['size'] = 1 htmltags['color'] = 1 htmltags['tr'] = 1 htmltags['td'] = 1 htmltags['table'] = 1 htmltags['width'] = 1 htmltags['border'] = 1 htmltags['height'] = 1 htmltags['colspan'] = 1 htmltags['verdana'] = 1 htmltags['valign'] = 1 htmltags['br'] = 1 htmltags['hr'] = 1 htmltags[' '] = 1 htmltags['href'] = 1 htmltags['img'] = 1 htmltags['src'] = 1 htmltags['gifs'] = 1 htmltags['gif'] = 1 htmltags['html'] = 1 htmltags['center'] = 1 amp_escapes = {"á": "á", "é": "é", "É": "é", "í": "í", "ó": "ó", "ú": "ú", # "&": "&", ## Handled specifically last " ": " ", "à": "à", "è": "è", "È": "è", "ì": "ì", "ò": "ò", "ù": "ù", "â": "â", "ê": "ê", "î": "î", "ô": "ô", "û": "û", "ñ": "ñ", "ä": "ä", "Ä": "ä", "ö": "ö", "Ö": "ö", "ü": "ü", "Ü": "ü", """: '"', ""e;": '"', "ç": 'ç', "ß": 'ß', # We don't want to translate these, because we strip html } trans8bit = None def un8bit_word (word): global trans8bit if not trans8bit: trans8bit = string.maketrans("áàâäéèêëíìîïóòôöúùûüçñß", "aaaaeeeeiiiioooouuuucnb") return string.translate (word, trans8bit) def replace_amp (line, ignore_gtlt = 0): for escape in amp_escapes.keys(): line = string.replace (line, escape, amp_escapes[escape]) line = string.replace (line, "&", "&") m = re.search ("(&[^; ]+;)", line) if m: for amp_es in m.groups(): if amp_es not in ['>', '<']: sys.stderr.write("Unhandled & escape: %s\n" % amp_es) return line _ConvertTrans = None def convert_words (line, html): global _ConvertTrans if html: line = replace_amp (line) if not _ConvertTrans: s = ".{}$%&*,;!`^[]/:?@\"'#=()|~\\\t\n\r" _ConvertTrans = string.maketrans(s+string.uppercase, " " * len(s) + string.lowercase) line = string.translate (line, _ConvertTrans) return line _SplitTrans = None def split_words (word): "split a word containing +-_" global _SplitTrans if not _SplitTrans: s = "-+_" _SplitTrans = string.maketrans(s, " " * len(s)) nword = string.translate(word, _SplitTrans) if word == nword: return [] subwords = string.split (nword) d = {} for sword in subwords: d[sword] = 1 word7 = un8bit_word (sword) if word7 != sword: d[word7] = 1 return d.keys() def count_words (data): """ This function generates a dictionary of words for the given text """ rindex = {} line = convert_words (data, 0) words = string.split (line) d = {} for word in words: if len (word) > 1: try: d[word] = d[word] + 1 except KeyError: d[word] = 1 for sword in split_words (word): try: d[sword] = d[sword] + 1 except KeyError: d[sword] = 1 word7 = un8bit_word (word) if word7 != word: try: d[word7] = d[word7] + 1 except KeyError: d[word7] = 1 return d def count_words_html (data, info): """ This function generates a dictionary of words for the given html text """ rindex = {} line = convert_words (data, 1) line = string.replace (line, '<', " < ") line = string.replace (line, '>', " > ") in_html = 0 words = string.split (line) d = {} for word in words: if word == '<': #if in_html: # log ("Confused in %s, < received when we're already in html" % info) #else: in_html = in_html + 1 elif word == '>': if in_html: in_html = in_html - 1 else: sys.stderr.write("Confused in %s, > received when we're not in html\n" % info) elif len (word) > 1: if not in_html or not htmltags.has_key (word): try: d[word] = d[word] + 1 except KeyError: d[word] = 1 word7 = un8bit_word (word) if word7 != word: try: d[word7] = d[word7] + 1 except KeyError: d[word7] = 1 return d