#!/usr/bin/env python

# search_terms.py
#
# This script will process the log fed on stdin for query terms on the
# more popular Web search engines, and give statistics on a page-by-page
# basis.
#
# usage: cat [logfile] | ./search_terms.py > [outfile]

import re
import sys

# Strips search-engine query syntax from the raw terms: the operator
# characters * , " + and any "-word" exclusion up to the next space.
# NOTE(review): backslash is also in the class because the original
# Emacs-style `regex` pattern wrote its escapes inside the brackets, where
# that syntax appears to treat a backslash literally -- confirm.
replace_pat = re.compile(r'[\\*,"+]|-[^ ]*')

# Maps the last two labels of a referring host name to the name of the
# query-string parameter that carries the search terms for that engine.
engines = {
    'digital.com': 'q',
    'yahoo.com': 'p',
    'hotbot.com': 'MT',
    'excite.com': 'search',
    'infoseek.com': 'qt',
    'search.com': 'QUERY',
    'metacrawler.com': 'general',
    'metafind.com': 'q',
    'webcrawler.com': 'searchText',
    'lycos.com': 'query',
    'inference.com': 'query',
    'looksmart.com': 'key',
    'northernlight.com': 'qr',
}

# Set to True to index by the individual words in a search; leave False to
# index by the whole search phrase.
INDEX_BY_WORDS = False


def engine_key(ref_host):
    """Return the lower-cased last two dot-separated labels of ref_host."""
    return '.'.join(ref_host.split('.')[-2:]).lower()


def clean_terms(raw):
    """Return raw with query operators removed, lower-cased.

    Surrounding and doubled whitespace is left untouched.
    """
    return replace_pat.sub('', raw).lower()


def referrer_terms(ref_host, ref_query_dict, by_words=False):
    """Return the list of terms to count for one referring search.

    ref_host       -- host name of the referrer.
    ref_query_dict -- parsed referrer query string; the value under the
                      engine's parameter is expected to be a one-item
                      sequence holding the raw search string.
    by_words       -- if true, return each whitespace-separated word
                      instead of the whole phrase.

    Returns an empty list when the host is not a known engine, when the
    engine's parameter is absent, or when it does not hold exactly one
    value (previously the latter raised an uncaught ValueError and aborted
    the whole run).
    """
    host = engine_key(ref_host)
    if host not in engines:
        return []
    try:
        [raw] = ref_query_dict[engines[host]]
    except (KeyError, ValueError):
        return []
    phrase = clean_terms(raw)
    if by_words:
        return phrase.split()
    return [phrase]


def tally(searches, page, term):
    """Add one hit for term on page to the nested searches mapping."""
    counts = searches.setdefault(page, {})
    counts[term] = counts.get(term, 0) + 1


def report_lines(searches):
    """Return the report as a list of lines (without trailing newlines).

    Pages appear in sorted order, each introduced by a line consisting of a
    newline followed by the page. Its terms follow as "count term", most
    frequent first; ties are broken alphabetically so the output is
    deterministic.
    """
    lines = []
    for page in sorted(searches):
        lines.append("\n%s" % (page,))
        ranked = sorted(searches[page].items(),
                        key=lambda item: (-item[1], item[0]))
        for term, count in ranked:
            lines.append("%s %s" % (count, term))
    return lines


def main():
    """Tally search terms from the log on stdin and print the report."""
    # Imported here so the helpers above can be used without the weblog
    # package being importable.
    from weblog import combined, url, query

    o_log = combined.Parser(sys.stdin)
    p_log = url.Parser(o_log)
    q_log = query.Parser(p_log)

    searches = {}
    while q_log.getlogent():
        if not q_log.ref_query:
            continue
        terms = referrer_terms(p_log.ref_host, q_log.ref_query_dict,
                               INDEX_BY_WORDS)
        for term in terms:
            tally(searches, p_log.url, term)

    for line in report_lines(searches):
        # Single-argument form prints identically as a Python 2 statement
        # and as a Python 3 function call.
        print(line)


if __name__ == '__main__':
    main()