#!/usr/bin/python ################################################################################# ### Name: bro_daily_googs.py ### ### Description: ### This script takes bro http logs and produces a report of unique ### Google/Bing/Yahoo searches sorted by instance and SafeSearch status ### (SafeSearch status only works for Google). ### ### Usage: ./bro_daily_googs.py ### ### Author: Eric Ooi ################################################################################# import string import sys import re import urllib import gzip import os import time import commands ReportFilename = 'bro_googs_activity.rpt' DataRoot = '/nsm/bro/logs' def generate_data_path(DataRoot): OneDayAgo = time.time() - (60 * 60 * 24) year, month, day = time.localtime(OneDayAgo)[:3] if month < 10: month = '0' + str(month) if day < 10: day = '0' + str(day) DataPath = '%s/%s-%s-%s/' % (DataRoot, year, month, day) date = '%s/%s/%s' % (month, day, year) return(DataPath,date) DataPath,date = generate_data_path(DataRoot) ReportPath = DataPath ReportFile = ReportPath + ReportFilename ReportFH = open(ReportFile, 'wb') class Counter: def __init__(self): self.dict = {} def add(self, item): count = self.dict.get(item, 0) self.dict[item] = count + 1 def counts(self, desc=None): """Returns list of keys sorted by values. Pass desc a 1 if you want a descending sort.""" result = [(value, key) for key, value in self.dict.items()] result.sort() if desc: result.reverse() return result # pattern_terms_g1: Google search query syntax # pattern_terms_g2: Google search query syntax # pattern_terms_g3: Google search query syntax # pattern_terms_g4: Custom Google search query syntax - www.blackle.com # pattern_terms_b : Bing search query syntax # pattern_terms_y : Yahoo search query syntax # pattern_off : Google SafeSearch off pattern_terms_g1 = re.compile("^/{1}[^/]+[?&]{1}q=([^/]*?)&") pattern_terms_g2 = re.compile("^/{1}[^/]+cx=.*[?&]{1}q=([^/]*)") pattern_terms_g3 = re.compile("^/{1}[^/]+[?&]{1}q=([^/]*)&cx=") pattern_terms_b = re.compile("^/{1}[^/]+[?&]{1}q=([^/]*?)&") pattern_terms_y = re.compile("^/{1}[^/]+[?&]{1}p=([^/]*?)&") pattern_off = re.compile("&{1}safe=off&{1}") pattern_img_g1 = re.compile("^/imghover?") pattern_img_g2 = re.compile("^/imgres?") terms = Counter() try: # read http_full.log.gz created by bro_daily_full.py DataFH = gzip.open(DataPath + 'http_full.log.gz') ########## Collect search terms: parse http data for google/bing/yahoo regex ########## for line in DataFH: # initialize match to zero (null) match_terms = 0 match_off = 0 match_img = 0 segments = line.split('\t') # ignore comment lines and truncated lines # focus on www.google.com, www.bing.com, and search.yahoo.com hostnames # extract matching patterns from request-uri field if not segments or segments[0] == '#' or len(segments) < 12: continue # google searches including safesearch=off searches if segments[8] == "www.google.com" and segments[9][:5] != "/url?" and segments[9][:5] != "/gen_": match_terms = pattern_terms_g1.search(segments[9]) or pattern_terms_g2.search(segments[9]) or pattern_terms_g3.search(segments[9]) match_off = pattern_off.search(segments[9]) match_img = pattern_img_g1.search(segments[9]) or pattern_img_g2.search(segments[9]) # bing searches elif segments[8] == "www.bing.com": match_terms = pattern_terms_b.search(segments[9]) # yahoo searches elif segments[8] == "search.yahoo.com": match_terms = pattern_terms_y.search(segments[9]) # no search engine found else: continue # urllib.unquote_plus: convert url encoding to character equivalent # segments[0]: timestamp # segments[2]: requesting IP address # segments[8]: search engine if match_terms and match_off and match_img: term = urllib.unquote_plus(match_terms.group(1)).strip() terms.add(segments[2]+'^^'+segments[8].split('.')[1]+'^^'+'OFF'+'^^'+'IMG'+'^^'+segments[0][11:13]+'^^'+term) if match_terms and match_img: term = urllib.unquote_plus(match_terms.group(1)).strip() terms.add(segments[2]+'^^'+segments[8].split('.')[1]+'^^'+'----'+'^^'+'IMG'+'^^'+segments[0][11:13]+'^^'+term) if match_terms and match_off: term = urllib.unquote_plus(match_terms.group(1)).strip() terms.add(segments[2]+'^^'+segments[8].split('.')[1]+'^^'+'OFF'+'^^'+'TXT'+'^^'+segments[0][11:13]+'^^'+term) elif match_terms: term = urllib.unquote_plus(match_terms.group(1)).strip() terms.add(segments[2]+'^^'+segments[8].split('.')[1]+'^^'+'----'+'^^'+'TXT'+'^^'+segments[0][11:13]+'^^'+term) else: continue DataFH.close() ########## Format search terms: compile safesearch=off searches and search statistics ########## safeoff = [] all = [] # if no searches if not terms.counts(): print "No searches found\n" sys.exit(1) # stats_counter: [Google, Google SafeSearch=OFF, Bing, Yahoo, Total] stats_counter = [0.0, 0.0, 0.0, 0.0, 0.0] # format data and create statistics for term in terms.counts(1): # split out search terms hash - temp: [# hits, requesting IP, search engine, Google SafeSearch status, type, hour, search term] temp = [term[0]] + term[1].split('^^') # format hour temp[5] = temp[5].strip(': ') # httpry sometimes does not log timestamp properly (for example 10:00 may be logged as 0:00) # this will strip out spaces and colons if int(temp[5]) > 12: temp[5] = int(temp[5]) - 12 if temp[5] < 10: temp[5] = ' '+str(temp[5])+'PM' else: temp[5] = str(temp[5])+'PM' elif temp[5] == '12': temp[5] = temp[5]+'PM' elif temp[5] == '00': temp[5] = '12AM' elif int(temp[5]) < 10: temp[5] = ' '+str(int(temp[5]))+'AM' else: temp[5] = temp[5]+'AM' # collect safesearch off data if temp[3] == "OFF": safeoff += [temp] stats_counter[1] += 1 # build full search terms list all += [temp] stats_counter[4] += 1 if temp[2] == "google": stats_counter[0] += 1 elif temp[2] == "bing": stats_counter[2] += 1 else: stats_counter[3] += 1 ########## Display results: statistics, safesearch=off searches, all searches ########## # display statistics at the top ReportFH.write("Searches made "+date+"\n\n") ReportFH.write("%s%s%s\n\n" % ('*'*37, " Statistics ",'*'*37)) ReportFH.write("%-14s%-19s%s\n" % ("Engine", "# Unique Searches", "% Unique Searches")) ReportFH.write("%-14s%17d%18.2f%%\n" % ("Google (OFF)", stats_counter[1], (stats_counter[1]/stats_counter[4])*100)) ReportFH.write("%-14s%17d%18.2f%%\n" % ("Google", stats_counter[0], (stats_counter[0]/stats_counter[4])*100)) ReportFH.write("%-14s%17d%18.2f%%\n" % ("Bing", stats_counter[2], (stats_counter[2]/stats_counter[4])*100)) ReportFH.write("%-14s%17d%18.2f%%\n" % ("Yahoo", stats_counter[3], (stats_counter[3]/stats_counter[4])*100)) ReportFH.write("%-14s%17d%21s\n" % ("Total", stats_counter[4], "100.00%\n\n")) # display "SafeSearch=OFF" second ReportFH.write("%s%s%s\n\n" % ('*'*35, " SafeSearch=OFF ",'*'*35)) if safeoff: # display "SafeSearch=OFF" searches at the top ReportFH.write("%-8s%-17s%-8s%-6s%-6s%-6s%s\n" % ("# Hits", "Computer", "Engine", "Safe", "Type", "Hour", "Search Term")) for term in safeoff: ReportFH.write("%-8d%-17s%-8s%-6s%-6s%-6s%s\n" % (term[0], term[1], term[2], term[3], term[4], term[5], term[6])) else: ReportFH.write("\nNo results\n") # display all searches ReportFH.write("\n\n%s%s%s\n\n%-8s%-17s%-8s%-6s%-6s%-6s%s\n" % ('*'*36, " All Searches ",'*'*36, "# Hits", "Computer", "Engine", "Safe", "Type", "Hour", "Search Term")) for term in all: ReportFH.write("%-8d%-17s%-8s%-6s%-6s%-6s%s\n" % (term[0], term[1], term[2], term[3], term[4], term[5], term[6])) ReportFH.close() except ValueError: print "line:",line print "temp:",temp print "temp[5]:",temp[5] sys.exit(1) except IOError, e: ReportFH.write(e) sys.exit(1) except KeyboardInterrupt: print 'Ctrl-C Caught... Exiting' sys.exit(0)