#!/usr/bin/python
##############################################################################################################################
###   Name: Yelper (yelper.py)
###
###   Version 1.0, 07/11/2014
###
###   Description:
###       Yelper saves all your Yelp reviews and photos.
###
###   See help for usage: yelper.py -h
###
###   System Requirements:
###       - Python v2.7.x: http://www.python.org/
###       - mechanize 0.2.5: http://wwwsearch.sourceforge.net/mechanize/
###       - BeautifulSoup 4: http://www.crummy.com/software/BeautifulSoup/
###
###   Author: Eric Ooi - www.ericooi.com
###
##############################################################################################################################

from argparse import ArgumentParser
from argparse import RawTextHelpFormatter
from datetime import datetime as dt
import time
import mechanize
import cookielib
import string
from sys import stdout
from bs4 import BeautifulSoup
import sys
import os
import re

## Globals

# general
NAME = 'Yelper'
VERSION = 'Version 1.0 (07/11/2014)'
AUTHOR = 'By Eric Ooi - http://www.ericooi.com'
COPYRIGHT = '(C) Copyright 2014 Eric Ooi'
TSFORMAT = '%m-%d-%Y %H-%M-%S'
# computed once at import time so every output file of a run shares one stamp
TIMESTAMP = '[' + dt.strftime(dt.now(), TSFORMAT) + ']'

# Yelp data
YELP_DATA_DIR = "yelp_data"
YELP_URL = "http://www.yelp.com"
YELP_REVIEWS_URL = "http://www.yelp.com/user_details_reviews_self?userid="
YELP_PICS_URL = "http://www.yelp.com/user_local_photos?userid="
YELP_ID = "_ImRF2A0Nh6x4G82ChY1Gw"

# define filenames using timestamp and format
REVIEWS_FILENAME = 'yelp_reviews_' + TIMESTAMP + '.txt'
PICS_DIRNAME = 'yelp_pics'
PICS_FILENAME = 'yelp_pics_' + TIMESTAMP + '.txt'

# patterns shared by the parsing helpers (compiled once, not per review)
# NOTE(review): the tag literal of the original replace() calls was lost in the
# copy of the file this was rebuilt from; it is assumed to have been a <br>
# line break. Matching <br>, <br/> and <br /> covers both the page markup and
# the self-closing form BeautifulSoup emits when serializing -- confirm.
_LINE_BREAK_TAG = re.compile(r'<br\s*/?>', re.IGNORECASE)
_STAR_RATING_ALT = re.compile("star rating")
_START_OFFSET = re.compile(r'start=(\d+)')


# Yelp object
class Yelp(object):
    """Plain record holding everything saved for one Yelp review.

    Attributes mirror the constructor arguments:
        biz_name -- business name
        biz_addr -- business address (may span several lines)
        rating   -- star rating of the review
        date     -- review date
        rotd     -- "Review of the Day" text, or None
        review   -- review body, or None
        prevs    -- list of (date, rating, content) tuples for older reviews
        ufc      -- (useful, funny, cool) vote counts
    """

    def __init__(self, biz_name="Business", biz_addr="123 Fake Street", rating=1,
                 date="01/01/2001", rotd=None, review=None, prevs=None,
                 ufc=("0", "0", "0")):
        self.biz_name = biz_name
        self.biz_addr = biz_addr
        self.rating = rating
        self.date = date
        self.rotd = rotd
        self.review = review
        # copy into a fresh list: a shared default list would leak previous
        # reviews between instances, and the argument used to be ignored
        self.prevs = list(prevs) if prevs is not None else []
        self.ufc = ufc


## instantiate browser
def browser():
    """Create and return a configured mechanize browser for Yelper."""
    br = mechanize.Browser()

    # browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # user-agent
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36')]

    return br


## browse and save all Yelp pages
## saveReviews() and savePics() call this to generate a list of BeautifulSoup objects to parse through
def savePages(mode):
    """Download every page of the user's reviews or pictures.

    mode -- "reviews" or "pics".

    Returns a list of BeautifulSoup objects, one per downloaded page.
    Raises ValueError for any other mode.
    """
    # determine mode and set variables
    if mode == "reviews":
        page_url = YELP_REVIEWS_URL + YELP_ID
        page_type = "Yelp reviews - Page: "
    elif mode == "pics":
        page_url = YELP_PICS_URL + YELP_ID
        page_type = "Yelp pics - Page: "
    else:
        # previously fell through and failed later with an unrelated NameError
        raise ValueError('Unknown mode %r: expected "reviews" or "pics"' % (mode,))

    pages = []
    br = browser()

    # "start=" offset of the page currently being read (pics mode only)
    page_count = 0

    # open first page
    br.open(page_url)
    counter = 1

    # iterate through all pages until no next link is found
    while True:
        stdout.write('\r\t' + page_type + '%s' % counter)
        stdout.flush()

        # append each page as a BeautifulSoup object
        page = BeautifulSoup(br.response().read())
        pages.append(page)

        # find the link to the next page depending on the mode
        if mode == "reviews":
            next_page = page.find('a', attrs={'id': 'pager_page_next'})
        else:
            # yelp uses the same class for both previous and next links;
            # the true next link is always the latter or [-1] link
            pager_links = page.find_all('a', attrs={'class': 'page-option prev-next'})
            next_page = pager_links[-1] if pager_links else None

            if next_page is not None:
                # On the last page the only link left is "previous", whose
                # offset is lower than the current one. Stopping on <= (not
                # just <) also guards against a link that points back at the
                # current page, which would otherwise loop forever.
                offset = _START_OFFSET.search(next_page['href'].encode('utf8'))
                if offset is None or int(offset.group(1)) <= page_count:
                    next_page = None
                else:
                    page_count = int(offset.group(1))

        # no next link: this was the last page
        if next_page is None:
            break

        page_url = YELP_URL + next_page['href'].encode('utf8').strip()
        br.open(page_url)
        counter = counter + 1

    stdout.write('\n')

    return pages


def _tag_to_text(tag, strip_owner_links=False):
    """Return the UTF-8 text of a tag with line-break tags turned into newlines."""
    soup = BeautifulSoup(_LINE_BREAK_TAG.sub("\n", tag.encode('utf8')))

    if strip_owner_links:
        # remove "Show owner comment" links
        for link in soup.find_all('a', attrs={'class': 'comment-read-more'}):
            link.decompose()

    return soup.get_text().strip().encode('utf8')


def _star_rating(node):
    """Return the first character of the star-rating alt text, or None if absent."""
    image = node.find(alt=_STAR_RATING_ALT)
    if image is None:
        return None
    return image['alt'][0].encode('utf8')


def _vote_count(biz, rel, label):
    """Return the vote count shown on the useful/funny/cool link, "0" if none."""
    link = biz.find('a', attrs={'rel': rel})
    if link is None:
        return "0"

    text = link.text.encode('utf8').strip()
    # a link showing only its label carries no count
    if text == label:
        return "0"
    return text.split('\n')[-1]


def _parse_review(biz, biz_name):
    """Build a Yelp object from one review block of a reviews page."""
    y = Yelp()
    y.biz_name = biz_name

    # save business address
    y.biz_addr = _tag_to_text(biz.find('address', attrs={'class': 'smaller'}))

    # save business rating
    y.rating = _star_rating(biz)
    if y.rating is None:
        raise LookupError('no star rating found for review of %s' % biz_name)

    # save review date: the replace() statements are used to clean up dates for "Updated Reviews"
    # encoded like every other field so that writing to the binary file
    # cannot fail on a non-ASCII character
    review_date = biz.find('span', attrs={'class': 'smaller date'})
    y.date = review_date.text.strip().replace('\n', '').replace(' ', '').encode('utf8')

    # check for and save Review of the Day (ROTD) status
    rotd = biz.find('a', attrs={'title': 'Review of the Day'})
    if rotd:
        y.rotd = ': '.join(rotd.text.strip().encode('utf8').split(' '))

    # save review content
    y.review = _tag_to_text(biz.find('div', attrs={'class': 'review_comment'}),
                            strip_owner_links=True)

    # save useful, funny, and cool vote counts
    y.ufc = (_vote_count(biz, 'useful', "Useful"),
             _vote_count(biz, 'funny', "Funny"),
             _vote_count(biz, 'cool', "Cool"))

    # check for and save previous reviews
    prevs_content = biz.find('div', attrs={'class': 'archived_reviews'})
    if prevs_content:
        for prev in prevs_content.find_all('li', attrs={'class': 'old-review'}):
            prev_date = prev.em.text.strip().encode('utf8')

            # Read the rating from the old review itself; it used to be read
            # from the enclosing block, which always gave the current rating.
            # NOTE(review): falls back to the current rating when the old
            # review has no rating image of its own -- confirm against markup.
            prev_rating = _star_rating(prev) or y.rating

            prev_text = _tag_to_text(prev.find('p', attrs={'class': 'review_comment'}),
                                     strip_owner_links=True)

            # append previous review as a tuple: (date, rating, content)
            y.prevs.append((prev_date, prev_rating, prev_text))

    return y


def _write_review(fh, y):
    """Write one Yelp object to the open reviews file."""
    fh.write(y.biz_name + '\n\n')
    fh.write(y.biz_addr + '\n\n')
    fh.write('Rating: ' + y.rating + '\n\n')
    fh.write(y.date + '\n\n')
    if y.rotd:
        fh.write(y.rotd + '\n\n')
    fh.write("-----\n\n" + y.review + '\n\n')
    fh.write("-----\n\n" + "Useful: %s\t Funny: %s\t Cool: %s\t" % y.ufc + '\n\n')

    # if there are previous reviews, write those out and separate
    for prev in y.prevs:
        fh.write('----------\n\n')
        fh.write(prev[0] + '\n\n')
        fh.write('Rating: ' + prev[1] + '\n\n')
        fh.write(prev[2] + '\n\n')

    fh.write('************************************************************************\n\n')


## save Yelp reviews
def saveReviews(yelp_pages):
    """Parse every review on the given pages and write them to the reviews file.

    yelp_pages -- list of BeautifulSoup pages as returned by savePages("reviews").

    The output file is YELP_DATA_DIR/REVIEWS_FILENAME; the directory is
    created if it doesn't exist.
    """
    reviews_file = YELP_DATA_DIR + '/' + REVIEWS_FILENAME
    if not os.path.exists(YELP_DATA_DIR):
        os.makedirs(YELP_DATA_DIR)

    counter = 1

    # "with" closes the file even when a page fails to parse
    with open(reviews_file, 'wb') as reviews_fh:
        for page in yelp_pages:
            # iterate through each business "block" (div with class "review clearfix")
            for biz in page.find_all('div', attrs={'class': 'review clearfix'}):
                biz_name = biz.find('h4').get_text(strip=True).encode('utf8')

                # display saving status
                stdout.write('\r\t Saving Review %s: %s %s' % (counter, biz_name, ' ' * 30))
                stdout.flush()

                _write_review(reviews_fh, _parse_review(biz, biz_name))
                counter = counter + 1

    stdout.write('\n')

    return


## save Yelp pictures
def savePics(yelp_pages):
    """Download every picture on the given pages, grouped by business name.

    yelp_pages -- list of BeautifulSoup pages as returned by savePages("pics").

    Pictures go to YELP_DATA_DIR/PICS_DIRNAME/<business name>/ and an index
    of all pictures is written to PICS_FILENAME in the pictures directory.
    """
    pics_dir = YELP_DATA_DIR + '/' + PICS_DIRNAME
    pics_file = pics_dir + '/' + PICS_FILENAME
    if not os.path.exists(pics_dir):
        os.makedirs(pics_dir)

    counter = 1
    br = browser()

    # "with" closes the index even when a download fails
    with open(pics_file, 'wb') as pics_fh:
        for page in yelp_pages:
            # iterate through each "photo block" and save the picture
            for pic in page.find_all(class_="photo"):
                # Yelp uses different filenames to differentiate picture sizes
                #   ss.jpg = small size
                #   ms.jpg = medium size
                #   l.jpg  = large
                # each page of pictures includes a grid of thumbnails in "ms.jpg" format
                # need to change "ms.jpg" to "l.jpg" to capture the large size format
                large_pic = pic.img['src'][:-6] + 'l.jpg'

                # download the picture
                large_pic_file = br.open(large_pic).read()

                # create directory based on business name as necessary
                biz_name = pic.find('a', attrs={'class': 'biz-name'}).text.encode('utf8')
                biz_dir = os.path.join(pics_dir, biz_name)
                if not os.path.exists(biz_dir):
                    os.makedirs(biz_dir)

                # define picture filename based on picture number and caption
                pic_name = pic.img['alt'].strip().encode('utf8')

                # display download status
                stdout.write('\r\t Downloading Pic %s: %s %s' % (counter, biz_name, ' ' * 30))
                stdout.flush()

                # save file to disk (replace all forward slashes to colons since slashes
                # are interpreted as a deeper-level directory and will throw an error)
                final_path = os.path.join(biz_dir, str(counter) + ' - ' + pic_name.replace("/", ":") + '.jpg')
                with open(final_path, 'wb') as save:
                    save.write(large_pic_file)

                # create index of all pictures (pic number: business name - picture caption)
                pics_fh.write(str(counter) + ': ' + biz_name + ' - ' + pic_name + '\n')

                counter = counter + 1

    return


## determine which Yelp data to save
def saveData(yelp_data):
    """Save the data selected on the command line.

    yelp_data -- parsed arguments with boolean attributes r (reviews),
                 p (pictures) and b (both).
    """
    stdout.write("\nSaving: \n")

    if yelp_data.r or yelp_data.b:
        saveReviews(savePages("reviews"))

    if yelp_data.p or yelp_data.b:
        savePics(savePages("pics"))

    return


## Execute Yelper
def main():
    """Parse the command line, save the requested data and report elapsed time."""
    # start program timer
    start = dt.now()

    # parse through command line arguments
    parser = ArgumentParser(description=NAME+'\n'+VERSION+'\n'+AUTHOR+'\n'+COPYRIGHT, formatter_class=RawTextHelpFormatter)
    parser.add_argument('-r', action='store_true', help='Save reviews')
    parser.add_argument('-p', action='store_true', help='Save pictures')
    parser.add_argument('-b', action='store_true', help='Save both reviews and pictures')

    # if no arguments provided, print help and/or examples
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    yelp_data = parser.parse_args()

    # at least one mode must be selected
    if not (yelp_data.r or yelp_data.p or yelp_data.b):
        parser.error('Invalid mode selected.  Please choose -r (reviews), -p (pictures), or -b (both)')

    # save data
    saveData(yelp_data)

    # end program timer
    elapsed = dt.now() - start
    stdout.write("\n\nTime Elapsed: %s\n" % elapsed)


if __name__ == '__main__':
    try:
        main()
    except (IOError, KeyError, AttributeError, ValueError, NameError, TypeError, LookupError) as message:
        error = '%s -- Got Exception: %s\n' % (dt.strftime(dt.now(), TSFORMAT), message)
        stdout.write(error + '\n')