#!/usr/bin/python
##############################################################################################################################
### Name: Yelper (yelper.py)
###
### Version 1.0, 07/11/2014
###
### Description:
### Yelper saves all your Yelp reviews and photos.
###
### See help for usage: yelper.py -h
###
### System Requirements:
### - Python v2.7.x: http://www.python.org/
### - mechanize 0.2.5: http://wwwsearch.sourceforge.net/mechanize/
### - BeautifulSoup 4: http://www.crummy.com/software/BeautifulSoup/
###
### Author: Eric Ooi - www.ericooi.com
###
##############################################################################################################################
from argparse import ArgumentParser
from argparse import RawTextHelpFormatter
from datetime import datetime as dt
import time
import mechanize
import cookielib
import string
from sys import stdout
from bs4 import BeautifulSoup
import sys
import os
import re
## Globals
# general
NAME = 'Yelper'
VERSION = 'Version 1.0 (07/11/2014)'
AUTHOR = 'By Eric Ooi - http://www.ericooi.com'
COPYRIGHT = '(C) Copyright 2014 Eric Ooi'
# timestamp format is colon-free so the resulting filenames are valid on all platforms
TSFORMAT = '%m-%d-%Y %H-%M-%S'
# captured once at import time, so every file written by one run shares the same stamp
TIMESTAMP = '[' + dt.strftime(dt.now(), TSFORMAT) + ']'
# Yelp data
YELP_DATA_DIR = "yelp_data"
YELP_URL = "http://www.yelp.com"
# user-specific listing pages; YELP_ID is appended to build the full URL
YELP_REVIEWS_URL = "http://www.yelp.com/user_details_reviews_self?userid="
YELP_PICS_URL = "http://www.yelp.com/user_local_photos?userid="
# the Yelp user whose reviews/photos are saved -- hard-coded; edit for another account
YELP_ID = "_ImRF2A0Nh6x4G82ChY1Gw"
# define filenames using timestamp and format
REVIEWS_FILENAME = 'yelp_reviews_' + TIMESTAMP + '.txt'
PICS_DIRNAME = 'yelp_pics'
PICS_FILENAME = 'yelp_pics_' + TIMESTAMP + '.txt'
# Yelp object
# Yelp object
class Yelp:
    """Container for one Yelp review: business info, rating, text,
    Review-of-the-Day status, previous (archived) reviews, and
    useful/funny/cool vote counts."""

    def __init__(self, biz_name="Business", biz_addr="123 Fake Street", rating=1, date="01/01/2001", rotd=None, review=None, prevs=None, ufc=("0", "0", "0")):
        """Initialize a review record.

        prevs is a list of (date, rating, text) tuples for archived
        reviews.  The default is None rather than a mutable [] so
        instances never share one list; the original also silently
        discarded any prevs argument -- it is now honored.
        """
        self.biz_name = biz_name
        self.biz_addr = biz_addr
        self.rating = rating
        self.date = date
        self.rotd = rotd          # e.g. "ROTD: 01/01/2001" when review of the day
        self.review = review      # full review text
        # fresh list per instance; copy the caller's list so later caller-side
        # mutation cannot alias into this object
        self.prevs = [] if prevs is None else list(prevs)
        self.ufc = ufc            # (useful, funny, cool) vote counts as strings
## instantiate browser
## instantiate browser
def browser():
    """Return a mechanize.Browser configured for scraping Yelp:
    robots.txt ignored, redirects/referers handled, refresh capped,
    and a desktop-Chrome User-Agent set."""
    yelper = mechanize.Browser()
    # enable the standard navigation handlers...
    for feature in ("equiv", "redirect", "referer"):
        getattr(yelper, "set_handle_" + feature)(True)
    # ...but do not honor robots.txt, or Yelp would refuse us
    yelper.set_handle_robots(False)
    # follow refresh 0 immediately; never hang waiting on refresh > 0
    yelper.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # masquerade as a desktop Chrome browser
    yelper.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36')]
    return yelper
## browse and save all Yelp pages
## saveReviews() and savePics() call this to generate a list of BeautifulSoup objects to parse through
## browse and save all Yelp pages
## saveReviews() and savePics() call this to generate a list of BeautifulSoup objects to parse through
def savePages(mode):
    """Fetch every page of the user's reviews or photos.

    mode: "reviews" or "pics".
    Returns a list of BeautifulSoup objects, one per fetched page.
    Raises ValueError for any other mode (previously an unrecognized
    mode fell through to a NameError on the undefined page_url).
    """
    pages = []
    # Open a browser
    br = browser()
    # determine mode and set variables
    if mode == "reviews":
        page_url = YELP_REVIEWS_URL + YELP_ID
        page_type = "Yelp reviews - Page: "
    elif mode == "pics":
        page_url = YELP_PICS_URL + YELP_ID
        page_type = "Yelp pics - Page: "
    else:
        raise ValueError("mode must be 'reviews' or 'pics', got %r" % (mode,))
    page_count = 0
    # open first page
    br.open(page_url)
    counter = 1
    # iterate through all user review pages
    while page_url:
        stdout.write('\r\t' + page_type + '%s' % counter)
        stdout.flush()
        # append each page as a BeautifulSoup object
        page = BeautifulSoup(br.response().read())
        pages.append(page)
        # find the link to the next page depending on the mode
        if mode == "reviews":
            next_page = page.find('a', attrs={'id': 'pager_page_next'})
        elif mode == "pics":
            # yelp uses the same class for both previous and next links;
            # the true next link is always the latter or [-1] link
            next_page = page.find_all('a', attrs={'class': 'page-option prev-next'})[-1]
            # the last pics page still has a "next"-styled link that points
            # backwards: compare the start= offset in the link with the running
            # page count and stop when it decreases, preventing an infinite loop
            next_page_count = int(next_page['href'].encode('utf8').split('start=')[1].strip())
            if next_page_count < page_count:
                next_page = False
            else:
                page_count = next_page_count
        # open the next page if a link exists, otherwise end the loop
        if next_page:
            page_url = YELP_URL + next_page['href'].encode('utf8').strip()
            br.open(page_url)
            counter = counter + 1
        else:
            page_url = False
    print
    return pages
## save Yelp reviews
## save Yelp reviews
def saveReviews(yelp_pages):
    """Parse the review pages and write every review to a timestamped
    text file under YELP_DATA_DIR.

    yelp_pages: list of BeautifulSoup objects from savePages("reviews").
    Side effects: creates YELP_DATA_DIR if missing and writes
    REVIEWS_FILENAME inside it.
    """
    # set reviews text file path and create the yelp_data directory if it doesn't exist
    REVIEWS_FILE = os.path.join(YELP_DATA_DIR, REVIEWS_FILENAME)
    if not os.path.exists(YELP_DATA_DIR):
        os.makedirs(YELP_DATA_DIR)
    counter = 1
    # list of Yelp objects, kept for symmetry with the on-disk output
    yelp_data = []
    # with-statement guarantees the file is closed even if parsing raises
    with open(REVIEWS_FILE, 'wb') as reviews_fh:
        # iterate through yelp pages
        for page in yelp_pages:
            # parse out business data
            biz_data = page.find_all('div', attrs={'class': 'review clearfix'})
            # iterate through each business "block" (one div per review)
            for biz in biz_data:
                # initialize Yelp object to save data
                y = Yelp()
                # save business name
                biz_url = biz.find('h4')
                y.biz_name = biz_url.get_text(strip=True).encode('utf8')
                # display saving status
                stdout.write('\r\t Saving Review %s: %s %s' % (counter, y.biz_name, ' ' * 30))
                stdout.flush()
                # save business address: convert <br> separators to real
                # newlines before flattening to text (the previous
                # newline-for-newline replace was a no-op)
                biz_addr = biz.find('address', attrs={'class': 'smaller'})
                biz_addr_format = BeautifulSoup(biz_addr.encode('utf8').replace("<br>", "\n"))
                y.biz_addr = biz_addr_format.get_text().strip().encode('utf8')
                # save business rating: the star image's alt text begins with the digit
                rating = biz.find(alt=re.compile("star rating"))
                y.rating = rating['alt'][0].encode('utf8')
                # save review date: the replace() calls clean up dates for "Updated Reviews"
                review_date = biz.find('span', attrs={'class': 'smaller date'})
                y.date = review_date.text.strip().replace('\n', '').replace(' ', '')
                # check for and save Review of the Day (ROTD) status
                rotd = biz.find('a', attrs={'title': 'Review of the Day'})
                if rotd:
                    y.rotd = ': '.join(rotd.text.strip().encode('utf8').split(' '))
                # save review content, again mapping <br> to newlines
                review_content = biz.find('div', attrs={'class': 'review_comment'})
                review_format = BeautifulSoup(review_content.encode('utf8').replace("<br>", "\n"))
                # remove "Show owner comment" links
                for tag in review_format.find_all('a', attrs={'class': 'comment-read-more'}):
                    tag.decompose()
                y.review = review_format.get_text().strip().encode('utf8')
                # save useful, funny, and cool vote counts; the anchor text is
                # just the bare label (e.g. "Useful") when the count is zero
                usefuls, funnys, cools = "0", "0", "0"
                u = biz.find('a', attrs={'rel': 'useful'}).text.encode('utf8').strip()
                if u != "Useful":
                    usefuls = u.split('\n')[-1]
                f = biz.find('a', attrs={'rel': 'funny'}).text.encode('utf8').strip()
                if f != "Funny":
                    funnys = f.split('\n')[-1]
                c = biz.find('a', attrs={'rel': 'cool'}).text.encode('utf8').strip()
                if c != "Cool":
                    cools = c.split('\n')[-1]
                y.ufc = (usefuls, funnys, cools)
                # check for and save previous (archived) reviews
                prevs_content = biz.find('div', attrs={'class': 'archived_reviews'})
                if prevs_content:
                    # iterate through each "old-review"
                    for prev in prevs_content.find_all('li', attrs={'class': 'old-review'}):
                        # save previous review date
                        prev_date = prev.em.text.strip().encode('utf8')
                        # BUGFIX: search the archived review itself (prev), not
                        # the whole business block (biz), so each old review
                        # keeps its own star rating instead of the current one
                        prev_rating = prev.find(alt=re.compile("star rating"))
                        prev_rating_clean = prev_rating['alt'][0].encode('utf8')
                        # previous review content
                        prev_content = prev.find('p', attrs={'class': 'review_comment'})
                        prev_format = BeautifulSoup(prev_content.encode('utf8').replace("<br>", "\n"))
                        # remove "Show owner comment" links
                        for tag in prev_format.find_all('a', attrs={'class': 'comment-read-more'}):
                            tag.decompose()
                        prev_clean = prev_format.get_text().strip().encode('utf8')
                        # append previous review as a tuple: (date, rating, content)
                        y.prevs.append((prev_date, prev_rating_clean, prev_clean))
                # append Yelp object to list of review objects
                yelp_data.append(y)
                counter = counter + 1
                # write Yelp object to disk
                reviews_fh.write(y.biz_name + '\n\n')
                reviews_fh.write(y.biz_addr + '\n\n')
                reviews_fh.write('Rating: ' + y.rating + '\n\n')
                reviews_fh.write(y.date + '\n\n')
                if y.rotd:
                    reviews_fh.write(y.rotd + '\n\n')
                reviews_fh.write("-----\n\n" + y.review + '\n\n')
                reviews_fh.write("-----\n\n" + "Useful: %s\t Funny: %s\t Cool: %s\t" % y.ufc + '\n\n')
                # if there are previous reviews, write those out and separate
                for prev in y.prevs:
                    reviews_fh.write('----------\n\n')
                    reviews_fh.write(prev[0] + '\n\n')
                    reviews_fh.write('Rating: ' + prev[1] + '\n\n')
                    reviews_fh.write(prev[2] + '\n\n')
                reviews_fh.write('************************************************************************\n\n')
    print
    return
## save Yelp pictures
## save Yelp pictures
def savePics(yelp_pages):
    """Download every user photo at large size and write an index file.

    yelp_pages: list of BeautifulSoup objects from savePages("pics").
    Side effects: creates yelp_data/yelp_pics (and one subdirectory per
    business), saves each photo as a .jpg, and writes an index file
    mapping "pic number: business name - caption".
    """
    # set pictures directory and file paths; create yelp_data/yelp_pics if missing
    PICS_DIR = os.path.join(YELP_DATA_DIR, PICS_DIRNAME)
    PICS_FILE = os.path.join(PICS_DIR, PICS_FILENAME)
    if not os.path.exists(PICS_DIR):
        os.makedirs(PICS_DIR)
    counter = 1
    br = browser()
    # with-statement closes the index file even if a download raises
    with open(PICS_FILE, 'wb') as pics_fh:
        # iterate through yelp pages
        for page in yelp_pages:
            # iterate through each "photo block", grouped by business name
            for pic in page.find_all(class_="photo"):
                # Yelp encodes picture size in the filename suffix:
                #   ss.jpg = small, ms.jpg = medium, l.jpg = large
                # the thumbnail grid uses ms.jpg; swap the suffix for l.jpg
                # to fetch the large version
                large_pic = pic.img['src'][:-6] + 'l.jpg'
                # download the picture bytes
                large_pic_file = br.open(large_pic).read()
                # create a directory per business as necessary
                biz_name = pic.find('a', attrs={'class': 'biz-name'}).text.encode('utf8')
                biz_dir = os.path.join(PICS_DIR, biz_name)
                if not os.path.exists(biz_dir):
                    os.makedirs(biz_dir)
                # picture filename is built from the picture number and caption
                pic_name = pic.img['alt'].strip().encode('utf8')
                # display download status
                stdout.write('\r\t Downloading Pic %s: %s %s' % (counter, biz_name, ' ' * 30))
                stdout.flush()
                # forward slashes in captions would read as directory
                # separators, so map them to colons before joining the path
                final_path = os.path.join(biz_dir, str(counter) + ' - ' + pic_name.replace("/", ":") + '.jpg')
                with open(final_path, 'wb') as save:
                    save.write(large_pic_file)
                # index line: pic number: business name - picture caption
                pics_fh.write(str(counter) + ': ' + biz_name + ' - ' + pic_name + '\n')
                counter = counter + 1
    return
## determine which Yelp data to save
def saveData(yelp_data):
"""Save Yelp data."""
# save requested data
print "\nSaving: "
if yelp_data.r or yelp_data.b:
pages = savePages("reviews")
saveReviews(pages)
if yelp_data.p or yelp_data.b:
pages = savePages("pics")
savePics(pages)
return
## Execute Yelper
def main():
"""Execute Yelper."""
# start program timer
start = dt.now()
# parse through command line arguments
parser = ArgumentParser(description=NAME+'\n'+VERSION+'\n'+AUTHOR+'\n'+COPYRIGHT, formatter_class=RawTextHelpFormatter)
parser.add_argument('-r', action='store_true', help='Save reviews')
parser.add_argument('-p', action='store_true', help='Save pictures')
parser.add_argument('-b', action='store_true', help='Save both reviews and pictures')
# if no arguments provided, print help and/or examples
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
yelp_data = parser.parse_args()
# set filenames and create directories as necessary
if not yelp_data.r and not yelp_data.p and not yelp_data.b:
parser.error('Invalid mode selected. Please choose -r (reviews), -p (pictures), or -b (both)')
# save data
saveData(yelp_data)
# end program timer
end = dt.now()
elapsed = end - start
print "\n\nTime Elapsed:",elapsed
if __name__ == '__main__':
try:
main()
except (IOError, KeyError, AttributeError, ValueError, NameError, TypeError, LookupError), message:
error = '%s -- Got Exception: %s\n' % (dt.strftime(dt.now(), TSFORMAT), message)
print error