This IPython Notebook provides an interactive way to follow along with and explore the numbered examples from Mining the Social Web (2nd Edition). The intent behind this notebook is to reinforce the concepts from the sample code in a fun, convenient, and effective way. This notebook assumes that you are reading along with the book and have the context of the discussion as you work through these exercises.
In the somewhat unlikely event that you've stumbled across this notebook outside of its context on GitHub, you can find the full source code repository here.
You are free to use or adapt this notebook for any purpose you'd like. However, please respect the Simplified BSD License that governs its use.
import requests # pip install requests
from BeautifulSoup import BeautifulSoup # pip install BeautifulSoup
# XXX: Any URL containing a geo microformat...
URL = 'http://en.wikipedia.org/wiki/Franklin,_Tennessee'
# In the case of extracting content from Wikipedia, be sure to
# review its "Bot Policy," which is defined at
# http://meta.wikimedia.org/wiki/Bot_policy#Unacceptable_usage
req = requests.get(URL, headers={'User-Agent' : "Mining the Social Web"})
soup = BeautifulSoup(req.text)
geoTag = soup.find(True, 'geo')
if geoTag and len(geoTag) > 1:
    lat = geoTag.find(True, 'latitude').string
    lon = geoTag.find(True, 'longitude').string
    print 'Location is at', lat, lon
elif geoTag and len(geoTag) == 1:
    (lat, lon) = geoTag.string.split(';')
    (lat, lon) = (lat.strip(), lon.strip())
    print 'Location is at', lat, lon
else:
    print 'No location found'
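Note: The BeautifulSoup import above is the legacy version 3 package, which only installs under Python 2. If you're working through these examples in a newer environment, here is a minimal sketch of the same geo extraction using its successor, bs4 (pip install beautifulsoup4); this is an adaptation rather than the book's listing, and it assumes the Wikipedia page still serves the classic geo markup:

import requests
from bs4 import BeautifulSoup  # pip install beautifulsoup4

URL = 'http://en.wikipedia.org/wiki/Franklin,_Tennessee'

req = requests.get(URL, headers={'User-Agent': 'Mining the Social Web'})
soup = BeautifulSoup(req.text, 'html.parser')

# bs4 matches microformat class values with the class_ keyword argument
geo_tag = soup.find(class_='geo')

if geo_tag is None:
    print('No location found')
else:
    lat_tag = geo_tag.find(class_='latitude')
    lon_tag = geo_tag.find(class_='longitude')
    if lat_tag and lon_tag:
        # Separate latitude/longitude child elements
        print('Location is at %s %s' % (lat_tag.string, lon_tag.string))
    else:
        # A single element whose text is of the form "lat; lon"
        lat, lon = [v.strip() for v in geo_tag.string.split(';')]
        print('Location is at %s %s' % (lat, lon))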
from IPython.display import IFrame
from IPython.core.display import display
# Google Maps URL template for an iframe
google_maps_url = ("http://maps.google.com/maps?q={0}+{1}&"
                   "ie=UTF8&t=h&z=14&{0},{1}&output=embed").format(lat, lon)
display(IFrame(google_maps_url, '425px', '350px'))
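Note: Nothing about the parsed coordinates is specific to Google Maps. As a sketch of one alternative, the following builds an OpenStreetMap embed URL instead; the bbox/marker query parameters here are an assumption based on OpenStreetMap's share-embed links, not something from the book:

from IPython.display import IFrame
from IPython.core.display import display

# Assumes lat and lon hold the coordinate strings parsed from the geo markup above
lat_f, lon_f = float(lat), float(lon)
delta = 0.02  # half-width of the bounding box in degrees (an arbitrary choice)

# bbox is "min lon, min lat, max lon, max lat"; marker pins the parsed point
osm_url = ('http://www.openstreetmap.org/export/embed.html'
           '?bbox={0},{1},{2},{3}&marker={4},{5}').format(
               lon_f - delta, lat_f - delta, lon_f + delta, lat_f + delta,
               lat_f, lon_f)

display(IFrame(osm_url, '425px', '350px'))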
import sys
import requests
import json
import BeautifulSoup
# Pass in a URL containing hRecipe...
URL = 'http://britishfood.about.com/od/recipeindex/r/applepie.htm'
# Parse out some of the pertinent information for a recipe.
# See http://microformats.org/wiki/hrecipe.
def parse_hrecipe(url):
    req = requests.get(url)
    soup = BeautifulSoup.BeautifulSoup(req.text)
    hrecipe = soup.find(True, 'hrecipe')

    if hrecipe and len(hrecipe) > 1:
        fn = hrecipe.find(True, 'fn').string
        author = hrecipe.find(True, 'author').find(text=True)
        ingredients = [i.string
                       for i in hrecipe.findAll(True, 'ingredient')
                       if i.string is not None]

        instructions = []
        for i in hrecipe.find(True, 'instructions'):
            if type(i) == BeautifulSoup.Tag:
                s = ''.join(i.findAll(text=True)).strip()
            elif type(i) == BeautifulSoup.NavigableString:
                s = i.string.strip()
            else:
                continue

            if s != '':
                instructions += [s]

        return {
            'name': fn,
            'author': author,
            'ingredients': ingredients,
            'instructions': instructions,
        }
    else:
        return {}
recipe = parse_hrecipe(URL)
print json.dumps(recipe, indent=4)
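Note: If you want to compare results against a maintained parser, the mf2py package (pip install mf2py) parses microformats generically. The sketch below is an alternative approach rather than the book's code, and it assumes mf2py's backward-compatibility rules recognize the page's classic hrecipe markup as an h-recipe item:

import json
import mf2py  # pip install mf2py

URL = 'http://britishfood.about.com/od/recipeindex/r/applepie.htm'

# Parse the page and keep only the items recognized as recipes;
# classic hrecipe markup is upgraded to h-recipe items by mf2py
parsed = mf2py.parse(url=URL)
recipes = [item for item in parsed['items'] if 'h-recipe' in item['type']]

print json.dumps(recipes, indent=4)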
import requests
import json
from BeautifulSoup import BeautifulSoup
# Pass in a URL that contains hReview-aggregate info...
URL = 'http://britishfood.about.com/od/recipeindex/r/applepie.htm'
def parse_hreview_aggregate(url, item_type):
    req = requests.get(url)
    soup = BeautifulSoup(req.text)

    # Find the hRecipe or whatever other kind of parent item encapsulates
    # the hReview (a required field).
    item_element = soup.find(True, item_type)
    item = item_element.find(True, 'item').find(True, 'fn').text

    # And now parse out the hReview
    hreview = soup.find(True, 'hreview-aggregate')

    # Required field
    rating = hreview.find(True, 'rating').find(True, 'value-title')['title']

    # Optional fields
    try:
        count = hreview.find(True, 'count').text
    except AttributeError: # optional
        count = None
    try:
        votes = hreview.find(True, 'votes').text
    except AttributeError: # optional
        votes = None
    try:
        summary = hreview.find(True, 'summary').text
    except AttributeError: # optional
        summary = None

    return {
        'item': item,
        'rating': rating,
        'count': count,
        'votes': votes,
        'summary': summary
    }
# Find hReview aggregate information for an hRecipe
reviews = parse_hreview_aggregate(URL, 'hrecipe')
print json.dumps(reviews, indent=4)
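Note that parse_hreview_aggregate treats the item and rating fields as required, so it raises an AttributeError when a page lacks (or later drops) the expected markup. A minimal defensive wrapper around the call above:

try:
    reviews = parse_hreview_aggregate(URL, 'hrecipe')
    print json.dumps(reviews, indent=4)
except AttributeError:
    # One of the required fields (item or rating) wasn't found in the markup
    print 'No hReview-aggregate information found at', URL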
Note: You may also want to try Google's structured data testing tool to extract semantic markup from a webpage.
Note: You can use bash cell magic, as shown below, to invoke FuXi on the sample data file introduced at the end of the chapter:
%%bash
FuXi --rules=resources/ch08-semanticweb/chuck-norris.n3 --ruleFacts --naive
You can explore other options for FuXi by invoking it with the --help option:
%%bash
FuXi --help