This IPython Notebook provides an interactive way to follow along with and explore the numbered examples from Mining the Social Web (2nd Edition). The intent behind this notebook is to reinforce the concepts from the sample code in a fun, convenient, and effective way. This notebook assumes that you are reading along with the book and have the context of the discussion as you work through these exercises.
In the somewhat unlikely event that you've somehow stumbled across this notebook outside of its context on GitHub, you can find the full source code repository here.
You are free to use or adapt this notebook for any purpose you'd like. However, please respect the Simplified BSD License that governs its use.
This notebook is still a work in progress and currently features 25 recipes. The example titles should be fairly self-explanatory, and the code is designed to be reused as you progress further in the notebook. This means that you should follow along and execute each cell in order, since later cells may depend on functions defined in earlier cells. Consider this notebook draft material at this point.
import twitter
def oauth_login():
    """Return a ``twitter.Twitter`` client authorized with OAuth credentials.

    The four credential strings defined below are empty placeholders and must
    be filled in before the returned client can make authorized requests.
    """
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information
    # on Twitter's OAuth implementation.
    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    # The user's token pair comes first, followed by the application's pair.
    return twitter.Twitter(
        auth=twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                                 CONSUMER_KEY, CONSUMER_SECRET))
# Sample usage: build the API client and bind it to the module-level name
# twitter_api.
twitter_api = oauth_login()
# Nothing to see by displaying twitter_api except that it's now a
# defined variable
print twitter_api
import json
from flask import Flask, request
import multiprocessing
from threading import Timer
from IPython.display import IFrame
from IPython.display import display
from IPython.display import Javascript as JS
import twitter
from twitter.oauth_dance import parse_oauth_tokens
from twitter.oauth import read_token_file, write_token_file
# Note: This code is exactly the flow presented in the _AppendixB notebook

# File used to hand OAuth tokens between the steps of the flow below: it is
# written by ipynb_oauth_dance, read and rewritten by oauth_helper, and read
# one final time once the web server has stopped.
OAUTH_FILE = "resources/ch09-twittercookbook/twitter_oauth"

# XXX: Go to http://twitter.com/apps/new to create an app and get values
# for these credentials that you'll need to provide in place of these
# empty string values that are defined as placeholders.
# See https://dev.twitter.com/docs/auth/oauth for more information
# on Twitter's OAuth implementation, and ensure that *oauth_callback*
# is defined in your application settings as shown next if you are
# using Flask in this IPython Notebook.

# Define a few variables that will bleed into the lexical scope of a couple of
# functions that follow
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
# URL that Twitter redirects the browser to after authorization; its path
# matches the /oauth_helper route registered on the Flask app below.
oauth_callback = 'http://127.0.0.1:5000/oauth_helper'
# Set up a callback handler for when Twitter redirects back to us after the user
# authorizes the app
webserver = Flask("TwitterOAuth")


@webserver.route("/oauth_helper")
def oauth_helper():
    """Finish the OAuth dance when Twitter redirects back to this server.

    Reads the interim request token that ipynb_oauth_dance stored in
    OAUTH_FILE, exchanges it (together with the ``oauth_verifier`` query
    parameter) for the final access token, writes that token back to
    OAUTH_FILE, and asks the development server to shut down.

    Returns:
        A short confirmation string used as the HTTP response body.

    Raises:
        RuntimeError: if the WSGI environment exposes no
            ``werkzeug.server.shutdown`` hook. The credentials have already
            been written to OAUTH_FILE when this is raised.
    """
    oauth_verifier = request.args.get('oauth_verifier')

    # Pick back up credentials from ipynb_oauth_dance
    oauth_token, oauth_token_secret = read_token_file(OAUTH_FILE)

    _twitter = twitter.Twitter(
        auth=twitter.OAuth(
            oauth_token, oauth_token_secret, CONSUMER_KEY, CONSUMER_SECRET),
        format='', api_version=None)

    oauth_token, oauth_token_secret = parse_oauth_tokens(
        _twitter.oauth.access_token(oauth_verifier=oauth_verifier))

    # Write out the final credentials first, so that they can be picked up
    # after the blocking call to webserver.run() even if shutting the server
    # down fails below.
    write_token_file(OAUTH_FILE, oauth_token, oauth_token_secret)

    # This web server only needs to service one request, so shut it down.
    # The hook is absent (None) when the server does not provide it; fail
    # with a clear message instead of "'NoneType' object is not callable".
    shutdown_after_request = request.environ.get('werkzeug.server.shutdown')
    if shutdown_after_request is None:
        raise RuntimeError(
            "No 'werkzeug.server.shutdown' hook available; credentials were "
            "written to %s but the web server must be stopped manually"
            % (OAUTH_FILE,))
    shutdown_after_request()

    return "%s %s written to %s" % (oauth_token, oauth_token_secret, OAUTH_FILE)
# To handle Twitter's OAuth 1.0a implementation, we'll just need to implement a
# custom "oauth dance" and will closely follow the pattern defined in
# twitter.oauth_dance.
def ipynb_oauth_dance():
    """Start the OAuth 1.0a flow and open Twitter's authorization page.

    Requests a request token (passing ``oauth_callback`` so that Twitter
    redirects back to the local web server), stores the interim token pair in
    OAUTH_FILE for the /oauth_helper handler, and opens the authorization URL
    in a new browser window via an IPython Javascript display.
    """
    _twitter = twitter.Twitter(
        auth=twitter.OAuth('', '', CONSUMER_KEY, CONSUMER_SECRET),
        format='', api_version=None)

    oauth_token, oauth_token_secret = parse_oauth_tokens(
        _twitter.oauth.request_token(oauth_callback=oauth_callback))

    # Need to write these interim values out to a file to pick up on the callback
    # from Twitter that is handled by the web server in /oauth_helper
    write_token_file(OAUTH_FILE, oauth_token, oauth_token_secret)

    # Use HTTPS so that the request token is not sent over the network in
    # clear text.
    oauth_url = ('https://api.twitter.com/oauth/authorize?oauth_token=' + oauth_token)

    # Tap the browser's native capabilities to access the web server through a new
    # window to get user authorization
    display(JS("window.open('%s')" % oauth_url))
# Schedule ipynb_oauth_dance on a timer that fires one second from now, then
# start the web server with the blocking call to webserver.run(). The timer
# delay is what lets the dance begin while the server is running; the dance
# ultimately causes Twitter to redirect a request back to the server. Once that
# request is serviced, the web server will shut down and program flow will resume
# with the OAUTH_FILE containing the necessary credentials.
# NOTE(review): host='0.0.0.0' listens on all interfaces although
# oauth_callback points at 127.0.0.1 - confirm that this exposure is intended.
Timer(1, lambda: ipynb_oauth_dance()).start()
webserver.run(host='0.0.0.0')

# The values that are read from this file are written out at
# the end of /oauth_helper
oauth_token, oauth_token_secret = read_token_file(OAUTH_FILE)

# These four credentials are what is needed to authorize the application
auth = twitter.oauth.OAuth(oauth_token, oauth_token_secret,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)
print twitter_api
import json
import twitter
def twitter_trends(twitter_api, woe_id):
    """Return the result of ``twitter_api.trends.place`` for *woe_id*.

    The ID is passed as ``_id`` (with a leading underscore) so that it is
    sent as a query string parameter. Without the underscore, the twitter
    package appends the ID value to the URL itself as a special-case keyword
    argument.
    """
    trends_for_place = twitter_api.trends.place
    return trends_for_place(_id=woe_id)
# Sample usage: fetch and pretty-print trends for two Where On Earth IDs.
twitter_api = oauth_login()

# See https://dev.twitter.com/docs/api/1.1/get/trends/place and
# http://developer.yahoo.com/geo/geoplanet/ for details on
# Yahoo! Where On Earth ID
WORLD_WOE_ID = 1
world_trends = twitter_trends(twitter_api, WORLD_WOE_ID)
print json.dumps(world_trends, indent=1)

US_WOE_ID = 23424977
us_trends = twitter_trends(twitter_api, US_WOE_ID)
print json.dumps(us_trends, indent=1)
def twitter_search(twitter_api, q, max_results=200, **kw):
    """Search tweets for *q* and return at most *max_results* statuses.

    See https://dev.twitter.com/docs/api/1.1/get/search/tweets and
    https://dev.twitter.com/docs/using-search for details on advanced
    search criteria that may be useful for keyword arguments.

    Args:
        twitter_api: client object exposing ``search.tweets(**params)``.
        q: the search query.
        max_results: upper bound on the number of statuses returned; values
            above 1000 are clamped to 1000.
        **kw: extra parameters forwarded to every ``search.tweets`` call.

    Returns:
        A list of status objects, truncated to *max_results* entries.
    """
    # Imported here so that the function is self-contained; the module moved
    # between Python 2 and Python 3.
    try:
        from urllib.parse import parse_qsl
    except ImportError:
        from urlparse import parse_qsl

    params = dict(kw, q=q, count=100)
    search_results = twitter_api.search.tweets(**params)

    # Copy so that the list inside the first response is not mutated below.
    statuses = list(search_results['statuses'])

    # Iterate through batches of results by following the cursor until we
    # reach the desired number of results, keeping in mind that OAuth users
    # can "only" make 180 search queries per 15-minute interval. See
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    # for details. A reasonable number of results is ~1000, although
    # that number of results may not exist for all queries.

    # Enforce a reasonable limit
    max_results = min(1000, max_results)

    for _ in range(10):  # 10*100 = 1000
        # Checked before each request so that no query is spent once enough
        # results are already in hand.
        if len(statuses) >= max_results:
            break

        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError:  # No more results when next_results doesn't exist
            break

        # next_results has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        # Only max_id is taken from it; the query and the caller's keyword
        # arguments are re-sent from their original values. Feeding the
        # already percent-encoded q back in would encode it a second time.
        max_id = dict(parse_qsl(next_results.lstrip('?'))).get('max_id')
        if max_id is None:
            break
        params['max_id'] = max_id

        search_results = twitter_api.search.tweets(**params)
        statuses += search_results['statuses']

    return statuses[:max_results]
# Sample usage: run one search and pretty-print the first status returned.
twitter_api = oauth_login()
q = "CrossFit"
results = twitter_search(twitter_api, q, max_results=10)
# Show one sample search result by slicing the list...
print json.dumps(results[0], indent=1)
# Demonstrates functools.partial: each call below pre-binds some arguments of
# an existing callable and yields a new callable that takes the remainder.
from functools import partial
# json.dumps with indent=1 already supplied
pp = partial(json.dumps, indent=1)
# twitter_trends with both of its arguments bound; called with no arguments
twitter_world_trends = partial(twitter_trends, twitter_api, WORLD_WOE_ID)
print pp(twitter_world_trends())
# twitter_search with the API client bound; still needs the query
authenticated_twitter_search = partial(twitter_search, twitter_api)
results = authenticated_twitter_search("iPhone")
print pp(results)
# Partials compose: additionally bind the query on top of the previous partial
authenticated_iphone_twitter_search = partial(authenticated_twitter_search, "iPhone")
results = authenticated_iphone_twitter_search()
print pp(results)
import io, json
def save_json(filename, data):
    """Write *data* as JSON text to a UTF-8 encoded file.

    The file is ``resources/ch09-twittercookbook/<filename>.json``. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    target = 'resources/ch09-twittercookbook/{0}.json'.format(filename)
    with io.open(target, 'w', encoding='utf-8') as out:
        serialized = json.dumps(data, ensure_ascii=False)
        # io.open in text mode only accepts unicode text
        out.write(unicode(serialized))
def load_json(filename):
    """Load and parse ``resources/ch09-twittercookbook/<filename>.json``.

    The file is read as UTF-8 text and parsed, so the return value is the
    decoded JSON data (for example a list of dicts) rather than the raw file
    contents.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON.
    """
    with io.open('resources/ch09-twittercookbook/{0}.json'.format(filename),
                 encoding='utf-8') as f:
        return json.load(f)
# Sample usage: run a search, store the results with save_json under the query
# name, read them back with load_json, and print what was loaded.
q = 'CrossFit'
twitter_api = oauth_login()
results = twitter_search(twitter_api, q, max_results=10)
save_json(q, results)
results = load_json(q)
print json.dumps(results, indent=1)
import json
import pymongo # pip install pymongo
def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):
    """Insert *data* into collection *mongo_db_coll* of database *mongo_db*.

    Any extra keyword arguments are passed to ``pymongo.MongoClient``; with
    none given, the client connects to localhost:27017 by default.

    Returns:
        Whatever ``Collection.insert`` returns for *data* (the inserted IDs).
    """
    connection = pymongo.MongoClient(**mongo_conn_kw)

    # Database first, then the collection inside it
    collection = connection[mongo_db][mongo_db_coll]

    # Perform a bulk insert and return the IDs
    return collection.insert(data)
def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):
    """Query collection *mongo_db_coll* of database *mongo_db*.

    Optionally, use *criteria* and *projection* to limit the data that is
    returned as documented in
    http://docs.mongodb.org/manual/reference/method/db.collection.find/
    Consider leveraging MongoDB's aggregations framework for more
    sophisticated queries.

    Args:
        return_cursor: when true, return the cursor itself instead of a list.
        criteria: query document; ``None`` means ``{}``.
        projection: passed as the second argument to ``find`` when given.
        **mongo_conn_kw: forwarded to ``pymongo.MongoClient``.

    Returns:
        The cursor, or a list holding every item the cursor yields.
    """
    collection = pymongo.MongoClient(**mongo_conn_kw)[mongo_db][mongo_db_coll]

    query = {} if criteria is None else criteria
    # Only hand a projection to find() when the caller supplied one
    find_args = (query,) if projection is None else (query, projection)
    cursor = collection.find(*find_args)

    # Returning a cursor is recommended for large amounts of data
    if return_cursor:
        return cursor
    return list(cursor)
# Sample usage: store search results in database 'search_results', in a
# collection named after the query, then read the collection back.
q = 'CrossFit'
twitter_api = oauth_login()
results = twitter_search(twitter_api, q, max_results=10)
save_to_mongo(results, 'search_results', q)
load_from_mongo('search_results', q)
# Finding topics of interest by using the filtering capabilities it offers.

# sys is needed for the stderr message below and is not imported by any
# earlier code in this file.
import sys

import twitter

# Query terms
q = 'CrossFit'  # Comma-separated list of terms

sys.stderr.write('Filtering the public timeline for track="%s"\n' % (q,))

# Returns an instance of twitter.Twitter
twitter_api = oauth_login()

# Reference the self.auth parameter
twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)

# See https://dev.twitter.com/docs/streaming-apis
stream = twitter_stream.statuses.filter(track=q)

# For illustrative purposes, when all else fails, search for Justin Bieber
# and something is sure to turn up (at least, on Twitter)
for tweet in stream:
    # Skip anything the stream yields that carries no 'text' key instead of
    # failing with a KeyError.
    if not tweet or 'text' not in tweet:
        continue
    print(tweet['text'])
    # Save to a database in a particular collection
import sys
import datetime
import time
import twitter
def get_time_series_data(api_func, mongo_db_name, mongo_db_coll,
                         secs_per_interval=60, max_intervals=15, **mongo_conn_kw):
    """Call *api_func* periodically and store each result in MongoDB.

    Default settings of 15 intervals and 1 API call per interval ensure that
    you will not exceed the Twitter rate limit.

    Args:
        api_func: zero-argument callable whose return value is saved.
        mongo_db_name: database name passed to save_to_mongo.
        mongo_db_coll: collection name prefix; each interval is stored in its
            own collection named ``<mongo_db_coll>-<timestamp>``.
        secs_per_interval: seconds to sleep after each call.
        max_intervals: number of calls to make before returning.
        **mongo_conn_kw: forwarded to save_to_mongo for the connection.
    """
    for _ in range(max_intervals):
        # A timestamp of the form "2013-06-14 12:52:07"
        now = str(datetime.datetime.now()).split(".")[0]

        ids = save_to_mongo(api_func(), mongo_db_name,
                            mongo_db_coll + "-" + now, **mongo_conn_kw)

        sys.stderr.write("Write {0} trends\n".format(len(ids)))
        sys.stderr.write("Zzz...\n")
        # Flush so that progress is visible before the sleep starts
        sys.stderr.flush()

        time.sleep(secs_per_interval)  # seconds
# Sample usage: collect the output of twitter_world_trends into database
# 'time-series' using the function's default interval settings.
get_time_series_data(twitter_world_trends, 'time-series', 'twitter_world_trends')
def extract_tweet_entities(statuses):
    """Collect the entities found in a sequence of statuses.

    See https://dev.twitter.com/docs/tweet-entities for more details on tweet
    entities.

    Args:
        statuses: iterable of status dicts, each with an ``'entities'`` dict.

    Returns:
        A 5-tuple of lists ``(screen_names, hashtags, urls, media, symbols)``.
        Every list is empty when *statuses* is empty.
    """
    def collect(kind, field):
        # An entity kind may be missing from a given status (in some
        # circumstances, such as search results, the media entity may not
        # appear), so each status is checked individually.
        return [entity[field]
                for status in statuses
                for entity in status['entities'].get(kind, [])]

    screen_names = collect('user_mentions', 'screen_name')
    hashtags = collect('hashtags', 'text')
    urls = collect('urls', 'expanded_url')
    symbols = collect('symbols', 'text')
    media = collect('media', 'url')

    return screen_names, hashtags, urls, media, symbols
# Sample usage: search, extract the entities, and preview each kind.
q = 'CrossFit'
statuses = twitter_search(twitter_api, q)
screen_names, hashtags, urls, media, symbols = extract_tweet_entities(statuses)
# Explore the first five items for each...
print json.dumps(screen_names[0:5], indent=1)
print json.dumps(hashtags[0:5], indent=1)
print json.dumps(urls[0:5], indent=1)
print json.dumps(media[0:5], indent=1)
print json.dumps(symbols[0:5], indent=1)
import twitter
def find_popular_tweets(twitter_api, statuses, retweet_threshold=3):
    """Return the statuses retweeted more than *retweet_threshold* times.

    *twitter_api* is accepted but not used. Input order is preserved.

    You could also consider using the favorite_count parameter as part of
    this heuristic, possibly using it to provide an additional boost to
    popular tweets in a ranked formulation.
    """
    popular = []
    for candidate in statuses:
        if candidate['retweet_count'] > retweet_threshold:
            popular.append(candidate)
    return popular
# Sample usage: search, keep the popular tweets, and print each one's text
# followed by its retweet count.
q = "CrossFit"
twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q, max_results=200)
popular_tweets = find_popular_tweets(twitter_api, search_results)
for tweet in popular_tweets:
    print tweet['text'], tweet['retweet_count']
import twitter
from collections import Counter
def get_common_tweet_entities(statuses, entity_threshold=3):
    """Return ``(entity, count)`` pairs for frequently occurring entities.

    All entity values that extract_tweet_entities yields for each status are
    pooled into one flat list and counted. Pairs are returned in
    ``Counter.most_common()`` order, keeping those whose count is at least
    *entity_threshold*.
    """
    # Create a flat list of all tweet entities
    pooled = []
    for status in statuses:
        for entity_group in extract_tweet_entities([status]):
            pooled.extend(entity_group)

    # Compute frequencies
    ranked = Counter(pooled).most_common()

    return [(entity, count)
            for (entity, count) in ranked
            if count >= entity_threshold]
# Sample usage: search and print the (entity, count) pairs that meet the
# default threshold.
q = 'CrossFit'
twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q, max_results=100)
common_entities = get_common_tweet_entities(search_results)
print "Most common tweet entities"
print common_entities
from prettytable import PrettyTable

# Get some frequency data (q is whatever query an earlier statement bound)
twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q, max_results=100)
common_entities = get_common_tweet_entities(search_results)

# Use PrettyTable to create a nice tabular display: one row per
# (entity, count) pair
pt = PrettyTable(field_names=['Entity', 'Count'])
for kv in common_entities:
    pt.add_row(kv)

# Set column alignment
pt.align['Entity'] = 'l'
pt.align['Count'] = 'r'
print(pt)
import twitter

# For each of three related tweet IDs, print the IDs of the users who
# retweeted it, followed by the tweet itself as indented JSON.
twitter_api = oauth_login()

print("""User IDs for retweeters of a tweet by @fperez_org
that was retweeted by @SocialWebMining and that @jyeee then retweeted
from @SocialWebMining's timeline\n""")
print(twitter_api.statuses.retweeters.ids(_id=334188056905129984)['ids'])
print(json.dumps(twitter_api.statuses.show(_id=334188056905129984), indent=1))
print('')

# The handle in this label was truncated to "@SocialWeb"; the account named
# in the description above is @SocialWebMining.
print("@SocialWebMining's retweet of @fperez_org's tweet\n")
print(twitter_api.statuses.retweeters.ids(_id=345723917798866944)['ids'])
print(json.dumps(twitter_api.statuses.show(_id=345723917798866944), indent=1))
print('')

print("@jyeee's retweet of @fperez_org's tweet\n")
print(twitter_api.statuses.retweeters.ids(_id=338835939172417537)['ids'])
print(json.dumps(twitter_api.statuses.show(_id=338835939172417537), indent=1))
import re
def get_rt_attributions(tweet):
    """Return the lowercased screen names a tweet attributes itself to.

    Two sources are combined:

    * ``tweet['retweeted_status']``, present when the tweet was produced with
      /statuses/retweet/:id. See
      https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid.
    * "legacy" retweet patterns such as "RT" and "via" in ``tweet['text']``,
      which are still widely used for various reasons and potentially very
      useful. See https://dev.twitter.com/discussions/2847 and
      https://dev.twitter.com/discussions/1748 for some details on how/why.
      Only the first "RT"/"via" occurrence in the text is considered.

    Returns:
        A sorted list of unique screen names, without the leading "@".
    """
    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J)
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)

    rt_attributions = []

    if 'retweeted_status' in tweet:
        attribution = tweet['retweeted_status']['user']['screen_name']
        rt_attributions.append(attribution)

    matches = rt_patterns.findall(tweet['text'])
    if matches:
        # Group 2 of the first match is the run of mentions, e.g. " @a, @b".
        # Pull the names out with a regex rather than splitting on whitespace
        # so that punctuation between mentions is not kept as part of a name.
        rt_attributions += re.findall(r"@(\w+)", matches[0][1])

    # Filter out any duplicates
    return sorted(set(rta.strip("@").lower() for rta in rt_attributions))
# Sample usage: fetch two tweets by ID and print the attributions found in
# each.
twitter_api = oauth_login()
tweet = twitter_api.statuses.show(_id=214746575765913602)
print get_rt_attributions(tweet)
print
tweet = twitter_api.statuses.show(_id=345723917798866944)
print get_rt_attributions(tweet)
import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json
import twitter
def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
    """Call *twitter_api_func* and retry on common transient failures.

    Args:
        twitter_api_func: the callable to invoke as
            ``twitter_api_func(*args, **kw)``.
        max_errors: number of URLError/BadStatusLine failures tolerated
            before the error is re-raised. The count is reset whenever a
            TwitterHTTPError is handled.
        *args, **kw: passed through to *twitter_api_func*.
            NOTE(review): max_errors precedes *args, so the first extra
            positional argument binds to max_errors rather than being
            forwarded.

    Returns:
        The value returned by *twitter_api_func*, or ``None`` after a 401 or
        404 error, which requires special handling by the caller.
    """
    def log(message):
        sys.stderr.write(message + '\n')

    # Messages for the HTTP errors that make the request give up with None
    give_up_messages = {
        401: 'Encountered 401 Error (Not Authorized)',
        404: 'Encountered 404 Error (Not Found)',
    }

    # A nested helper function that handles common HTTPErrors. Returns the
    # wait period to use from now on, or None for 401 and 404 errors. Blocks
    # until the rate limit is reset on a 429 error, and sleeps with a growing
    # wait period on 500-level errors.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):
        if wait_period > 3600:  # Seconds
            log('Too many retries. Quitting.')
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes
        status = e.e.code

        if status in give_up_messages:
            log(give_up_messages[status])
            return None

        if status == 429:
            log('Encountered 429 Error (Rate Limit Exceeded)')
            if not sleep_when_rate_limited:
                raise e  # Caller must handle the rate limiting issue
            log("Retrying in 15 minutes...ZzZ...")
            sys.stderr.flush()
            time.sleep(60*15 + 5)
            log('...ZzZ...Awake now and trying again.')
            return 2

        if status in (500, 502, 503, 504):
            log('Encountered %i Error. Retrying in %i seconds' %
                (status, wait_period))
            time.sleep(wait_period)
            return wait_period * 1.5

        raise e

    # Logs one transient (non-HTTP) failure and returns the updated count
    def note_transient_failure(kind, count):
        count += 1
        log(kind + " encountered. Continuing.")
        if count > max_errors:
            log("Too many consecutive errors...bailing out.")
        return count

    wait_period = 2
    error_count = 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError as e:
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return
        except URLError:
            error_count = note_transient_failure("URLError", error_count)
            if error_count > max_errors:
                raise
        except BadStatusLine:
            error_count = note_transient_failure("BadStatusLine", error_count)
            if error_count > max_errors:
                raise
# Sample usage: look up one user through make_twitter_request and print the
# response as indented JSON.
twitter_api = oauth_login()
# See https://dev.twitter.com/docs/api/1.1/get/users/lookup for
# twitter_api.users.lookup
response = make_twitter_request(twitter_api.users.lookup,
                                screen_name="SocialWebMining")
print json.dumps(response, indent=1)
def get_user_profile(twitter_api, screen_names=None, user_ids=None):
    """Look up user profiles by screen name or by user ID.

    Exactly one of *screen_names* and *user_ids* must be given.

    Args:
        twitter_api: client exposing ``users.lookup``.
        screen_names: sequence of screen names to look up.
        user_ids: sequence of user IDs to look up.

    Returns:
        A dict mapping each returned user's ``screen_name`` (or ``id`` when
        looking up by ID) to the user info returned by the API. Batches for
        which the request yields no response are skipped.

    Raises:
        AssertionError: if both or neither of the two arguments are given.
    """
    # Must have either screen_name or user_id (logical xor)
    assert (screen_names is not None) != (user_ids is not None), \
        "Must have screen_names or user_ids, but not both"

    # Decide the lookup mode from which argument was supplied, not from its
    # truthiness, so that an empty list simply yields an empty result.
    by_screen_name = screen_names is not None
    items = screen_names if by_screen_name else user_ids
    param_name = 'screen_name' if by_screen_name else 'user_id'
    key_field = 'screen_name' if by_screen_name else 'id'

    items_to_info = {}

    # Process 100 items at a time per the API specifications for /users/lookup.
    # See https://dev.twitter.com/docs/api/1.1/get/users/lookup for details.
    for start in range(0, len(items), 100):
        items_str = ','.join([str(item) for item in items[start:start + 100]])

        response = make_twitter_request(twitter_api.users.lookup,
                                        **{param_name: items_str})

        # make_twitter_request returns None for 401 and 404 errors
        if response is None:
            continue

        for user_info in response:
            items_to_info[user_info[key_field]] = user_info

    return items_to_info
# Sample usage: look up two profiles by screen name; the commented-out line
# shows the equivalent call by user ID.
twitter_api = oauth_login()
print get_user_profile(twitter_api, screen_names=["SocialWebMining", "ptwobrussell"])
#print get_user_profile(twitter_api, user_ids=[132373965])
import twitter_text
# Sample usage: run twitter_text's Extractor over one sample tweet text and
# print what each of three extraction methods returns.
txt = "RT @SocialWebMining Mining 1M+ Tweets About #Syria http://wp.me/p3QiJd-1I"
ex = twitter_text.Extractor(txt)
print "Screen Names:", ex.extract_mentioned_screen_names_with_indices()
print "URLs:", ex.extract_urls_with_indices()
print "Hashtags:", ex.extract_hashtags_with_indices()
from functools import partial
from sys import maxint
def get_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
                              friends_limit=maxint, followers_limit=maxint):
    """Fetch friend IDs and follower IDs for one user.

    Exactly one of *screen_name* and *user_id* must be given. Each kind of ID
    is fetched page by page through make_twitter_request (5000 IDs requested
    per page) until the cursor is exhausted, the limit is reached, or a
    request yields no response. A limit of 0 skips that kind entirely.

    See https://dev.twitter.com/docs/api/1.1/get/friends/ids and
    https://dev.twitter.com/docs/api/1.1/get/followers/ids for details
    on API parameters.

    Returns:
        ``(friends_ids, followers_ids)``, each truncated to its limit.
    """
    # Must have either screen_name or user_id (logical xor)
    assert (screen_name is None) != (user_id is None), \
        "Must have screen_name or user_id, but not both"

    fetch_friends = partial(make_twitter_request, twitter_api.friends.ids,
                            count=5000)
    fetch_followers = partial(make_twitter_request, twitter_api.followers.ids,
                              count=5000)

    # How the user is identified in every request
    if screen_name:
        who = {'screen_name': screen_name}
    else:  # user_id
        who = {'user_id': user_id}

    friends_ids, followers_ids = [], []
    plan = (
        (fetch_friends, friends_limit, friends_ids, "friends"),
        (fetch_followers, followers_limit, followers_ids, "followers"),
    )

    for fetch_page, limit, collected, label in plan:
        if limit == 0:
            continue

        cursor = -1
        while cursor != 0:
            # Use make_twitter_request via the partially bound callable...
            response = fetch_page(cursor=cursor, **who)
            got_page = response is not None

            if got_page:
                collected.extend(response['ids'])
                cursor = response['next_cursor']

            sys.stderr.write('Fetched {0} total {1} ids for {2}'.format(
                len(collected), label, (user_id or screen_name)) + '\n')

            # XXX: You may want to store data during each iteration to provide
            # an additional layer of protection from exceptional circumstances

            if not got_page or len(collected) >= limit:
                break

    # Do something useful with the IDs, like store them to disk...
    return friends_ids[:friends_limit], followers_ids[:followers_limit]
# Sample usage: fetch at most 10 friend IDs and 10 follower IDs for one
# account and print both lists.
twitter_api = oauth_login()
friends_ids, followers_ids = get_friends_followers_ids(twitter_api,
                                                       screen_name="SocialWebMining",
                                                       friends_limit=10,
                                                       followers_limit=10)
print friends_ids
print followers_ids
def setwise_friends_followers_analysis(screen_name, friends_ids, followers_ids):
    """Print set-based statistics comparing a user's friends and followers.

    Both ID collections are de-duplicated first. Five lines are printed: the
    two set sizes, how many friends are not followers, how many followers are
    not friends, and how many IDs appear in both.
    """
    following = set(friends_ids)
    followed_by = set(followers_ids)

    not_following_back = following - followed_by
    not_followed_back = followed_by - following
    mutual = following & followed_by

    report = [
        '{0} is following {1}'.format(screen_name, len(following)),
        '{0} is being followed by {1}'.format(screen_name, len(followed_by)),
        '{0} of {1} are not following {2} back'.format(
            len(not_following_back), len(following), screen_name),
        '{0} of {1} are not being followed back by {2}'.format(
            len(not_followed_back), len(followed_by), screen_name),
        '{0} has {1} mutual friends'.format(screen_name, len(mutual)),
    ]
    for line in report:
        print(line)
# Sample usage: fetch friend and follower IDs for one account (no limits
# given) and print the setwise comparison.
screen_name = "ptwobrussell"
twitter_api = oauth_login()
friends_ids, followers_ids = get_friends_followers_ids(twitter_api,
                                                       screen_name=screen_name)
setwise_friends_followers_analysis(screen_name, friends_ids, followers_ids)
def harvest_user_timeline(twitter_api, screen_name=None, user_id=None, max_results=1000):
    """Fetch up to *max_results* tweets from one user's timeline.

    Exactly one of *screen_name* and *user_id* must be given. Tweets are
    requested 200 at a time through make_twitter_request, walking backwards
    with ``max_id``, for at most 16 pages.

    Returns:
        A list of tweets truncated to *max_results* entries. The list holds
        whatever was fetched before a request yielded no response.
    """
    assert (screen_name is not None) != (user_id is not None), \
        "Must have screen_name or user_id, but not both"

    kw = {  # Keyword args for the Twitter API call
        'count': 200,
        'trim_user': 'true',
        'include_rts': 'true',
        'since_id': 1,
    }

    if screen_name:
        kw['screen_name'] = screen_name
    else:
        kw['user_id'] = user_id

    def fetch_page():
        # make_twitter_request returns None for 401 (Not Authorized) and 404
        # errors; treat that as an empty page so that the loop below ends
        # cleanly instead of failing on len(None).
        page = make_twitter_request(twitter_api.statuses.user_timeline, **kw)
        return [] if page is None else page

    max_pages = 16
    results = []

    tweets = fetch_page()
    results += tweets

    sys.stderr.write('Fetched %i tweets\n' % len(tweets))

    page_num = 1

    # Many Twitter accounts have fewer than 200 tweets so you don't want to enter
    # the loop and waste a precious request if max_results fits in one page.

    # Note: Analogous optimizations could be applied inside the loop to try and
    # save requests. e.g. Don't make a third request if you have 287 tweets out of
    # a possible 400 tweets after your second request. Twitter does do some
    # post-filtering on censored and deleted tweets out of batches of 'count', though,
    # so you can't strictly check for the number of results being 200. You might get
    # back 198, for example, and still have many more tweets to go. If you have the
    # total number of tweets for an account (by GET /users/lookup/), then you could
    # simply use this value as a guide.
    if max_results <= kw['count']:
        page_num = max_pages  # Prevent loop entry

    while page_num < max_pages and len(tweets) > 0 and len(results) < max_results:

        # Necessary for traversing the timeline in Twitter's v1.1 API:
        # get the next query's max-id parameter to pass in.
        # See https://dev.twitter.com/docs/working-with-timelines.
        kw['max_id'] = min([tweet['id'] for tweet in tweets]) - 1

        tweets = fetch_page()
        results += tweets

        sys.stderr.write('Fetched %i tweets\n' % (len(tweets),))

        page_num += 1

    sys.stderr.write('Done fetching tweets\n')

    return results[:max_results]
# Sample usage: harvest up to 200 tweets from one account's timeline.
twitter_api = oauth_login()
tweets = harvest_user_timeline(twitter_api, screen_name="SocialWebMining", \
                               max_results=200)
# Save to MongoDB with save_to_mongo or a local file with save_json...
def crawl_followers(twitter_api, screen_name, limit=1000000, depth=2):
    """Crawl followers breadth-first starting from *screen_name*.

    The seed user's follower IDs are stored first; then, for each further
    level up to *depth*, the follower IDs of every ID collected at the
    previous level are fetched and stored.

    Each user's followers are saved with save_to_mongo as a document
    ``{'followers': [...]}`` in database ``followers_crawl``, in a collection
    named ``<user id>-follower_ids``.

    Args:
        twitter_api: client used for ``users.show`` and the ID lookups.
        screen_name: screen name of the seed user.
        limit: maximum number of follower IDs fetched per user.
        depth: number of levels to crawl; 1 stores only the seed's followers.
    """
    # Resolve the ID for screen_name and start working with IDs for consistency
    # in storage
    seed_id = str(twitter_api.users.show(screen_name=screen_name)['id'])

    _, next_queue = get_friends_followers_ids(twitter_api, user_id=seed_id,
                                              friends_limit=0, followers_limit=limit)

    # Store a seed_id => _follower_ids mapping in MongoDB
    save_to_mongo({'followers': list(next_queue)}, 'followers_crawl',
                  '{0}-follower_ids'.format(seed_id))

    d = 1
    while d < depth:
        d += 1
        (queue, next_queue) = (next_queue, [])
        for fid in queue:
            # get_friends_followers_ids returns (friends, followers); friends
            # are not requested (limit 0), so only the followers are kept.
            _, follower_ids = get_friends_followers_ids(twitter_api, user_id=fid,
                                                        friends_limit=0,
                                                        followers_limit=limit)

            # Store a fid => follower_ids mapping in MongoDB
            save_to_mongo({'followers': list(follower_ids)},
                          'followers_crawl', '{0}-follower_ids'.format(fid))

            next_queue += follower_ids
# Sample usage: crawl a single level (depth=1) with at most 10 follower IDs.
screen_name = "timoreilly"
twitter_api = oauth_login()
crawl_followers(twitter_api, screen_name, depth=1, limit=10)
def analyze_tweet_content(statuses):
    """Print simple lexical statistics for a collection of statuses.

    Prints the lexical diversity (unique items divided by total items) of the
    words, screen names, and hashtags found in *statuses*, followed by the
    average number of whitespace-separated words per tweet. Prints a notice
    and returns when *statuses* is empty.
    """
    if len(statuses) == 0:
        print("No statuses to analyze")
        return

    # A nested helper function for computing lexical diversity
    def lexical_diversity(tokens):
        # A batch of tweets may contain no screen names or hashtags at all;
        # report 0.0 in that case rather than dividing by zero.
        if len(tokens) == 0:
            return 0.0
        return 1.0*len(set(tokens))/len(tokens)

    # A nested helper function for computing the average number of words per tweet
    def average_words(statuses):
        total_words = sum([len(s.split()) for s in statuses])
        return 1.0*total_words/len(statuses)

    status_texts = [status['text'] for status in statuses]
    screen_names, hashtags, urls, media, _ = extract_tweet_entities(statuses)

    # Compute a collection of all words from all tweets
    words = [w
             for t in status_texts
             for w in t.split()]

    print("%s %s" % ("Lexical diversity (words):", lexical_diversity(words)))
    print("%s %s" % ("Lexical diversity (screen names):",
                     lexical_diversity(screen_names)))
    print("%s %s" % ("Lexical diversity (hashtags):", lexical_diversity(hashtags)))
    print("%s %s" % ("Averge words per tweet:", average_words(status_texts)))
# Sample usage: search and print content statistics for the results.
q = 'CrossFit'
twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q)
analyze_tweet_content(search_results)
import sys
import json
import nltk
import numpy
import urllib2
from boilerpipe.extract import Extractor
def summarize(url, n=100, cluster_threshold=5, top_sentences=5):
    """Summarize the article text extracted from *url*.

    Adapted from "The Automatic Creation of Literature Abstracts" by H.P. Luhn.

    Args:
        url: page whose text is extracted with boilerpipe's ArticleExtractor.
        n: number of words to consider (the most frequent non-stopwords).
        cluster_threshold: distance between words to consider; two
            consecutive important words belong to the same cluster when their
            token positions differ by less than this value.
        top_sentences: number of sentences to return for a "top n" summary.

    Returns:
        A dict with two lists of sentences, both in document order:
        ``top_n_summary`` (the *top_sentences* highest scoring sentences) and
        ``mean_scored_summary`` (sentences scoring above the mean plus half a
        standard deviation). Both lists are empty when no sentence contains
        an important word.
    """

    # Begin - nested helper function
    def score_sentences(sentences, important_words):
        scores = []

        for sentence_idx, sentence in enumerate(sentences):
            tokens = nltk.tokenize.word_tokenize(sentence)

            # Position of the first occurrence of each token in the sentence
            first_pos = {}
            for pos, token in enumerate(tokens):
                first_pos.setdefault(token, pos)

            # Compute an index for important words in each sentence
            word_idx = sorted(first_pos[w] for w in important_words
                              if w in first_pos)

            # It is possible that some sentences may not contain any important words
            if len(word_idx) == 0:
                continue

            # Using the word index, compute clusters with a max distance threshold
            # for any two consecutive words
            clusters = []
            cluster = [word_idx[0]]
            for previous, current in zip(word_idx, word_idx[1:]):
                if current - previous < cluster_threshold:
                    cluster.append(current)
                else:
                    clusters.append(cluster)
                    cluster = [current]
            clusters.append(cluster)

            # Score each cluster. The max score for any given cluster is the score
            # for the sentence.
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster

                if score > max_cluster_score:
                    max_cluster_score = score

            # Record the best cluster's score, not merely the score of the
            # cluster that happened to be examined last.
            scores.append((sentence_idx, max_cluster_score))

        return scores
    # End - nested helper function

    extractor = Extractor(extractor='ArticleExtractor', url=url)

    # It's entirely possible that this "clean page" will be a big mess. YMMV.
    # The good news is that the summarize algorithm inherently accounts for handling
    # a lot of this noise.
    txt = extractor.getText()

    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Load the stopword list once instead of once per candidate word
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Rank words by frequency. most_common() is used where the FreqDist
    # provides it; otherwise fall back to items(), which this code originally
    # relied on to be frequency-ordered.
    if hasattr(fdist, 'most_common'):
        ranked_words = fdist.most_common()
    else:
        ranked_words = fdist.items()

    top_n_words = [w[0] for w in ranked_words if w[0] not in stopwords][:n]

    scored_sentences = score_sentences(normalized_sentences, top_n_words)

    # Nothing scored: return empty summaries rather than computing the mean
    # and standard deviation of an empty list.
    if not scored_sentences:
        return dict(top_n_summary=[], mean_scored_summary=[])

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-top_sentences:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
# Sample usage: summarize one article and print both summaries, each with its
# sentences joined by single spaces under a banner.
sample_url = 'http://radar.oreilly.com/2013/06/phishing-in-facebooks-pond.html'
summary = summarize(sample_url)
print "-------------------------------------------------"
print " 'Top N Summary'"
print "-------------------------------------------------"
print " ".join(summary['top_n_summary'])
print
print
print "-------------------------------------------------"
print " 'Mean Scored' Summary"
print "-------------------------------------------------"
print " ".join(summary['mean_scored_summary'])
def analyze_favorites(twitter_api, screen_name, entity_threshold=2):
    """Print a report on the favorites of *screen_name*.

    Requests up to 200 favorites, prints how many came back, prints a table
    of the entities occurring at least *entity_threshold* times, and finishes
    with the statistics printed by analyze_tweet_content.
    """
    # Could fetch more than 200 by walking the cursor as shown in other
    # recipes, but 200 is a good sample to work with.
    favs = twitter_api.favorites.list(screen_name=screen_name, count=200)
    print("%s %s" % ("Number of favorites:", len(favs)))

    # Figure out what some of the common entities are, if any, in the content
    frequent = get_common_tweet_entities(favs,
                                         entity_threshold=entity_threshold)

    # Use PrettyTable to create a nice tabular display
    table = PrettyTable(field_names=['Entity', 'Count'])
    for row in frequent:
        table.add_row(row)
    # Set column alignment
    table.align['Entity'] = 'l'
    table.align['Count'] = 'r'

    print('')
    print("Common entities in favorites...")
    print(table)

    # Print out some other stats
    print('')
    print("Some statistics about the content of the favorities...")
    print('')
    analyze_tweet_content(favs)

    # Could also start analyzing link content or summarized link content, and more.
# Sample usage: print the favorites report for one account using the default
# entity threshold.
twitter_api = oauth_login()
analyze_favorites(twitter_api, "ptwobrussell")