This IPython Notebook provides an interactive way to follow along with and explore the numbered examples from Mining the Social Web (2nd Edition). The intent behind this notebook is to reinforce the concepts from the sample code in a fun, convenient, and effective way. This notebook assumes that you are reading along with the book and have the context of the discussion as you work through these exercises.
In the somewhat unlikely event that you've stumbled across this notebook outside of its context on GitHub, you can find the full source code repository at https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition.
You are free to use or adapt this notebook for any purpose you'd like. However, please respect the Simplified BSD License that governs its use.
import requests
from getpass import getpass
import json
username = '' # Your GitHub username
password = '' # Your GitHub password
# Note that credentials will be transmitted over a secure SSL connection
url = 'https://api.github.com/authorizations'
note = 'Mining the Social Web, 2nd Ed.'
post_data = {'scopes':['repo'],'note': note }
response = requests.post(
    url,
    auth=(username, password),
    data=json.dumps(post_data),
)
print "API response:", response.text
print
print "Your OAuth token is", response.json()['token']
# Go to https://github.com/settings/applications to revoke this token
import json
import requests
# An unauthenticated request that doesn't contain an ?access_token=xxx query string
url = "https://api.github.com/repos/ptwobrussell/Mining-the-Social-Web/stargazers"
response = requests.get(url)
# Display one stargazer
print json.dumps(response.json()[0], indent=1)
print
# Display headers
for (k,v) in response.headers.items():
    print k, "=>", v
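The same request can also be made with authentication, which raises the rate limit considerably. A minimal sketch, assuming you've pasted the OAuth token from the first example into ACCESS_TOKEN and passed it as the access_token query parameter that the comment above alludes to:
import requests
ACCESS_TOKEN = '' # XXX: Paste in your OAuth token
url = "https://api.github.com/repos/ptwobrussell/Mining-the-Social-Web/stargazers"
# An authenticated request; compare the X-RateLimit-* headers to those of the
# unauthenticated request above
response = requests.get(url, params={'access_token': ACCESS_TOKEN})
print "X-RateLimit-Limit =>", response.headers.get('x-ratelimit-limit')
print "X-RateLimit-Remaining =>", response.headers.get('x-ratelimit-remaining')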
from github import Github
# XXX: Specify your own access token here
ACCESS_TOKEN = ''
# Specify a username and repository of interest for that user.
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'
client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)
# Get a list of people who have bookmarked the repo.
# Since you'll get a lazy iterator back, you have to traverse
# it if you want to get the total number of stargazers.
stargazers = [ s for s in repo.get_stargazers() ]
print "Number of stargazers", len(stargazers)
import networkx as nx
# Create a directed graph
g = nx.DiGraph()
# Add an edge to the directed graph from X to Y
g.add_edge('X', 'Y')
# Print some statistics about the graph
print nx.info(g)
print
# Get the nodes and edges from the graph
print "Nodes:", g.nodes()
print "Edges:", g.edges()
print
# Get node properties
print "X props:", g.node['X']
print "Y props:", g.node['Y']
# Get edge properties
print "X=>Y props:", g['X']['Y']
print
# Update a node property
g.node['X'].update({'prop1' : 'value1'})
print "X props:", g.node['X']
print
# Update an edge property
g['X']['Y'].update({'label' : 'label1'})
print "X=>Y props:", g['X']['Y']
# Expand the initial graph with (interest) edges pointing from each stargazer
# to the repository. Take care to ensure that user and repo nodes do not
# collide by appending their type to the node name.
import networkx as nx
g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)
for sg in stargazers:
    g.add_node(sg.login + '(user)', type='user')
    g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')
# Poke around in the current graph to get a better feel for how NetworkX works
print nx.info(g)
print
print g.node['Mining-the-Social-Web(repo)']
print g.node['ptwobrussell(user)']
print
print g['ptwobrussell(user)']['Mining-the-Social-Web(repo)']
# The next line would throw a KeyError since no such edge exists:
# print g['Mining-the-Social-Web(repo)']['ptwobrussell(user)']
print
print g['ptwobrussell(user)']
print g['Mining-the-Social-Web(repo)']
print
print g.in_edges(['ptwobrussell(user)'])
print g.out_edges(['ptwobrussell(user)'])
print
print g.in_edges(['Mining-the-Social-Web(repo)'])
print g.out_edges(['Mining-the-Social-Web(repo)'])
from operator import itemgetter
from IPython.display import HTML
from IPython.core.display import display
display(HTML('<img src="files/resources/ch07-github/kite-graph.png" width="400px">'))
# The classic Krackhardt kite graph
kkg = nx.generators.small.krackhardt_kite_graph()
print "Degree Centrality"
print sorted(nx.degree_centrality(kkg).items(),
             key=itemgetter(1), reverse=True)
print
print "Betweenness Centrality"
print sorted(nx.betweenness_centrality(kkg).items(),
             key=itemgetter(1), reverse=True)
print
print "Closeness Centrality"
print sorted(nx.closeness_centrality(kkg).items(),
             key=itemgetter(1), reverse=True)
# Add (social) edges from the stargazers' followers. This can take a while
# because of all of the potential API calls to GitHub. The approximate number
# of requests for followers made during each iteration of this loop can be
# calculated as math.ceil(sg.followers / 100.0), since the API returns up to
# 100 items at a time.
import sys
for i, sg in enumerate(stargazers):
    # Add "follows" edges between stargazers in the graph if any relationships exist
    try:
        for follower in sg.get_followers():
            if follower.login + '(user)' in g:
                g.add_edge(follower.login + '(user)', sg.login + '(user)',
                           type='follows')
    except Exception, e: # ssl.SSLError
        print >> sys.stderr, "Encountered an error fetching followers for", \
                             sg.login, "Skipping."
        print >> sys.stderr, e
    print "Processed", i+1, "stargazers. Num nodes/edges in graph", \
          g.number_of_nodes(), "/", g.number_of_edges()
    print "Rate limit remaining", client.rate_limiting
from operator import itemgetter
from collections import Counter
# Let's see how many social edges we added since last time.
print nx.info(g)
print
# The number of "follows" edges is the difference
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
print
# The repository owner is possibly one of the more popular users in this graph.
print len([e
           for e in g.edges_iter(data=True)
           if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)'])
print
# Let's examine the number of adjacent edges to each node
print sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10]
print
# Consider the ratio of incoming and outgoing edges for a couple of users with
# high node degrees...
# A user who follows many but is not followed back by many.
print len(g.out_edges('hcilab(user)'))
print len(g.in_edges('hcilab(user)'))
print
# A user who is followed by many but does not follow back.
print len(g.out_edges('ptwobrussell(user)'))
print len(g.in_edges('ptwobrussell(user)'))
print
c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
popular_users = [ (u, f) for (u, f) in c.most_common() if f > 1 ]
print "Number of popular users", len(popular_users)
print "Top 10 popular users:", popular_users[:10]
# Save your work by serializing out (pickling) the graph
nx.write_gpickle(g, "resources/ch07-github/data/github.gpickle.1")
# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("resources/ch07-github/data/github.gpickle.1")
from operator import itemgetter
# Create a copy of the graph so that we can iteratively mutate the copy
# as needed for experimentation
h = g.copy()
# Remove the seed of the interest graph, which is a supernode, in order
# to get a better idea of the network dynamics
h.remove_node('Mining-the-Social-Web(repo)')
# XXX: Remove any other nodes that appear to be supernodes.
# Filter any other nodes that you can by threshold
# criteria or heuristics from inspection.
# Display the centrality measures for the top 10 nodes
dc = sorted(nx.degree_centrality(h).items(),
            key=itemgetter(1), reverse=True)
print "Degree Centrality"
print dc[:10]
print
bc = sorted(nx.betweenness_centrality(h).items(),
            key=itemgetter(1), reverse=True)
print "Betweenness Centrality"
print bc[:10]
print
cc = sorted(nx.closeness_centrality(h).items(),
            key=itemgetter(1), reverse=True)
print "Closeness Centrality"
print cc[:10]
# Let's add each stargazer's additional starred repos and add edges
# to find additional interests.
MAX_REPOS = 500
for i, sg in enumerate(stargazers):
    print sg.login
    try:
        for starred in sg.get_starred()[:MAX_REPOS]: # Slice to avoid supernodes
            g.add_node(starred.name + '(repo)', type='repo', lang=starred.language,
                       owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
    except Exception, e: # ssl.SSLError
        print "Encountered an error fetching starred repos for", sg.login, "Skipping."
print "Processed", i+1, "stargazers' starred repos"
print "Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
print "Rate limit", client.rate_limiting
NOTE: Given that Example 12 is potentially very time-consuming to run, be sure to snapshot your work.
# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "resources/ch07-github/data/github.gpickle.2")
#import networkx as nx
#g = nx.read_gpickle("resources/ch07-github/data/github.gpickle.2")
Consider analysis similar to the earlier centrality analysis here. Create a copy of the graph, and be selective in pruning it or extracting subgraphs of interest, as in the sketch below.
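A minimal sketch of one possibility, reusing the supernode heuristic from the next example (the out-degree cutoff of 500 is an arbitrary threshold, not a recommendation):
from operator import itemgetter
# Work on a copy so that the full graph remains intact
h = g.copy()
# Remove the seed of the interest graph along with any apparent supernodes,
# here approximated as users gazing at more than 500 repositories
h.remove_node('Mining-the-Social-Web(repo)')
h.remove_nodes_from([n
                     for n in h.nodes()
                     if h.node[n]['type'] == 'user' and len(h.out_edges(n)) > 500])
dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Degree Centrality"
print dc[:10]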
# Poke around: how to get users/repos
from operator import itemgetter
print nx.info(g)
print
# Get a list of repositories from the graph.
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']
# Most popular repos
print "Popular repositories"
print sorted([(n,d)
              for (n,d) in g.in_degree_iter()
              if g.node[n]['type'] == 'repo'],
             key=itemgetter(1), reverse=True)[:10]
print
# Projects gazed at by a user
print "Respositories that ptwobrussell has bookmarked"
print [(n, g.node[n]['lang'])
       for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'gazes']
print
# Programming languages for each user
print "Programming languages ptwobrussell is interested in"
print list(set([g.node[n]['lang']
                for n in g['ptwobrussell(user)']
                if g['ptwobrussell(user)'][n]['type'] == 'gazes']))
print
# Find supernode candidates in the graph by looking for users with an
# unusually high number of outgoing edges
print "Supernode candidates"
print sorted([(n, len(g.out_edges(n)))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500],
             key=itemgetter(1), reverse=True)
# Iterate over all of the repos, and add edges for programming languages
# for each person in the graph. We'll also add edges back to repos so that
# we have a good point to "pivot" upon.
repos = [n
         for n in g.nodes_iter()
         if g.node[n]['type'] == 'repo']
for repo in repos:
    lang = (g.node[repo]['lang'] or "") + "(lang)"
    stargazers = [u
                  for (u, r, d) in g.in_edges_iter(repo, data=True)
                  if d['type'] == 'gazes']
    for sg in stargazers:
        g.add_node(lang, type='lang')
        g.add_edge(sg, lang, type='programs')
        g.add_edge(lang, repo, type='implements')
# Some stats
print nx.info(g)
print
# What languages exist in the graph?
print [n
       for n in g.nodes_iter()
       if g.node[n]['type'] == 'lang']
print
# What languages do users program with?
print [n
       for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'programs']
# What is the most popular programming language?
print "Most popular languages"
print sorted([(n, g.in_degree(n))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'lang'],
             key=itemgetter(1), reverse=True)[:10]
print
# How many users program in a particular language?
python_programmers = [u
                      for (u, l) in g.in_edges_iter('Python(lang)')
                      if g.node[u]['type'] == 'user']
print "Number of Python programmers:", len(python_programmers)
print
javascript_programmers = [u
                          for (u, l) in g.in_edges_iter('JavaScript(lang)')
                          if g.node[u]['type'] == 'user']
print "Number of JavaScript programmers:", len(javascript_programmers)
print
# Which users program in both Python and JavaScript?
print "Number of programmers who use JavaScript and Python"
print len(set(python_programmers).intersection(set(javascript_programmers)))
# Programmers who use JavaScript but not Python
print "Number of programmers who use JavaScript but not Python"
print len(set(javascript_programmers).difference(set(python_programmers)))
# XXX: Can you determine who is the most polyglot programmer?
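One possible approach to that exercise, as a hedged sketch: rank users by how many distinct languages they are connected to through "programs" edges.
from operator import itemgetter
# Count each user's distinct "programs" edges; since each language is a
# single node, the count equals the number of distinct languages
polyglots = sorted([(u, len([n
                             for n in g[u]
                             if g[u][n]['type'] == 'programs']))
                    for u in g.nodes_iter()
                    if g.node[u]['type'] == 'user'],
                   key=itemgetter(1), reverse=True)
print "Most polyglot programmers:", polyglots[:10]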
NOTE: Optionally, snapshot the final graph
# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "resources/ch07-github/data/github.gpickle.3")
#import networkx as nx
#g = nx.read_gpickle("resources/ch07-github/data/github.gpickle.3")
import os
import json
from IPython.display import IFrame
from IPython.core.display import display
from networkx.readwrite import json_graph
print "Stats on the full graph"
print nx.info(g)
print
# Create a subgraph from a collection of nodes. In this case, the
# collection is all of the users in the original interest graph
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)
print "Stats on the extracted subgraph"
print nx.info(h)
# Visualize the social network of all people from the original interest graph.
d = json_graph.node_link_data(h)
json.dump(d, open('resources/ch07-github/force.json', 'w'))
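# For reference, node_link_data produces a plain dictionary with "nodes" and
# "links" keys, which is the structure the D3 template consumes. A quick
# sanity check on what was just serialized:
print "Serialized", len(d['nodes']), "nodes and", len(d['links']), "links"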
# IPython Notebook can serve files and display them in inline
# frames. Prepend the path with the 'files' prefix.
# A D3 template for displaying the graph data.
viz_file = 'files/resources/ch07-github/force.html'
# Display the D3 visualization.
display(IFrame(viz_file, '100%', '600px'))