The lab is divided in four parts
We use a dataset of real Facebook interaction graph
In some cases, we use a small (directed) toy graph
# interactive plots in Jupyter, used to show plots inline in the notebook
%matplotlib inline
# The igraph library
from igraph import *
# Numpy for enhanced math array management
import numpy as np
# statistical tools (we only use ECDF)
from statsmodels.distributions.empirical_distribution import ECDF
# Mathematical plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# use to control whether to show the entire cell output or only the last_expr (default)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# to generate random numbers
from random import *
# to fit power law distributions
from powerlaw import *
Notes
Packages needed are
fg = read("./facebook.ncol", format = "ncol", directed = True)
Notes
summary(fg, verbosity = 1, max_rows = 25, edge_list_format = "edgelist")
Notes
The four initial letters in the first row
Then
Edgelist format shows the graph one edge per row
See the $\tt{GraphSummary}$ documentation for more details on how to format the output
The undirected version of the graph
fg_u = read("./facebook.ncol", format = "ncol", directed = False)
summary(fg_u, verbosity = 1, max_rows = 25, edge_list_format = "edgelist")
Notes Note edges with IDs 0 and 20
Vertex, Edge, VertexSeq and EdgeSeq
# Vertex Sequence of the graph method vs()
fg_vs = fg.vs()
# the attributes of the vertices
fg_vs.attributes()
# VertexSeq can be accessed as arrays, e.g., the 11-th vertices in the sequence is
fg_vs[10]
# VertexSeq can also be accessed as dictionaries, e.g., the names of the first ten nodes
# Note:
# - fg_vs["name"] is a list with all names of all vertices, from which we only pick the first 10 entries
fg_vs["name"][0:9]
# One specific vertex (a Vertex object)
vert = fg_vs[0]
vert
# Attributes of the vertex, as a dict object
vert.attributes()
# So, attributes can be accessed as dictionaries of a Vertex object
vert["name"]
# The index of the vertex and the graph it belongs to
vert.index
vert.graph
# VertexSeq can be indexed
names = [v["name"] for v in fg_vs]
names[0:9]
Note
Very similar to Vertex sequences
# Edge Sequence of the graph method es()
fg_es = fg.es()
# the attributes of the edges
fg_es.attributes()
# EdgeSeq can be accessed as arrays, e.g., the fisrt edge in the sequence is
fg_es[0]
# EdgeSeq can also be accessed as dictionaries, e.g., the names of the first ten nodes
# Note:
# - fg_es["weight"] is a list with all weights of all edges, from which we only pick the first 10 entries
fg_es["weight"][0:9]
# One specific edge (an Edge object)
edge = fg_es[0]
edge
# Attributes of the edge, as a dict object
edge.attributes()
# So, attributes can be accessed as dictionaries of a Vertex object
edge["weight"]
# The index of the vertex and the graph it belongs to
edge.index
edge.graph
# The IDs of the vertices forming the edge
edge.tuple
edge.source
edge.target
# The names of the vertices forming the edge
fg_vs[edge.source]["name"]
fg_vs[edge.target]["name"]
# EdgeSeq can be indexed
weights = [e["weight"] for e in fg_es]
weights[0:9]
select a subset of elements from a Sequence object (either Vertex of Edges), based on properties of the elements of the set
# select based on attributes
vs_sel_name = fg_vs.select(name_eq = "2")
vs_sel_name[0]
# select based on callable object
def index5(v):
if v.index < 5:
return True
else:
return False
vs_sel_index = fg_vs.select(index5)
vs_sel_index["name"]
# select based on iterables
indices = [1, 2, 3]
vs_sel_iter = fg_vs.select(indices)
vs_sel_iter["name"]
# find() only finds the first object matching the condition
# NB: when no operator is given explictly, igraph assumes that "name" is used
# therefore, the following is equivalent to find(name_eq = "2")
vs_find_name = fg.vs.find("2")
vs_find_name["name"]
vs_find_name.index
Notes
Select based on attributes
Select based on callable object
Select based on iterables
with the $\tt{write()}$ method
write(fg, "./facebook.pajek", format = "pajek")
Vertices and edges can be added/removed.
We use a small toy graph as example, stored in NCOL format in $\tt{./toy\_graph.ncol}$
toy_g = read("./toy_graph.ncol", format = "ncol", directed = True)
summary(toy_g, verbosity = 1, edge_list_format = "edgelist")
# Add two vertices whose names are "6" and "7"
toy_g.add_vertices(["6","7"])
toy_g.vs["name"]
toy_g.vs.select(name_eq = "6")[0].index
Notes
Multiple nodes with the same name can exist (igraph identifies vertices by IDs)
toy_g.add_vertices(["6","7"])
toy_g.vs["name"]
# remove vertices
toy_g.delete_vertices(["6","7"])
toy_g.vs["name"]
# remove vertices by index
toy_g.delete_vertices([5,6])
toy_g.vs["name"]
# vertices can be added/delected also by using a Vertex Sequence
toy_g.add_vertices(["6","7"])
toy_g.vs["name"]
to_delete = toy_g.vs.select(index5)
toy_g.delete_vertices(to_delete)
toy_g.vs["name"]
toy_g = read("./toy_graph.ncol", format = "ncol", directed = True)
summary(toy_g, verbosity = 1, edge_list_format = "edgelist")
# Add:
# - two vertices, with name "a" and "b"
# - weighted links between them
# - a link from "5" to "a"
# - see http://igraph.org/python/doc/igraph.Graph-class.html#add_edge
toy_g.add_vertices(["a","b"])
toy_g.add_edge("a","b",weight = 0.02)
toy_g.add_edge("b","a",weight = 0.01)
toy_g.add_edge("5","a",weight = 0.03)
summary(toy_g, verbosity = 1, edge_list_format = "edgelist")
# see http://igraph.org/python/doc/igraph.Graph-class.html#delete_edges
# We want to remove edges between nodes "a" and "b"
# First way: use edges IDs
# - edges IDs as a list
def first_way():
edge_ids = toy_g.get_eids([("a","b"),("b","a")])
toy_g.delete_edges(edge_ids)
# Second way: delete edges by the names of the vertices
# - set of edges as list of tuples
def second_way():
toy_g.delete_edges([("a","b"), ("b","a")])
# Third way: delete edges by the IDs of the vertices
def third_way():
id1 = toy_g.vs.select(name_eq = "a")[0].index
id2 = toy_g.vs.select(name_eq = "b")[0].index
toy_g.delete_edges([(id1,id2), (id2, id1)])
# Fourth_way: delete an EdgeSeq
def fourth_way():
id1 = toy_g.vs.select(name_eq = "a")[0].index
id2 = toy_g.vs.select(name_eq = "b")[0].index
del_edges = toy_g.es.select(_between = ([id1],[id2]))
toy_g.delete_edges(del_edges)
# emulate a switch
cases = {
'first' : first_way,
'second' : second_way,
'third' : third_way,
'fourth' : fourth_way
}
my_case = cases['first']
my_case()
# check the result
summary(toy_g, verbosity=1, edge_list_format = "edgelist")
# put the deleted edges back
toy_g.add_edge("a","b",weight = 0.02)
toy_g.add_edge("b","a",weight = 0.01)
summary(toy_g, verbosity=1, edge_list_format = "edgelist")
Notes
# Adding an attribute to nodes
toy_g.vs["color"] = 'red'
# Setting the attribute of a specific node
id = toy_g.vs.find(name_eq = 'a').index
toy_g.vs[id]["color"] = 'blue'
summary(toy_g, verbosity=1, edge_list_format = "edgelist", print_vertex_attributes = True)
# deleting attributes
# - vertices seen as a dictionary
del toy_g.vs["color"]
Setting edge attributes
# All of type_B
toy_g.es["type"] = 'type_B'
# Vertex set with indices of the first group
vs1 = toy_g.vs.select(name_in = ["1","2"])
# Vertex set with indices of the second group
vs2 = toy_g.vs.select(name_in = ["3","4"])
# Edges between nodes in the two sets
es = toy_g.es.select(_between = (vs1, vs2))
# Set the attribute for those edges
es["type"] = 'type_A'
summary(toy_g, verbosity=1, edge_list_format = "edgelist")
# Check whether the graph is connected or not
toy_g.is_connected(mode = "WEAK")
toy_g.delete_edges(toy_g.get_eid("5","a"))
toy_g.is_connected(mode = "WEAK")
# Compute the connected components in the graph
# - "WEAK" does not consider the direction of edges
toy_g_clust = toy_g.clusters(mode = "WEAK")
# the number of clusters
len(toy_g_clust)
# the membership of vertices in the clusters
toy_g_clust.membership
# the sizes of the clusters
toy_g_clust.sizes()
# the vertex IDs of the first cluster
toy_g_clust[0]
# the Giant Componet (the biggest cluster)
toy_g_GC = toy_g_clust.giant()
summary(toy_g_GC, verbosity = 1, edge_list_format = "edgelist")
# Equivalent ways (3. also works for non-Giant components)
# 1. obtain the id of the giant component (index of the max in the cluster sizes array)
# extract the subgraph
id_giant = np.argmax(toy_g_clust.sizes())
toy_g_GC = toy_g_clust.subgraph(id_giant)
summary(toy_g_GC, verbosity = 1, edge_list_format = "edgelist")
# 2. get the members of the GC (as vertices IDs)
# use the induced_subgraph() method of Graph
gc_members = toy_g_clust[id_giant]
gc_members
toy_g_GC = toy_g.induced_subgraph(gc_members)
summary(toy_g_GC, verbosity = 1, edge_list_format = "edgelist")
# 3. find the i-th largest component
def comp_id(clust, i):
# clust must be a VertexClustering object
sorted_sizes = sorted(clust.sizes())
size = sorted_sizes[-i]
comp_id = np.flatnonzero(np.array(clust.sizes()) == size)[0]
return comp_id
# this is the Giant Componet (i=1)
cid = comp_id(toy_g_clust, 1)
subG = toy_g_clust.subgraph(cid)
summary(subG, verbosity = 1, edge_list_format = "edgelist")
# this is the second largest component (i=2)
cid = comp_id(toy_g_clust, 2)
subG = toy_g_clust.subgraph(cid)
summary(subG, verbosity = 1, edge_list_format = "edgelist")
# weakly connected components means that it is sufficient that nodes are connected via a unidirected path
# strongly connected components means that nodes must be mutually connected via two unidirected paths
fg_clust = fg.clusters(mode=WEAK)
fb_GC = fg_clust.giant()
summary(fb_GC, verbosity = 1, edge_list_format = "edgelist", max_rows = 20)
# number of vertices and edges in the original graph
fg.vcount()
fg.ecount()
# number of clusters
len(fg_clust)
# sizes (sorted, first 20 elements)
sorted(fg_clust.sizes(), reverse=True)[0:19]
This is based on the igraph $\tt{plot()}$ method, which is built on the $\tt{Cairo}$ package (and specifically on the $\tt{pycairo}$ bindings between Cairo and Python).
The layout of the graph (i.e., the coordinates) of the vertices, must be computed first, with the $\tt{layout()}$ functions.
# Compute the layout, using one of the possible methods (Fructherman-Reingold)
layout = toy_g.layout_fruchterman_reingold()
# Plot the graph
plot(toy_g, layout = layout)
# Beautyfying the the plot with some visual style attributes
# - define a dictionary for setting the required options
# - use the dictionary as the plot parameter
# - see http://igraph.org/python/doc/tutorial/tutorial.html for the set of possible options
colors = ['red', 'blue']
shapes = ['rectangle', 'circle']
sizes = [10,20]
try:
del visual_style
visual_style = {}
except NameError:
visual_style = {}
visual_style["bbox"] = (300,300)
visual_style["vertex_label"] = toy_g.vs["name"]
visual_style["vertex_label_dist"] = 2
visual_style["vertex_size"] = [sizes[toy_g_clust.membership[i]] for i in range(toy_g.vcount())]
visual_style["vertex_color"] = [colors[toy_g_clust.membership[i]] for i in range(toy_g.vcount())]
visual_style["vertex_shape"] = [shapes[toy_g_clust.membership[i]] for i in range(toy_g.vcount())]
visual_style["layout"] = layout
plot(toy_g, **visual_style)
plot(toy_g, "toy_g_plot.pdf", **visual_style)
When using $\tt{plot()}$ with a VertexClustering object
try:
del visual_style
visual_style = {}
except NameError:
visual_style = {}
plot(toy_g_clust, layout=layout, mark_groups = {0: 'blue', 1:'red'})
# Select only nodes with degree > 100 from the FB Giant Component
vs = fb_GC.vs.select(_degree_gt = 100)
fb_g_sub = fb_GC.induced_subgraph(vs)
try:
del visual_style
visual_style = {}
except NameError:
visual_style = {}
visual_style["bbox"] = (600,600)
visual_style["label"] = []
visual_style["layout"] = fb_g_sub.layout_fruchterman_reingold()
visual_style["vertex_size"] = 5
visual_style["vertex_color"] = 'red'
visual_style["vertex_shape"] = 'circle'
visual_style["edge_arrow_size"] = 0.2
visual_style["edge_width"] = np.array(fb_g_sub.es["weight"])/10
plot(fb_g_sub, **visual_style)
Notes
# degree() method
# - mode = "ALL" to consider the undirected graph
fb_deg = fb_GC.degree(mode = "all")
fb_deg[0:19]
# the maximum degree, and the ID of the node with maximum degree
max(fb_deg)
id_max = np.argmax(fb_deg)
id_max
# the set of neighbours of the node with max degree
# - NB: in case of bidirectional links, the same neighbour is counted twice if mode = 'all'
nei = fb_GC.neighbors(id_max, mode="all")
len(nei)
# the set of nodes reachable from id_max with AT MOST 1 jump
neighbours = fb_GC.neighborhood(id_max, order = 1, mode="all")
neighbours[0:19]
# the number of such nodes
# - NB: it also includes the node id_max itself (which is reachable with 0 jumps)
# - thus, the number of nodes reachable with one jump is this - 1
len(neighbours)
fb_GC.neighborhood_size(id_max, order = 1, mode="all")
Note Why is the output of $\tt{neighbourhood\_size()}$ different from the length of $\tt{nei}$?
Let's redo the same on the equivalent undirected graph
# take the undirected version of the Giant Component
# combine_edges tells what to do with the weights (default, lost attribute; here: sum values)
fb_GC_u = fb_GC.as_undirected(combine_edges = "sum")
# Note the lower number of edges with respect to the directed version.
# This is because igraph automatically simplifies the graph (i.e., merges edges between the same nodes)
# to do so manually on a multi-edge graph: g.simplify()
# to check if the graph is simple or not: g.is_simple()
summary(fb_GC_u, verbosity = 1, edge_list_format = "edgelist", max_rows = 25)
# the maximum degree, and the ID of the node with maximum degree
fb_deg_u = fb_GC_u.degree()
max(fb_deg_u)
id_max_u = np.argmax(fb_deg_u)
id_max_u
# the set of neighbours of the node with max degree
nei_u = fb_GC_u.neighbors(id_max_u)
len(nei_u)
# the set of nodes reachable from id_max with AT MOST 1 jump
neighbours = fb_GC_u.neighborhood(id_max, order = 1, mode="all")
neighbours[0:19]
# the number of such nodes
# - NB: it also includes the node id_max itself (which is reachable with 0 jumps)
# - thus, the number of nodes reachable with one jump is this - 1
len(neighbours)
fb_GC_u.neighborhood_size(id_max, order = 1, mode="all")
One way to visualise the density of the degree is through a histogram
igraph includes a method to compute the histogram of the degree distribution
But we use Numpy/Matplotlib methods, which are more flexible
dd_h, dd_h_bins, _ = plt.hist(fb_deg, bins=range(1,max(fb_deg)+2), normed = True, color = 'red')
Notes
# how do the histogram and bins look like
dd_h[0:19]
dd_h_bins[0:19]
# zoom in
dd_h, dd_h_bins, _ = plt.hist(fb_deg, bins=range(1,max(fb_deg)+2), normed = True, color = 'red')
plt.axis([0,50,0,0.2])
# Degree density on a loglog scale
plt.loglog(dd_h_bins[:-1], dd_h, 'bo')
plt.xlabel("d")
plt.ylabel("P(Degree = d)")
plt.title("Degree density on a log-log scale")
# Compute the CCDF - we can use 2 ways
# 1. use the histogram functions with parameter cumulative=-1 gives the CCDF
dd_h, dd_h_bins, _ = plt.hist(fb_deg, bins=range(1,max(fb_deg)+2), normed = True, color = 'red', cumulative = -1)
plt.axis([0,100,0,1])
# 2. More general: use the ECDF function of statsmodels.distributions.empirical_distribution
# ECDF(dataset) returns a the empirical CDF computed from the dataset, which can be used as a FUNCTION
# - i.e., it is possible to call ECDF(x) for any x, irrespective of the set of data from which the ECDF is derived
deg_cdf = ECDF(fb_deg)
# scale the fig size twice in length
default_sizes = plt.rcParams["figure.figsize"]
fig_sizes = (2*default_sizes[0], default_sizes[1])
# generate a figure with 2 subplots, organised in 1 row and 2 columns
# ax1 and ax2 ("axes") are used to access the individual plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = fig_sizes)
# plot the CCDF in lin-lin and log-log scales
# see http://matplotlib.org/api/axes_api.html for the API of the Axis class
# see http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot or the Axes.plot() documentation
# for the parameters of the plot method
degs = np.arange(1,max(fb_deg)+1)
ax1.plot(degs, 1-deg_cdf(degs), 'bo')
ax1.set_xlabel("$d$")
ax1.set_ylabel("$P(D>d)$")
ax1.set_title("Degree CCDF in a lin-lin scale")
ax2.loglog(degs, 1-deg_cdf(degs), 'bo')
ax2.set_xlabel("$d$")
ax2.set_ylabel("$P(D>d)$")
ax2.set_title("Degree CCDF in a log-log scale")
_ = plt.loglog(degs, 1-deg_cdf(degs), 'bo')
_ = plt.xlabel("$d$")
_ = plt.ylabel("$P(D>d)$")
_ = plt.title("Degree CCDF in a log-log scale")
# The global assortativity coefficient - we will need the undirected version of the graph from now on
fb_GC_u.assortativity_degree()
# The knn data. Two lists are returned
# - the list of average degrees for each node (knn)
# - the list of average degrees for each degree (knnk)
fb_knn, fb_knnk = fb_GC_u.knn()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = fig_sizes)
ax1.loglog(fb_GC_u.degree(), fb_knn, 'go')
ax1.set_xlabel("degree")
ax1.set_ylabel("Neighbors degree")
ax1.set_title("$knn$ index for the FB Giant Component")
ax2.loglog(range(1,max(fb_GC_u.degree())+1), fb_knnk, 'go')
ax2.set_xlabel("degree")
ax2.set_ylabel("Average degree of neighbors")
ax2.set_title("$knnk$ index for the FB Giant Component")
# global transitivity (C1)
# as_undirected(), otherwise results would be wrong
fb_GC_u.transitivity_undirected()
# average local transitivity (C2)
# mode = 0 means that nodes with less than two neighbours will have zero transitivity
fb_GC_u.transitivity_avglocal_undirected(mode="zero")
# local transitivity for all vertices
local_trans = fb_GC_u.transitivity_local_undirected(mode="zero")
local_trans[0:19]
mean(local_trans)
# we use a subset of the graph, for computational purposes, i.e., we pick the subgraph of nodes in the GC with degree>60
idx = np.argwhere(np.array(fb_GC_u.degree())>60).flatten()
sub_g = fb_GC_u.induced_subgraph(idx)
# closeness centrality
sub_g.closeness()[0:19]
# betweenness centrality
sub_g.betweenness()[0:19]
# edge betweenness, and edge with the maximum edge betweenness
edge_bet = sub_g.edge_betweenness()
idx_max = np.argmax(edge_bet)
idx_max
edge_bet[idx_max]
sub_g.es[idx_max].tuple
idx = np.argwhere(np.array(fb_GC_u.degree())>100).flatten()
sub_g1 = fb_GC_u.induced_subgraph(idx)
# Fastgreedy algorithm
vd = sub_g1.community_fastgreedy()
# The number of detected communities
vd.optimal_count
# convert to a cluster object to plot it nicely
vd_clust = vd.as_clustering()
plot(vd_clust, layout=sub_g1.layout_fruchterman_reingold(), mark_groups = True)
# identify the indices of edges that cut across different communities
cros = np.array(vd_clust.crossing())
np.argwhere(cros == True).flatten()
# generated a sample of 100 vertices as source/destinations
src = sample(fb_GC_u.vs, 100)
trg = sample(fb_GC_u.vs, 100)
# computed the UNWEIGHTED shortest paths
# - result is a matrix, while we need an array for plotting
n_hops_u = fb_GC_u.shortest_paths(source = src, target = trg, weights = None)
n_hops_u = np.array(n_hops_u).flatten()
_,_,_ = plt.hist(n_hops_u, bins = range(1,max(n_hops_u)+2), normed = True)
_ = plt.axvline(mean(n_hops_u), color = 'gold', linewidth = 4)
_ = plt.xlabel("# of hops")
_ = plt.ylabel("frequency")
_ = plt.title("Histogram of UNWEIGHTED path length")
# WEIGHTED SHORTEST PATHS
# - weights in the dataset are the importance of the relationship,
# while shortest paths assume this is a cost
# - so, use 1/weights as the weights of the shortest path computations
weights = 1/np.array(fb_GC_u.es["weight"])
# fb_GC.shortest_paths returns the WEIGHTED NUMBER OF HOPS
# - instead, we need the number of hops on the shortest (weighted) path
# - so, we need to:
# - compute the shortest weighted paths (method get_shortest_paths)
# - extract the # of hops
# - get_shortest_paths() can be used on a SINGLE source vertices at a time
# - so, we need to cycle over all elements of src,
# and compute the number of hops towards all target nodes (function n_hops)
def n_hops(s, t, w):
sps = fb_GC_u.get_shortest_paths(s, t, output = 'epath', weights = w)
return [len(sps[i]) for i in range(0,len(sps))]
n_hops_w = []
for s in src:
n_hops_w += n_hops(s, trg, weights)
# prepare for plotting the two distributions of shortest paths lengths
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = fig_sizes)
# plot the density and the average value (as a vertical line) of the unweighted distribution in the first plot
h_values,_,_ = ax1.hist(n_hops_u, bins = range(1,max(n_hops_u)+2), normed = True)
mean_u_hops = mean(n_hops_u)
_ = ax1.axvline(mean_u_hops, color = 'gold', linewidth = 4)
_ = ax1.text(1.2*mean_u_hops, max(h_values), "avg_hops = %.2f" % mean_u_hops)
_ = ax1.set_xlabel("# of hops")
_ = ax1.set_ylabel("frequency")
_ = ax1.set_title("Histogram of UNWEIGHTED path length")
# plot the density and the average value (as a vertical line) of the weighted distribution in the second plot
h_values,_,_ = ax2.hist(n_hops_w, bins = range(1,max(n_hops_w)+2), normed = True, color = 'red')
mean_w_hops = mean(n_hops_w)
_ = ax2.axvline(mean(n_hops_w), color = 'gold', linewidth = 4)
_ = ax2.text(1.2*mean_w_hops, max(h_values), "avg_hops = %.2f" % mean_w_hops)
_ = ax2.set_xlabel("# of hops")
_ = ax2.set_ylabel("frequency")
_ = ax2.set_title("Histogram of WEIGHTED path length")
According to the Erdős–Rényi model, a random graph is constructed by connecting nodes randomly with probability $p$. We build an Erdős–Rényi model equivalent to the $\tt{subg}$ graph
# we use the sub_g1 FB graph only for visual comparison here
# compute parameter p from the sub_g1 graph
er_p_fb = mean(sub_g1.degree())/sub_g1.vcount()
# create the equivalent Erdos-Renyi graph
er_fb_sub = Graph.Erdos_Renyi(sub_g1.vcount(), er_p_fb)
try:
del visual_style
visual_style = {}
except NameError:
visual_style = {}
visual_style["label"] = []
visual_style["layout"] = er_fb_sub.layout_circle()
visual_style["vertex_size"] = 10
visual_style["vertex_color"] = 'gold'
visual_style["vertex_shape"] = 'circle'
visual_style["edge_width"] = 2
visual_style["bbox"] = (250,250)
plot(er_fb_sub, **visual_style)
visual_style["layout"] = sub_g1.layout_circle()
plot(sub_g1, **visual_style)
# Now we compare the degree distributions for the complete fb Giant Component
er_p_GC = mean(fb_GC_u.degree())/fb_GC_u.vcount()
er_fb_all = Graph.Erdos_Renyi(fb_GC_u.vcount(), er_p_GC)
# take only the Giant Component
er_fb = er_fb_all.clusters(mode = "WEAK").giant()
er_fb.vcount()
fb_GC_u.vcount()
# we use GridSpecs for a finer control of the plot positioning
fig_sizes = (fig_sizes[0], 2*default_sizes[1])
f = plt.figure(figsize = fig_sizes)
# create a 2x2 Grid Specification
gs = gridspec.GridSpec(2, 2)
# add subplots to the figure, using the GridSpec gs
# position [0,0] (upper-left corner)
ax1 = plt.subplot(gs[0,0])
ax2 = plt.subplot(gs[0,1])
# the third plot spans the entire second row
ax3 = plt.subplot(gs[1,:])
# compute and plot the histogram of FB degrees
d_fb = fb_GC_u.degree()
_,_,_ = ax1.hist(d_fb, bins=range(1,max(d_fb)+2), normed = True, color = 'red')
_ = ax1.set_xlim(0,80)
_ = ax1.set_xlabel("$d$")
_ = ax1.set_ylabel("Frequencies")
_ = ax1.set_title("Histogram of FB degrees")
# compute and plot the histogram of ER degrees
d_er = er_fb.degree()
_,_,_ = ax2.hist(d_er, bins=range(1,max(d_er)+2), normed = True, color = 'blue')
_ = ax2.set_xlim(0,80)
_ = ax2.set_xlabel("$d$")
_ = ax2.set_ylabel("Frequencies")
_ = ax2.set_title("Histogram of ER degrees")
# compute and plot the degree CCDFs
fb_ecdf = ECDF(d_fb)
er_ecdf = ECDF(d_er)
x = np.arange(1,max(d_fb)+1)
_ = ax3.loglog(x, 1-fb_ecdf(x), 'ro', label = 'Facebook')
x = np.arange(1,max(d_er)+1)
_ = ax3.loglog(x, 1-er_ecdf(x), 'bo', label = 'Erdos-Renyi')
_ = ax3.set_xlabel("$d$")
_ = ax3.set_ylabel("$P(D>d)$")
_ = ax3.set_title("Comparison between degree CCDFs")
_ = ax3.legend(numpoints = 1)
# clustering
fb_GC_u.transitivity_undirected()
fb_GC_u.transitivity_avglocal_undirected(mode="zero")
er_fb.transitivity_undirected()
er_fb.transitivity_avglocal_undirected(mode="zero")
# Shortest path lenght
# on a subset of the nodes, as otherwise it will take forever to compute
fb_vs_src = sample(fb_GC_u.vs, 1000)
fb_vs_trg = sample(fb_GC_u.vs, 1000)
er_vs_src = sample(er_fb.vs, 1000)
er_vs_trg = sample(er_fb.vs, 1000)
fb_sp = mean(np.array(fb_GC_u.shortest_paths(fb_vs_src, fb_vs_trg, weights=None)).flatten())
fb_sp
fb_GC_u.vcount()
mean(np.array(er_fb.shortest_paths(er_vs_src, er_vs_trg)).flatten())
er_fb.vcount()
Given the number of nodes $N$, the mean degree $k$, and the rewiring probability $\beta$, the model constructs a graph (undirected) with $N$ nodes and $\frac{Nk}{2}$ edges following these steps:
# visual comparison, with small graph
ws_p = 0.08
avg_deg_sub = mean(sub_g1.degree())
ws_sub = Graph.Watts_Strogatz(1, sub_g1.vcount(), int(round(avg_deg_sub)), ws_p)
# check if ws_p was ok to reproduce the same clustering
ws_sub.transitivity_undirected()
sub_g1.transitivity_undirected()
#plot
visual_style["layout"] = ws_sub.layout_circle()
plot(ws_sub, **visual_style)
visual_style["layout"] = sub_g1.layout_circle()
plot(sub_g1, **visual_style)
# Analyse the entire graph
# The value of <k> and the rewiring probability
avg_deg_fb = mean(fb_GC_u.degree())
ws_p = 0.3
ws_fb_all = Graph.Watts_Strogatz(1, fb_GC_u.vcount(), int(round(avg_deg_fb)), 0.3)
# take only the Giant Component
ws_fb = ws_fb_all.clusters(mode = "WEAK").giant()
ws_fb.vcount()
fb_GC_u.vcount()
# clustering coefficient
ws_fb.transitivity_undirected()
fb_GC_u.transitivity_undirected()
# Shortest path lenght
# on a subset of the nodes, as otherwise it will take forever to compute
ws_vs_src = sample(ws_fb.vs, 1000)
ws_vs_trg = sample(ws_fb.vs, 1000)
mean(np.array(ws_fb.shortest_paths(ws_vs_src, ws_vs_trg)).flatten())
fb_sp
# Now we compare the degree distributions for the complete fb Giant Component
# we use GridSpecs for a finer control of the plot positioning
fig_sizes = (fig_sizes[0], 2*default_sizes[1])
f = plt.figure(figsize = fig_sizes)
# create a 2x2 Grid Specification
gs = gridspec.GridSpec(2, 2)
# add subplots to the figure, using the GridSpec gs
# position [0,0] (upper-left corner)
ax1 = plt.subplot(gs[0,0])
ax2 = plt.subplot(gs[0,1])
# the third plot spans the entire second row
ax3 = plt.subplot(gs[1,:])
# compute and plot the histogram of FB degrees
d_fb = fb_GC_u.degree()
_,_,_ = ax1.hist(d_fb, bins=range(1,max(d_fb)+2), normed = True, color = 'red')
_ = ax1.set_xlim(0,80)
_ = ax1.set_xlabel("$d$")
_ = ax1.set_ylabel("Frequencies")
_ = ax1.set_title("Histogram of FB degrees")
# compute and plot the histogram of WS degrees
d_ws = ws_fb.degree()
_,_,_ = ax2.hist(d_ws, bins=range(1,max(d_ws)+2), normed = True, color = 'blue')
_ = ax2.set_xlim(0,80)
_ = ax2.set_xlabel("$d$")
_ = ax2.set_ylabel("Frequencies")
_ = ax2.set_title("Histogram of WS degrees")
# compute and plot the degree CCDFs
fb_ecdf = ECDF(d_fb)
ws_ecdf = ECDF(d_ws)
x = np.arange(1,max(d_fb)+1)
_ = ax3.loglog(x, 1-fb_ecdf(x), 'ro', label = 'Facebook')
x = np.arange(1,max(d_ws)+1)
_ = ax3.loglog(x, 1-ws_ecdf(x), 'bo', label = 'Watts-Strogatz')
_ = ax3.set_xlabel("$d$")
_ = ax3.set_ylabel("$P(D>d)$")
_ = ax3.set_title("Comparison between degree CCDFs")
_ = ax3.legend(numpoints = 1)
The model starts from an initial network of $m_{0}$ nodes.
New nodes are added to the network one at a time. Each new node is connected to $m\leq m_{0}$ existing nodes with a probability that is proportional to the number of links that the existing nodes already have. The probability that a new node is connected to node $i$ is the following:
$$p_{i}={\frac {k_{i}}{\sum _{j}k_{j}}},$$where $k_i$ is the degree of node $i$
# visual comparison, with small graph
ba_sub = Graph.Barabasi(sub_g1.vcount())
#plot
visual_style["layout"] = ba_sub.layout_circle()
plot(ba_sub, **visual_style)
visual_style["layout"] = sub_g1.layout_circle()
plot(sub_g1, **visual_style)
# Analyse the entire graph
ba_fb_all = Graph.Barabasi(fb_GC_u.vcount(), m=2)
# take only the Giant Component
ba_fb = ba_fb_all.clusters(mode = "WEAK").giant()
ba_fb.vcount()
fb_GC_u.vcount()
# clustering coefficient
ba_fb.transitivity_undirected()
fb_GC_u.transitivity_undirected()
# Shortest path lenght
# on a subset of the nodes, as otherwise it will take forever to compute
ba_vs_src = sample(ba_fb.vs, 1000)
ba_vs_trg = sample(ba_fb.vs, 1000)
mean(np.array(ba_fb.shortest_paths(ba_vs_src, ba_vs_trg)).flatten())
fb_sp
# Now we compare the degree distributions for the complete fb Giant Component
# we use GridSpecs for a finer control of the plot positioning
fig_sizes = (fig_sizes[0], 2*default_sizes[1])
f = plt.figure(figsize = fig_sizes)
# create a 2x2 Grid Specification
gs = gridspec.GridSpec(2, 2)
# add subplots to the figure, using the GridSpec gs
# position [0,0] (upper-left corner)
ax1 = plt.subplot(gs[0,0])
ax2 = plt.subplot(gs[0,1])
# the third plot spans the entire second row
ax3 = plt.subplot(gs[1,:])
# compute and plot the histogram of FB degrees
d_fb = fb_GC_u.degree()
_,_,_ = ax1.hist(d_fb, bins=range(1,max(d_fb)+2), normed = True, color = 'red')
_ = ax1.set_xlim(0,80)
_ = ax1.set_xlabel("$d$")
_ = ax1.set_ylabel("Frequencies")
_ = ax1.set_title("Histogram of FB degrees")
# compute and plot the histogram of BA degrees
d_ba = ba_fb.degree()
_,_,_ = ax2.hist(d_ba, bins=range(1,max(d_ws)+2), normed = True, color = 'blue')
_ = ax2.set_xlim(0,80)
_ = ax2.set_xlabel("$d$")
_ = ax2.set_ylabel("Frequencies")
_ = ax2.set_title("Histogram of BA degrees")
# compute and plot the degree CCDFs
fb_ecdf = ECDF(d_fb)
ba_ecdf = ECDF(d_ba)
x = np.arange(1,max(d_fb)+1)
_ = ax3.loglog(x, 1-fb_ecdf(x), 'ro', label = 'Facebook')
x = np.arange(1,max(d_ba)+1)
_ = ax3.loglog(x, 1-ba_ecdf(x), 'bo', label = 'Barabasi-Albert')
_ = ax3.set_xlabel("$d$")
_ = ax3.set_ylabel("$P(D>d)$")
_ = ax3.set_title("Comparison between degree CCDFs")
_ = ax3.legend(numpoints = 1)
# visual comparison, with small graph
exp_pl = 2.5
N = sub_g1.vcount()
M = sub_g1.ecount()
pl_sub = Graph.Static_Power_Law(N, int(round(M)), exp_pl)
#plot
visual_style["layout"] = pl_sub.layout_circle()
plot(pl_sub, **visual_style)
visual_style["layout"] = sub_g1.layout_circle()
plot(sub_g1, **visual_style)
# Analyse the entire graph
# First, we find the best power law fit for the degree distribution
# with a fixed minimum value xmin (minimum degree for which the fitting is computed)
# - see the plot of the CCDFs for understanding how fitting depends on xmin
xmin = 10
fit_pl = Fit(fb_GC_u.degree(), xmin = xmin)
# by computing automatically the "best" xmin value
fit_pl_auto = Fit(fb_GC_u.degree())
exp_pl_auto = fit_pl_auto.alpha
xmin_auto = fit_pl_auto.xmin
exp_pl = fit_pl.alpha
print "PL exponents: (xmin=%d) %.2f; (auto xmin=%.2f) %.2f" % (xmin, exp_pl, xmin_auto, exp_pl_auto)
# compute the number of nodes and edges of the FB graph to generate the equivalent static Power Law graph
N = fb_GC_u.vcount()
M = fb_GC_u.ecount()
# Equivalent graph for the fitting with fixed xmin
pl_fb_all = Graph.Static_Power_Law(N, M, exp_pl)
# the graph could not be connected, so keep the GC only
pl_fb = pl_fb_all.clusters(mode = "WEAK").giant()
# Equivalent graph for the fitting with automatic xmin
pl_fb_auto_all = Graph.Static_Power_Law(N, M, exp_pl_auto)
pl_fb_auto = pl_fb_auto_all.clusters(mode = "WEAK").giant()
# clustering coefficients
cc_pl = pl_fb.transitivity_undirected()
cc_pl_auto = pl_fb_auto.transitivity_undirected()
cc_fb = fb_GC_u.transitivity_undirected()
print "Clustering: (FB) %.5f; (PL xmin=%d) %.5f; (PL auto xmin) %.5f" %(cc_fb, xmin, cc_pl, cc_pl_auto)
# Shortest path lenght
# on a subset of the nodes, as otherwise it will take forever to compute
pl_vs_src = sample(pl_fb.vs, 500)
pl_vs_trg = sample(pl_fb.vs, 500)
pl_auto_vs_src = sample(pl_fb_auto.vs, 500)
pl_auto_vs_trg = sample(pl_fb_auto.vs, 500)
sp_pl = mean(np.array(pl_fb.shortest_paths(pl_vs_src, pl_vs_trg)).flatten())
sp_pl_auto = mean(np.array(pl_fb_auto.shortest_paths(pl_auto_vs_src, pl_auto_vs_trg)).flatten())
print "Shortest paths: (FB) %.2f; (PL xmin=%d) %.2f; (PL auto xmin) %.2f" % (fb_sp, xmin, sp_pl, sp_pl_auto)
# Now we compare the degree distributions for the complete fb Giant Component
# we use GridSpecs for a finer control of the plot positioning
fig_sizes = (fig_sizes[0], 2*default_sizes[1])
f = plt.figure(figsize = fig_sizes)
# create a 2x3 Grid Specification
gs = gridspec.GridSpec(2, 3)
# add subplots to the figure, using the GridSpec gs
# position [0,0] (upper-left corner)
ax1 = plt.subplot(gs[0,0])
ax2 = plt.subplot(gs[0,1])
ax3 = plt.subplot(gs[0,2])
# the fourth plot spans the entire second row
ax4 = plt.subplot(gs[1,:])
# compute and plot the histogram of FB degrees
d_fb = fb_GC_u.degree()
_,_,_ = ax1.hist(d_fb, bins=range(1,max(d_fb)+2), normed = True, color = 'red')
_ = ax1.set_xlim(0,80)
_ = ax1.set_xlabel("$d$")
_ = ax1.set_ylabel("Frequencies")
_ = ax1.set_title("Histogram of FB degrees")
# compute and plot the histogram of Static Power Law degrees with set xmin
d_pl = pl_fb.degree()
_,_,_ = ax2.hist(d_pl, bins=range(1,max(d_pl)+2), normed = True, color = 'blue', label = "$\gamma$ = %.2f" % exp_pl)
_ = ax2.set_xlim(0,80)
_ = ax2.set_xlabel("$d$")
_ = ax2.set_ylabel("Frequencies")
_ = ax2.set_title("Histogram of PL degrees (xmin=%d)" % xmin)
_ = ax2.legend()
# compute and plot the histogram of Static Power law degrees with auto xmin
d_pl_auto = pl_fb_auto.degree()
_,_,_ = ax3.hist(d_pl_auto, bins=range(1,max(d_pl_auto)+2), normed = True, color = 'green', label = "$\gamma$ = %.2f" % exp_pl_auto)
_ = ax3.set_xlim(0,80)
_ = ax3.set_xlabel("$d$")
_ = ax3.set_ylabel("Frequencies")
_ = ax3.set_title("Histogram of PL degrees (auto xmin)")
_ = ax3.legend()
# compute and plot the degree CCDFs
fb_ecdf = ECDF(d_fb)
pl_ecdf = ECDF(d_pl)
pl_auto_ecdf = ECDF(d_pl_auto)
x = np.arange(1,max(d_fb)+1)
_ = ax4.loglog(x, 1-fb_ecdf(x), 'ro', label = 'Facebook')
x = np.arange(1,max(d_pl)+1)
_ = ax4.loglog(x, 1-pl_ecdf(x), 'bo', label = 'Static PL xmin=%d' % xmin)
x = np.arange(1,max(d_pl_auto)+1)
_ = ax4.loglog(x, 1-pl_auto_ecdf(x), 'go', label = 'Static PL auto xmin')
_ = ax4.set_xlabel("$d$")
_ = ax4.set_ylabel("$P(D>d)$")
_ = ax4.set_title("Comparison between degree CCDFs")
_ = ax4.legend(numpoints = 1)
# for reference, plot the power law functions corresponding to the fitting with fixed and automatic xmin
x1 = np.arange(xmin_auto, max(d_fb)+1)
_ = ax4.loglog(x1, 1000000000000 * x1**(-exp_pl_auto), 'g-', linewidth = 3)
x1 = np.arange(xmin, max(d_fb)+1)
_ = ax4.loglog(x1, 1000 * x1**(-exp_pl), 'b-', linewidth = 2)
deg_seq = fb_GC_u.degree()
# in general the parameter deg_seq needs to be a list with a degree sequence, i.e.
# - one element per node
# - the element is the degree of the node
fb_DS = Graph.Degree_Sequence(deg_seq, method="vl")
# in this case we have just re-created the fb_GC_u graph
fb_DS.vcount()
fb_DS.ecount()
fb_GC_u.vcount()
fb_GC_u.ecount()
mean(fb_DS.degree())
mean(fb_GC_u.degree())