from collections import Counter
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from palettable.colorbrewer import diverging, qualitative, sequential
def is_data_homogenous(data_container):
    """
    Report whether every item in the container has the same Python type.

    Types are compared exactly (via ``type``), so subclasses count as
    different types. An empty container yields ``False`` because it holds no
    type at all.

    :param data_container: A generic container of data points.
    :type data_container: `iterable`
    :returns: ``True`` if exactly one distinct type is present.
    :rtype: `bool`
    """
    distinct_types = {type(item) for item in data_container}
    return len(distinct_types) == 1
def infer_data_type(data_container):
    """
    Infer whether a container holds categorical, ordinal or continuous data.

    The rules are applied in this order:

    - exactly two distinct values (whatever their type): categorical
    - str: categorical
    - int: ordinal
    - float: continuous

    This is a deliberately simple mapping. Smarter heuristics (for example,
    treating a very long list of numbers with only a handful of unique values
    as categorical) are not implemented.

    :param data_container: The data points to classify.
    :type data_container: `list` or `tuple`
    :returns: One of ``"categorical"``, ``"ordinal"`` or ``"continuous"``.
    :rtype: `str`
    :raises AssertionError: if the container is not a list or tuple, holds
        fewer than two distinct values, or mixes Python types.
    :raises ValueError: if the element type is none of str, int or float
        (and the data are not binary).
    """
    # Defensive checks. They raise AssertionError explicitly instead of using
    # ``assert`` so callers still see the same exception type, but the checks
    # are not silently dropped when Python runs with -O.

    # 0. Only lists and tuples are supported.
    if not isinstance(data_container, (list, tuple)):
        raise AssertionError("data_container should be a list or tuple.")

    # 1. A single repeated value (or no value at all) cannot be classified.
    unique_values = set(data_container)
    if len(unique_values) <= 1:
        raise AssertionError(
            "There should be more than one value in the data container."
        )

    # 2. Mixed data types are not supported.
    if not is_data_homogenous(data_container):
        raise AssertionError("Data are not of a homogenous type!")

    # Binary data are treated as categorical regardless of element type.
    # TODO: make tests for this.
    if len(unique_values) == 2:
        return "categorical"

    # The container is homogenous, so the first element is representative.
    datum = data_container[0]
    if isinstance(datum, str):
        return "categorical"
    if isinstance(datum, int):
        return "ordinal"
    if isinstance(datum, float):
        return "continuous"
    raise ValueError("Not possible to tell what the data type is.")
def is_data_diverging(data_container):
    """
    Check whether the data contain both negative and positive values.

    This is a simple sign check and could be made much more sophisticated.
    Zeros count as neither negative nor positive.

    Note that ``infer_data_type`` classifies data with exactly two distinct
    values as categorical, so such containers are rejected here even when
    they are numeric.

    :param data_container: A generic container of data points.
    :type data_container: `list` or `tuple`
    :returns: ``True`` if at least one value is below zero and at least one
        is above zero, ``False`` otherwise.
    :rtype: `bool`
    :raises AssertionError: if the data are not inferred to be ordinal or
        continuous (or fail the checks inside ``infer_data_type``).
    """
    # Raised explicitly rather than via ``assert`` so the guard keeps the
    # same exception type but still runs under ``python -O``.
    if infer_data_type(data_container) not in ("ordinal", "continuous"):
        raise AssertionError("Data type should be ordinal or continuous")

    # ``any`` stops scanning at the first hit; the second scan is skipped
    # entirely when no negative value exists.
    has_negative = any(value < 0 for value in data_container)
    return has_negative and any(value > 0 for value in data_container)
def is_groupable(data_container):
    """
    Tell whether the data container can be used to form groups.

    A container is "groupable" when its inferred data type is either
    'categorical' or 'ordinal'.

    :param data_container: A generic container of data points.
    :type data_container: `iterable`
    :returns: ``True`` for categorical or ordinal data, ``False`` otherwise.
    :rtype: `bool`
    """
    inferred_type = infer_data_type(data_container)
    return inferred_type in ("categorical", "ordinal")
def num_discrete_groups(data_container):
    """
    Count the distinct values (discrete groups) in a data container.

    :param data_container: A generic container of hashable data points.
    :type data_container: `iterable`
    :returns: The number of unique values.
    :rtype: `int`
    """
    distinct_values = set(data_container)
    return len(distinct_values)
def items_in_groups(data_container):
    """
    Tally how many items fall into each discrete group of a data container.

    :param data_container: A generic container of hashable data points.
    :type data_container: `iterable`
    :returns: A mapping from each distinct value to its number of
        occurrences.
    :rtype: `collections.Counter`
    """
    group_sizes = Counter()
    group_sizes.update(data_container)
    return group_sizes
def n_group_colorpallet(n):
    """
    Create a colormap with ``n`` colors for categorical groups.

    Intended for cases with more than 8 categorical groups of nodes or
    edges. The colors come from seaborn's "hls" palette and are wrapped in a
    matplotlib ``ListedColormap``.

    :param n: Number of colors to generate.
    :returns: The matching colormap.
    """
    hls_colors = sns.color_palette("hls", n)
    return ListedColormap(hls_colors)
# Lookup table of colormaps, keyed by name.
#
# "Accent_<k>" entries are the qualitative palettes for k groups. The key
# "Accent_2" deliberately points at the 3-color palette (presumably because
# the Accent family offers no 2-color variant -- confirm against palettable).
cmaps = {
    "Accent_{}".format(size): getattr(
        qualitative, "Accent_{}".format(max(size, 3))
    )
    for size in range(2, 9)
}
cmaps.update(
    {
        # Sequential palette for continuous data.
        "continuous": sequential.YlGnBu_9,
        # Diverging palette for data spanning negative and positive values.
        "diverging": diverging.RdBu_11,
        # Cubehelix colormap used for weights.
        "weights": sns.cubehelix_palette(
            50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True
        ),
    }
)
def to_pandas_nodes(G):  # noqa: N803
    """
    Convert nodes in the graph into a pandas DataFrame.

    Each node becomes one row. The ``node`` column holds the node itself and
    every node attribute becomes its own column; an attribute that is itself
    called ``node`` takes precedence over the node identifier.

    :param G: A graph whose ``nodes(data=True)`` yields
        ``(node, attribute_dict)`` pairs.
    :returns: One row per node.
    :rtype: `pandas.DataFrame`
    """
    # Attributes are unpacked last so that they override the "node" key,
    # mirroring an assign-then-update sequence.
    records = [
        {"node": node, **attrs} for node, attrs in G.nodes(data=True)
    ]
    return pd.DataFrame(records)
def to_pandas_edges(G, x_kw, y_kw, **kwargs):  # noqa: N803
    """
    Convert Graph edges to pandas DataFrame that's readable to Altair.

    Every edge contributes two consecutive rows, one per endpoint. Both rows
    share the same ``edge`` index, ``source``, ``target``, ``pair`` and edge
    attributes; the first row carries the x/y position of the source node
    and the second row the x/y position of the target node.

    :param G: A graph exposing ``edges(data=True)`` and ``nodes[n]``
        (the networkx 2+ API).
    :param x_kw: Node attribute key that holds the x coordinate.
    :param y_kw: Node attribute key that holds the y coordinate.
    :param kwargs: Accepted for backward compatibility; currently unused.
    :returns: A frame with two rows per edge and the columns ``source``,
        ``target``, ``x``, ``y``, ``edge``, ``pair`` followed by every edge
        attribute in first-seen order. Attributes missing on an edge are NaN.
    :rtype: `pandas.DataFrame`
    :raises KeyError: if an endpoint lacks the ``x_kw`` or ``y_kw`` attribute.
    :raises TypeError: if an edge attribute reuses one of the reserved
        column names above.
    """
    edge_list = list(G.edges(data=True))

    # Collect the column names. dict.fromkeys removes duplicates while
    # keeping a deterministic, first-seen order (a set would not).
    columns = ["source", "target", "x", "y", "edge", "pair"]
    for _, _, edge_attrs in edge_list:
        columns.extend(edge_attrs)
    columns = list(dict.fromkeys(columns))

    # Build one record per edge endpoint.
    records = []
    for i, (n1, n2, edge_attrs) in enumerate(edge_list):
        for endpoint in (n1, n2):
            # ``G.nodes`` replaces ``G.node``, which networkx 2.4 removed.
            node_attrs = G.nodes[endpoint]
            records.append(
                dict(
                    edge=i,
                    source=n1,
                    target=n2,
                    pair=(n1, n2),
                    x=node_attrs[x_kw],
                    y=node_attrs[y_kw],
                    **edge_attrs
                )
            )

    # dtype=object keeps every column generic, matching a frame that is
    # pre-allocated empty and then filled row by row.
    return pd.DataFrame(
        records, index=range(len(records)), columns=columns, dtype=object
    )