Source code for nxviz.io

import pandas as pd
from networkx import Graph, MultiGraph


def graph_from_dataframe(
    dataframe,
    threshold_by_percent_unique=0.1,
    threshold_by_count_unique=None,
    node_id_columns=None,
    node_property_columns=None,
    edge_property_columns=None,
    node_type_key="type",
    edge_type_key="type",
    collapse_edges=True,
    edge_agg_key="weight",
):
    """
    Build an undirected graph from a pandas dataframe.

    This function attempts to infer which cells should become nodes
    based on either:

        a. what percentage of the column are unique values (defaults to 10%)
        b. an explicit count of unique values (i.e. any column with 7 unique
           values or less)
        c. an explicit list of column keys (i.e.
           ['employee_id', 'location_code'])

    Column headers are preserved as node and edge 'types'. By default, this is
    stored using the key 'type' which is used by some graph import processes
    but can be reconfigured.

    This function uses a MultiGraph structure during the build phase so that it
    is possible to make multiple connections between nodes. By default, at the
    end of the build phase, the MultiGraph is converted to a Graph and the
    count of edges between each node-pair is written as a 'weight' property.

    :param pandas.DataFrame dataframe: A pandas dataframe containing the data
        to be converted into a graph.
    :param float threshold_by_percent_unique: A percent value used to determine
        whether a column should be used to generate nodes based on its
        cardinality (i.e. in a dataframe with 100 rows, treat any column with
        10 or less unique values as a node).
    :param int threshold_by_count_unique: A numeric value used to determine
        whether a column should be used to generate nodes based on its
        cardinality (i.e. if 7 is supplied, treat any column with 7 or less
        unique values as a node) - supplying a value will take priority over
        threshold_by_percent_unique.
    :param list node_id_columns: A list of column headers to use for generating
        nodes. Supplying any value will take precedence over
        threshold_by_percent_unique or threshold_by_count_unique. Note: this
        can cause the size of the graph to expand significantly since every
        unique value in a column will become a node. ``None`` (the default)
        means no explicit columns.
    :param list node_property_columns: A list of column headers to use for
        generating properties of nodes. These can include the same column
        headers used for the node id. ``None`` (the default) means no
        properties.
    :param list edge_property_columns: A list of column headers to use for
        generating properties of edges. ``None`` (the default) means no
        properties.
    :param str node_type_key: A string that sets the key that will be used to
        preserve the column name as a node property (this is useful for
        importing networkx graphs to databases that distinguish between node
        'types' or for visually encoding those types in plots).
    :param str edge_type_key: A string that sets the key that will be used to
        keep track of edge relationships and 'types' (this is useful for
        importing networkx graphs to databases that distinguish between edge
        'types' or for visually encoding those types in plots). Edge type
        values are automatically set to <column_a>_<column_b>, where the two
        column names are sorted.
    :param bool collapse_edges: Graphs are instantiated as a 'MultiGraph'
        (allow multiple edges between nodes) and then collapsed into a 'Graph'
        which only has a single edge between any two nodes. Information is
        preserved by aggregating the count of those edges as a 'weight' value.
        Set this value to False to return the MultiGraph. Note: this can cause
        the size of the graph to expand significantly since each row produces
        one edge per pair of node columns, i.e. n * (n - 1) / 2 edges where n
        is the number of node columns.
    :param str edge_agg_key: A string that sets the key the edge count will be
        assigned to when edges are aggregated. If the edges already carry a
        value under this key, those values are summed instead of counting
        each edge as 1.0.
    :returns: A networkx Graph (or MultiGraph if collapse_edges is set to
        False).
    :raises AssertionError: If ``dataframe`` is not a pandas DataFrame.
    """
    assert isinstance(
        dataframe, pd.DataFrame
    ), "{} is not a pandas DataFrame".format(dataframe)

    # None replaces the former mutable ``[]`` defaults
    if node_id_columns is None:
        node_id_columns = []
    if node_property_columns is None:
        node_property_columns = []
    if edge_property_columns is None:
        edge_property_columns = []

    M = MultiGraph()

    # if explicit specification of node_id_columns is provided, use those
    if len(node_id_columns) > 0:
        node_columns = node_id_columns
    elif threshold_by_count_unique:
        # otherwise, compute with thresholds based on the dataframe
        node_columns = sorted(
            col
            for col in dataframe.columns
            if dataframe[col].nunique() <= threshold_by_count_unique
        )
    else:
        row_count = dataframe.shape[0]
        # a dataframe without rows has no values to turn into nodes; the
        # row_count guard also avoids dividing by zero
        node_columns = sorted(
            col
            for col in dataframe.columns
            if row_count
            and dataframe[col].nunique() / row_count
            <= threshold_by_percent_unique
        )

    # use the unique values for each node column as node types
    for node_type in node_columns:
        M.add_nodes_from(
            [
                (node, {node_type_key: node_type})
                for node in dataframe[node_type].unique()
            ]
        )

    # iterate over the rows and generate an edge for each pair of node columns
    for _, row in dataframe.iterrows():
        # assemble the edge properties as a dictionary
        edge_properties = {k: row[k] for k in edge_property_columns}

        # iterate over the node_ids in each node_column of the dataframe row
        node_buffer = []
        for node_type in node_columns:
            node_id = row[node_type]

            # get a reference to the node and assign any specified properties
            node = M.nodes[node_id]
            for k in node_property_columns:
                if k not in node:
                    node[k] = row[k]
                elif str(row[k]) not in str(node[k]):
                    # values differ: append with a pipe delimiter
                    # NOTE: this is a substring test, so "1" counts as
                    # already present in "10"
                    node[k] = str(node[k]) + "|{}".format(str(row[k]))

            # build edges using precomputed edge properties
            for other_node_id, other_node_type in node_buffer:
                # sort node_type so undirected edges all share the same type
                ordered_name = "_".join(sorted([node_type, other_node_type]))
                edge_properties[edge_type_key] = ordered_name
                # add_edge stores the keyword values on the new edge, so
                # reusing edge_properties across edges is safe
                M.add_edge(node_id, other_node_id, **edge_properties)

            # store the node from this column in the buffer for future edges
            node_buffer.append((node_id, node_type))

    if not collapse_edges:
        return M

    # convert the MultiGraph to a Graph
    G = Graph(M)

    # Sum the weights (1.0 for edges without one) per node pair in a separate
    # mapping. Graph(M) already copies the attributes of parallel edges onto
    # the collapsed edge, so accumulating directly on top of G's edge data
    # would count a pre-existing weight twice.
    totals = {}
    for u, v, data in M.edges(data=True):
        w = data[edge_agg_key] if edge_agg_key in data else 1.0
        pair = frozenset((u, v))
        totals[pair] = (w + totals[pair]) if pair in totals else w

    for u, v in G.edges():
        G[u][v][edge_agg_key] = totals[frozenset((u, v))]

    return G