Source code for nxviz.io

import pandas as pd
from networkx import Graph, MultiGraph


[docs]def graph_from_dataframe(
    dataframe,
    threshold_by_percent_unique=0.1,
    threshold_by_count_unique=None,
    node_id_columns=[],
    node_property_columns=[],
    edge_property_columns=[],
    node_type_key="type",
    edge_type_key="type",
    collapse_edges=True,
    edge_agg_key="weight",
):
    """
    Build an undirected graph from a pandas dataframe.

    This function attempts to infer which cells should become nodes
    based on either:

        a. what percentage of the column are unique values (defaults to 10%)
        b. an explicit count of unique values (i.e. any column with 7 unique
           values or less)
        c. an explicit list of column keys (i.e.
           ['employee_id', 'location_code'])

    Column headers are preserved as node and edge 'types'. By default, this is
    stored using the key 'type' which is used by some graph import processes
    but can be reconfigured.

    This function uses a MultiGraph structure during the build phase so that it
    is possible to make multiple connections between nodes. By default, at the
    end of the build phase, the MultiGraph is converted to a Graph and the
    count of edges between each node-pair is written as a 'weight' property.

    :param pandas.DataFrame dataframe: A pandas dataframe containing the data
        to be converted into a graph.
    :param float threshold_by_percent_unique: A percent value used to determine
        whether a column should be used to generate nodes based on its
        cardinality (i.e. in a dataframe with 100 rows, treat any column with
        10 or less unique values as a node)
    :param int threshold_by_count_unique: A numeric value used to determine
        whether a column should be used to generate nodes based on its
        cardinality (i.e. if 7 is supplied, treat any column with 7 or less
        unique values as a node) - supplying a value will take priority over
        percent_unique
    :param list node_id_columns: A list of column headers to use for generating
        nodes. Suppyling any value will take precedence over
        threshold_by_percent_unique or threshold_by_count_unique. Note: this
        can cause the size of the graph to expand significantly since every
        unique value in a column will become a node.
    :param list node_property_columns: A list of column headers to use for
        generating properties of nodes. These can include the same column
        headers used for the node id.
    :param list edge_property_columns: A list of column headers to use for
        generating properties of edges.
    :param str node_type_key: A string that sets the key will be used to
        preserve the column name as node property (this is useful for importing
        networkx graphs to databases that distinguish between node 'types' or
        for visually encoding those types in plots).
    :param str edge_type_key: A string that sets the key will be used to keep
        track of edge relationships an 'types' (this is useful for importing
        networkx graphs to databases that distinguish between edge'types' or
        for visually encoding those types in plots). Edge type values are
        automatically set to <node_a_id>_<node_b_id>.
    :param bool collapse_edges: Graphs are instantiated as a 'MultiGraph'
        (allow multiple edges between nodes) and then collapsed into a 'Graph'
        which only has a single edge between any two nodes. Information is
        preserved by aggregating the count of those edges as a 'weight' value.
        Set this value to False to return the MultiGraph. Note: this can cause
        the size of the graph to expand significantly since each row can
        potentially have n! edges where n is the number of columns in the
        dataframe.
    :param str edge_agg_key: A string that sets the key the edge count will be
        assigned to when edges are aggregated.
    :returns: A networkx Graph (or MultiGraph if collapse_edges is set to
        False).
    """

    assert isinstance(
        dataframe, pd.DataFrame
    ), "{} is not a pandas DataFrame".format(dataframe)

    M = MultiGraph()

    # if explicit specification of node_id_columns is provided, use those
    if len(node_id_columns) > 0:
        node_columns = node_id_columns
    else:
        # otherwise, compute with thresholds based on the dataframe
        if threshold_by_count_unique:
            node_columns = sorted(
                [
                    col
                    for col in dataframe.columns
                    if dataframe[col].nunique() <= threshold_by_count_unique
                ]
            )
        else:
            node_columns = sorted(
                [
                    col
                    for col in dataframe.columns
                    if dataframe[col].nunique() / dataframe.shape[0]
                    <= threshold_by_percent_unique  # NOQA to preserve meaningful variable names
                ]
            )

    # use the unique values for each node column as node types
    for node_type in node_columns:
        M.add_nodes_from(
            [
                (node, {node_type_key: node_type})
                for node in dataframe[node_type].unique()
            ]
        )

    # iterate over the rows and generate an edge for each pair of node columns
    for i, row in dataframe.iterrows():
        # assemble the edge properties as a dictionary
        edge_properties = {k: row[k] for k in edge_property_columns}

        # iterate over the node_ids in each node_column of the dataframe row
        node_buffer = []
        for node_type in node_columns:
            node_id = row[node_type]

            # get a reference to the node and assign any specified properties
            node = M.nodes[node_id]
            for k in node_property_columns:
                # if values are not identical, append with a pipe delimiter
                if k not in node:
                    node[k] = row[k]
                elif isinstance(node[k], str) and str(row[k]) not in node[k]:
                    node[k] += "|{}".format(str(row[k]))
                elif str(row[k]) not in str(node[k]):
                    node[k] = str(node[k]) + "|{}".format(str(row[k]))

            # build edges using precomputed edge properties
            for other_node_id, other_node_type in node_buffer:
                # sort node_type so undirected edges all share the same type
                ordered_name = "_".join(sorted([node_type, other_node_type]))
                edge_properties[edge_type_key] = ordered_name
                M.add_edge(node_id, other_node_id, **edge_properties)

            # store the node from this column in the buffer for future edges
            node_buffer.append((node_id, node_type))

    if collapse_edges:
        # convert the MultiGraph to a Graph
        G = Graph(M)
        k = edge_agg_key
        # preserve the edge count as a sum of the weight values
        for u, v, data in M.edges(data=True):
            w = data[k] if k in data else 1.0
            edge = G[u][v]
            edge[k] = (w + edge[k]) if k in edge else w
        return G

    return M
Source code for nxviz.io

nxviz

Navigation

Related Topics