Source code for AlloViz.AlloViz.Filtering

"""Module with classes to filter and store networks

Functions of the module (and/or combinations of them) are used for filtering the passed
raw networks with the main class :class:`~AlloViz.AlloViz.Filtering.Filtering`.

"""

import os
import time

import pandas
import numpy as np
import networkx
from multiprocess import Pool

from . import Analysis
from . import utils
from .utils import rgetattr, rhasattr


[docs] def All(pkg, data, **kwargs): """No filtering """ return data
[docs] def GetContacts_edges(pkg, data, GetContacts_threshold, **kwargs): """Retain only edges found by GetContacts It retains only the raw edges for which `GetContacts` has been able to calculate a contact frequency value, i.e., residue pairs that are in physicochemical contact. `GetContacts` data is retrieved from the passed `pkg`'s :class:`~AlloViz.Protein` object stored in it and thus is needed to have been calculated previously. It may (or not) have been calculated using the `GetContacts_threshold` kwarg, filtered afterwards with :meth:`AlloViz.Wrappers.GetContacts.GetContacts.filter_contacts`, or the `GetContacts_threshold` kwarg can also be passed to the analysis to filter the `GetContacts` data in this point. Other Parameters ---------------- **kwargs `GetContacts_threshold` kwarg can be passed to specify the minimum contact frequency (0-1, default 0) threshold, which will be used to filter out contacts with a frequency (average) lower than it before analysis. """ # Get GetContacts raw data if its calculation is available, else raise an Exception try: gc = pkg.protein.GetContacts.raw except: raise Exception("GetContacts results are needed first") # If GetContacts_threshold kwarg is passed, use it to filter the present data without affecting the saved GetContacts' Protein attribute gc = gc[gc["weight"] >= GetContacts_threshold] # Return the data filtered retaining only the indices that are in GetContacts data return data.filter(gc.index, axis=0)
[docs] def No_Sequence_Neighbors(pkg, data, Sequence_Neighbor_distance, **kwargs): """Filter out residue pairs too close in the sequence It only retains edges between residue pairs that are minimum a certain number of positions away in the protein sequence (default: 5). It can be assumed that residues that are close in the protein sequence will be considerably correlated and/or interacting, because of their intrinsic short spatial distance. Specially, it is known that a turn of an alpha-helix is completed almost every 4 residues, and that the residues i and i+4 (and i-5) in an alpha-helix interact with a backbone hydrogen bond. Thus, to take out these interactions and also the contacts due to closeness of residues i and i+5, the default value is 5 sequence positions. Other Parameters ---------------- **kwargs `Sequence_Neighbor_distance` kwarg can be passed to specify the minimum number of sequence positions/distance between residues of a pair to retain in Intercontact filtering, which defaults to 5. """ # Define a function to retrieve the residue number from the residue nomenclature used (chainID:)RES:resnum resnum = lambda res: int(res.rsplit(":")[-1]) # Return the indices whose residue pair satisfies the Intercontact_distance # The absolute value of the subtraction between the two residue numbers must be greater than the parameter to indicate that they are at a greater distance in the sequence than it indices = [ idx for idx in data.index if abs(resnum(idx[0]) - resnum(idx[1])) > Sequence_Neighbor_distance #Intercontact_distance ] # Return the filtered data return data.filter(indices, axis=0)
[docs] def GPCR_Interhelix(pkg, data, **kwargs): """Retain only edges between different TMs(/ECLs/ICLs) of GPCRs It can only be used for GPCR systems (`GPCR==True`) for which GPCRdb's generic numbering could be retrieved, and it only retains edges between residue pairs that have generic numbering and are in different transmembrane helices (and/or intra-cellular or extracellular loops) according to it. """ if not pkg.protein.GPCR: raise Exception("GPCR_Interhelix filtering is only available for GPCR systems.") # Create a dictionary mapping the residue numbers to their corresponding TM or ICL/ECL (or 0) mapper = dict(zip( pkg.protein.protein.select_atoms("name CA").resnums, np.floor(pkg.protein.protein.select_atoms("name CA").tempfactors) )) # Define a function to retrieve the residue number from the residue nomenclature used (chainID:)RES:resnum resnum = lambda res: int(res.rsplit(":")[-1]) def are_interhelix(idx): # Make a list of the residue pair's TMs TMs = [mapper[resnum(idx[0])], mapper[resnum(idx[1])]] # Then, check that (1) none of the retrieved TMs are 0 (residue with no generic numbering) and that (2) they are different TMs/ICLs/ECLs return all([ all(TMs), len(set(TMs)) > 1 ]) indices = [idx for idx in data.index if are_interhelix(idx)] # Return the filtered data return data.filter(indices, axis=0)
[docs] def Spatially_distant(pkg, data, Interresidue_distance, **kwargs): """Retain only edges between spatially distant residue pairs It only retains edges between residue pairs whose CA atoms are minimum a certain number of angstroms away from each other in the initial PDB/structure (default 10 Å). The relationship found between these residues can be considered purely allosteric, as they are spatially distant and have no direct communication but can be found to be interacting/correlated... Other Parameters ---------------- **kwargs 'Interresidue_distance' kwarg can be passed to specify the minimum number of angstroms that the CA atoms of residue pairs should have between each other in the initial PDB/structure (default 10 Å) to be considered spatially distant. """ from MDAnalysis.analysis import distances # https://userguide.mdanalysis.org/1.1.1/examples/analysis/distances_and_contacts/distances_within_selection.html # Create a triangular matrix with all inter-residue distances CAs = pkg.protein.protein.select_atoms("name CA") n_ca = len(CAs) self_distances = distances.self_distance_array(CAs.positions) sq_dist_arr = np.zeros((n_ca, n_ca)) triu = np.triu_indices_from(sq_dist_arr, k=1) sq_dist_arr[triu] = self_distances # Transform the matrix into a pandas DataFrame resnames = [f"{aa.resname}:{aa.resid}" for aa in pkg.protein.protein.residues] df = pandas.DataFrame(sq_dist_arr, columns=resnames, index=resnames) df = df.where( np.triu(np.ones(df.shape), k=1).astype(bool) ) df = pandas.DataFrame({"dist": df.stack()}) indices = df[df["dist"] >= Interresidue_distance].index # Return the filtered data return data.filter(indices, axis=0)
class NoNetworkException(Exception): pass
[docs] class Filtering: r"""Class for network filtering Instances of this class are used to be added as attributes to instances of Wrappers' classes (as attributes of a :class:`AlloViz.Protein` object) with the purpose of storing (un)filtered networks according to different criteria. Raw network edges stored as a DataFrame in the passed `pkg` object are filtered with the corresponding function or combination of functions of the :mod:`~AlloViz.AlloViz.Filtering` module and converted to individual NetworkX' :external:ref:`Graphs <graph>` with :meth:`~AlloViz.AlloViz.Analysis.Analysis._get_G` to be stored as private attributes. Parameters ---------- pkg : instance of a class from :mod:`AlloViz.Wrappers` The object is used to retrieve the raw network edges for network analysis. It also contains/gives access to the corresponding:class:`~AlloViz.Protein` object and its information. filtering : str or list Filtering scheme(s) with which to filter the list of network edges. A list of strings is used to filter with a combination of criteria. All available (and combinable) filtering options are functions in the :mod:`~AlloViz.AlloViz.Filtering` module: :func:`~AlloViz.AlloViz.Filtering.All`, :func:`~AlloViz.AlloViz.Filtering.GetContacts_edges`, :func:`~AlloViz.AlloViz.Filtering.No_Sequence_Neighbors`, :func:`~AlloViz.AlloViz.Filtering.GPCR_Interhelix`, :func:`~AlloViz.AlloViz.Filtering.Spatially_distant`. name : str Name of the filtering scheme. It will be the same name as the passed filtering option if it is a single string, or the names of the filtering options joined by "_" if a lsit has been passed. It is used to name the pkg's attribute in which the class instance is saved. Other Parameters ---------------- GetContacts_threshold : float Optional kwarg that can be passed to specify the minimum contact frequency (0-1, default 0) threshold, which will be used to filter out contacts with a frequency (average) lower than it before analysis. Sequence_Neighbor_distance : int Optional kwarg that can be passed to specify the minimum number of sequence positions/distance between residues of a pair to retain in No_Sequence_Neighbors filtering, which defaults to 5. Interresidue_distance : int or float Optional kwarg that can be passed to specify the minimum number of angstroms that the CA atoms of residue pairs should have between each other in the initial PDB/structure (default 10 Å) to be considered spatially distant. Attributes ---------- graphs : dict of external:ref:`Graph <graph>` objects See Also -------- AlloViz.Protein.filter : Class method to filter the network(s) raw edge weights with different criteria. AlloViz.Wrappers.Base.Base.filter : Pkg's method to filter the network(s) raw edge weights with different criteria. """ def __init__(self, pkg, filtering, name, *, GetContacts_threshold=0, Sequence_Neighbor_distance=5, Interresidue_distance=10): self._pkg = pkg self._name = name # Establish the paths and filenames self._path = f"{self._pkg.protein._datadir}/{self._pkg._name}/{self._name}" os.makedirs(self._path, exist_ok=True) self._datapq = lambda element, metric: f"{self._path}/{element}_{metric}.pq" # Get the filtered thata according to the filtering scheme(s) filterings = ( filtering if isinstance(filtering, list) else [filtering] ) data = self._pkg.raw for filt in filterings: filtfunc = eval(filt) data = filtfunc(self._pkg, data, GetContacts_threshold=GetContacts_threshold, Sequence_Neighbor_distance=Sequence_Neighbor_distance, Interresidue_distance=Interresidue_distance) # Drop all-0 rows (not taking into account weight_std column if it's present) self._filtdata = data.drop(data[(data.drop(columns=[c for c in data.columns if "std" in c]) == 0).all(axis=1)].index, axis=0) self._graph_distances = -np.log(abs(self._filtdata) + 10E-10) + 10E-10 # un-transform standard error columns self._graph_distances.loc[:,["std" in c for c in self._graph_distances.columns]] = data.loc[:,["std" in c for c in data.columns]] # For each column in the filtered data that is not a standard error, create an analyzable NetworkX's graph and save it self.graphs = {} for col in [c for c in self._graph_distances if "std" not in c]: try: # The approach in lit. is to use -log10(|corr|) as edge weights/distances in the network for analyses # e.g., https://www.pnas.org/doi/full/10.1073/pnas.0810961106 self.graphs[col] = self._get_G(self._graph_distances[col]) except NoNetworkException as e: print(e)
[docs] def _get_G(self, column): r"""Return a DataFrame's column as a Graph Transform a column from a :class:`pandas.DataFrame` (passed as a :class:`pandas.Series`) into a NetworkX' :class:`~networkx.Graph`; retaining only the largest component if the passed information results in an unconnected network. Parameters ---------- column : :class:`pandas.Series` """ # Transform the column into a 3-column dataframe with the nodes' name and the value as weight. Drop 0s, NAs and use absolute value weights = ( column[column != 0].dropna().abs().rename("weight").reset_index() ) # btw calculations fail with 0 value weights and cfb prob with negative # Create the NetworkX's Graph and a list of its components network = networkx.from_pandas_edgelist(weights, "level_0", "level_1", "weight") components = list(networkx.connected_components(network)) if len(components) == 0: raise NoNetworkException( " ".join(( f"EXCEPTION! No connected components in network ({network.number_of_nodes()} nodes):", self._pkg._name, self._name, column.name, )) ) else: # Check that the largest connected component has the same size as the total number of nodes, else select the largest component to return as the Graph to be analyzed largest_component = max(components, key=len) if len(largest_component) < network.number_of_nodes(): print( f"WARNING! Unconnected network ({network.number_of_nodes()} nodes):", self._pkg._name, self._name, column.name, "\n", f"Largest network component will be used for analysis. Sizes (number of nodes) of all components: {[len(comp) for comp in components]}", ) network = network.subgraph(largest_component) return network
[docs] def analyze(self, elements="edges", metrics="all", cores=1, nodes_dict=Analysis.nodes_dict, edges_dict=Analysis.edges_dict, **kwargs): r"""Analyze the filtered network Send the analyses of the filtered network for the passed combinations of elements-metrics. The individual :external:ref:`Graphs <graph>` saved as private attributes upon object initialization are analyzed independently with the :mod:`~AlloViz.AlloViz.Analysis` module (this function calls :func:`AlloViz.AlloViz.Analysis.analyze`). Results are stored as new instances of classes from the :mod:`AlloViz.AlloViz.Elements` module, which extend the :class:`pandas.DataFrame` class. Parameters ---------- elements : str or list, {"edges", "nodes"} Network element for which to perform the analysis. metrics : str or list, default: "all" Network metrics to compute, which must be keys in the `nodes_dict` or `edges_dict` dictionaries. Default is "all" and it sends the computation for all the metrics defined in the corresponding dictionary of the selected elements in `element`. cores : int, default: 1 Number of cores to use for parallelization with a `multiprocess` Pool. Default value only uses 1 core with a custom :class:`AlloViz.utils.dummypool` that performs computations synchronously. Other Parameters ---------------- nodes_dict, edges_dict : dict Optional kwarg(s) of the dictionary(ies) that maps network metrics custom names (e.g., betweenness centrality, "btw") with their corresponding NetworkX function (e.g., "networkx.algorithms.centrality.betweenness_centrality"). Functions strings must be written as if they were absolute imports, and must return a dictionary of edges or nodes, depending on the element dictionary in which they are. The keys of the dictionaries will be used to name the columns of the analyzed data that the functions produce. Defaults are :data:`~AlloViz.AlloViz.Analysis.nodes_dict` and :data:`~AlloViz.AlloViz.Analysis.edges_dict`. **kwargs Other optional keyword arguments that will be passed to the NetworkX analysis function(s) that is(are) used on the method call in case they need extra parameters. """ # Depending on the desired cores, use a dummypool (synchronous calculations) or a `multiprocess` Pool # Changing it inside the `utils` module allows to share the same one between modules if cores > 1: mypool = Pool(cores) else: mypool = utils.dummypool() utils.pool = mypool if self._filtdata.size == 0: print(f"{self._pkg._name} {self._name} is not a connected network (or subnetwork)") else: Analysis.analyze(self, elements, metrics, nodes_dict, edges_dict, **kwargs) # Close the pool utils.pool.close() utils.pool.join() utils.pool = utils.dummypool()