"""Module with classes to filter and store networks
Functions of the module (and/or combinations of them) are used for filtering the passed
raw networks with the main class :class:`~AlloViz.AlloViz.Filtering.Filtering`.
"""
import os
import time
import pandas
import numpy as np
import networkx
from multiprocess import Pool
from . import Analysis
from . import utils
from .utils import rgetattr, rhasattr
[docs]
def All(pkg, data, **kwargs):
"""No filtering
"""
return data
[docs]
def No_Sequence_Neighbors(pkg, data, Sequence_Neighbor_distance, **kwargs):
"""Filter out residue pairs too close in the sequence
It only retains edges between residue pairs that are minimum a certain number of
positions away in the protein sequence (default: 5). It can be assumed that residues
that are close in the protein sequence will be considerably correlated and/or
interacting, because of their intrinsic short spatial distance. Specially, it is
known that a turn of an alpha-helix is completed almost every 4 residues, and that
the residues i and i+4 (and i-5) in an alpha-helix interact with a backbone hydrogen
bond. Thus, to take out these interactions and also the contacts due to closeness of
residues i and i+5, the default value is 5 sequence positions.
Other Parameters
----------------
**kwargs
`Sequence_Neighbor_distance` kwarg can be passed to specify the minimum number of
sequence positions/distance between residues of a pair to retain in Intercontact
filtering, which defaults to 5.
"""
# Define a function to retrieve the residue number from the residue nomenclature used (chainID:)RES:resnum
resnum = lambda res: int(res.rsplit(":")[-1])
# Return the indices whose residue pair satisfies the Intercontact_distance
# The absolute value of the subtraction between the two residue numbers must be greater than the parameter to indicate that they are at a greater distance in the sequence than it
indices = [
idx
for idx in data.index
if abs(resnum(idx[0]) - resnum(idx[1])) > Sequence_Neighbor_distance #Intercontact_distance
]
# Return the filtered data
return data.filter(indices, axis=0)
[docs]
def GPCR_Interhelix(pkg, data, **kwargs):
"""Retain only edges between different TMs(/ECLs/ICLs) of GPCRs
It can only be used for GPCR systems (`GPCR==True`) for which GPCRdb's generic
numbering could be retrieved, and it only retains edges between residue pairs that
have generic numbering and are in different transmembrane helices (and/or
intra-cellular or extracellular loops) according to it.
"""
if not pkg.protein.GPCR:
raise Exception("GPCR_Interhelix filtering is only available for GPCR systems.")
# Create a dictionary mapping the residue numbers to their corresponding TM or ICL/ECL (or 0)
mapper = dict(zip(
pkg.protein.protein.select_atoms("name CA").resnums,
np.floor(pkg.protein.protein.select_atoms("name CA").tempfactors)
))
# Define a function to retrieve the residue number from the residue nomenclature used (chainID:)RES:resnum
resnum = lambda res: int(res.rsplit(":")[-1])
def are_interhelix(idx):
# Make a list of the residue pair's TMs
TMs = [mapper[resnum(idx[0])], mapper[resnum(idx[1])]]
# Then, check that (1) none of the retrieved TMs are 0 (residue with no generic numbering) and that (2) they are different TMs/ICLs/ECLs
return all([
all(TMs),
len(set(TMs)) > 1
])
indices = [idx for idx in data.index if are_interhelix(idx)]
# Return the filtered data
return data.filter(indices, axis=0)
[docs]
def Spatially_distant(pkg, data, Interresidue_distance, **kwargs):
"""Retain only edges between spatially distant residue pairs
It only retains edges between residue pairs whose CA atoms are minimum a certain
number of angstroms away from each other in the initial PDB/structure (default 10 Å).
The relationship found between these residues can be considered purely allosteric, as
they are spatially distant and have no direct communication but can be found to be
interacting/correlated...
Other Parameters
----------------
**kwargs
'Interresidue_distance' kwarg can be passed to specify the minimum number of
angstroms that the CA atoms of residue pairs should have between each other in
the initial PDB/structure (default 10 Å) to be considered spatially distant.
"""
from MDAnalysis.analysis import distances
# https://userguide.mdanalysis.org/1.1.1/examples/analysis/distances_and_contacts/distances_within_selection.html
# Create a triangular matrix with all inter-residue distances
CAs = pkg.protein.protein.select_atoms("name CA")
n_ca = len(CAs)
self_distances = distances.self_distance_array(CAs.positions)
sq_dist_arr = np.zeros((n_ca, n_ca))
triu = np.triu_indices_from(sq_dist_arr, k=1)
sq_dist_arr[triu] = self_distances
# Transform the matrix into a pandas DataFrame
resnames = [f"{aa.resname}:{aa.resid}" for aa in pkg.protein.protein.residues]
df = pandas.DataFrame(sq_dist_arr, columns=resnames, index=resnames)
df = df.where( np.triu(np.ones(df.shape), k=1).astype(bool) )
df = pandas.DataFrame({"dist": df.stack()})
indices = df[df["dist"] >= Interresidue_distance].index
# Return the filtered data
return data.filter(indices, axis=0)
class NoNetworkException(Exception): pass
[docs]
class Filtering:
r"""Class for network filtering
Instances of this class are used to be added as attributes to instances of
Wrappers' classes (as attributes of a :class:`AlloViz.Protein` object) with the
purpose of storing (un)filtered networks according to different criteria.
Raw network edges stored as a DataFrame in the passed `pkg` object are filtered with
the corresponding function or combination of functions of the
:mod:`~AlloViz.AlloViz.Filtering` module and converted to individual NetworkX'
:external:ref:`Graphs <graph>` with :meth:`~AlloViz.AlloViz.Analysis.Analysis._get_G`
to be stored as private attributes.
Parameters
----------
pkg : instance of a class from :mod:`AlloViz.Wrappers`
The object is used to retrieve the raw network edges for network analysis. It
also contains/gives access to the corresponding:class:`~AlloViz.Protein` object
and its information.
filtering : str or list
Filtering scheme(s) with which to filter the list of network edges. A list of
strings is used to filter with a combination of criteria. All available (and
combinable) filtering options are functions in the
:mod:`~AlloViz.AlloViz.Filtering` module:
:func:`~AlloViz.AlloViz.Filtering.All`,
:func:`~AlloViz.AlloViz.Filtering.GetContacts_edges`,
:func:`~AlloViz.AlloViz.Filtering.No_Sequence_Neighbors`,
:func:`~AlloViz.AlloViz.Filtering.GPCR_Interhelix`,
:func:`~AlloViz.AlloViz.Filtering.Spatially_distant`.
name : str
Name of the filtering scheme. It will be the same name as the passed filtering
option if it is a single string, or the names of the filtering options joined by
"_" if a lsit has been passed. It is used to name the pkg's attribute in which
the class instance is saved.
Other Parameters
----------------
GetContacts_threshold : float
Optional kwarg that can be passed to specify the minimum contact
frequency (0-1, default 0) threshold, which will be used to filter out
contacts with a frequency (average) lower than it before analysis.
Sequence_Neighbor_distance : int
Optional kwarg that can be passed to specify the minimum number of sequence
positions/distance between residues of a pair to retain in No_Sequence_Neighbors
filtering, which defaults to 5.
Interresidue_distance : int or float
Optional kwarg that can be passed to specify the minimum number of angstroms
that the CA atoms of residue pairs should have between each other in the initial
PDB/structure (default 10 Å) to be considered spatially distant.
Attributes
----------
graphs : dict of external:ref:`Graph <graph>` objects
See Also
--------
AlloViz.Protein.filter : Class method to filter the network(s) raw edge weights with
different criteria.
AlloViz.Wrappers.Base.Base.filter : Pkg's method to filter the network(s) raw edge
weights with different criteria.
"""
def __init__(self, pkg, filtering, name, *, GetContacts_threshold=0, Sequence_Neighbor_distance=5, Interresidue_distance=10):
self._pkg = pkg
self._name = name
# Establish the paths and filenames
self._path = f"{self._pkg.protein._datadir}/{self._pkg._name}/{self._name}"
os.makedirs(self._path, exist_ok=True)
self._datapq = lambda element, metric: f"{self._path}/{element}_{metric}.pq"
# Get the filtered thata according to the filtering scheme(s)
filterings = (
filtering
if isinstance(filtering, list)
else [filtering]
)
data = self._pkg.raw
for filt in filterings:
filtfunc = eval(filt)
data = filtfunc(self._pkg, data,
GetContacts_threshold=GetContacts_threshold,
Sequence_Neighbor_distance=Sequence_Neighbor_distance,
Interresidue_distance=Interresidue_distance)
# Drop all-0 rows (not taking into account weight_std column if it's present)
self._filtdata = data.drop(data[(data.drop(columns=[c for c in data.columns if "std" in c]) == 0).all(axis=1)].index, axis=0)
self._graph_distances = -np.log(abs(self._filtdata) + 10E-10) + 10E-10
# un-transform standard error columns
self._graph_distances.loc[:,["std" in c for c in self._graph_distances.columns]] = data.loc[:,["std" in c for c in data.columns]]
# For each column in the filtered data that is not a standard error, create an analyzable NetworkX's graph and save it
self.graphs = {}
for col in [c for c in self._graph_distances if "std" not in c]:
try:
# The approach in lit. is to use -log10(|corr|) as edge weights/distances in the network for analyses
# e.g., https://www.pnas.org/doi/full/10.1073/pnas.0810961106
self.graphs[col] = self._get_G(self._graph_distances[col])
except NoNetworkException as e:
print(e)
[docs]
def _get_G(self, column):
r"""Return a DataFrame's column as a Graph
Transform a column from a :class:`pandas.DataFrame` (passed as a
:class:`pandas.Series`) into a NetworkX' :class:`~networkx.Graph`; retaining only the
largest component if the passed information results in an unconnected network.
Parameters
----------
column : :class:`pandas.Series`
"""
# Transform the column into a 3-column dataframe with the nodes' name and the value as weight. Drop 0s, NAs and use absolute value
weights = (
column[column != 0].dropna().abs().rename("weight").reset_index()
) # btw calculations fail with 0 value weights and cfb prob with negative
# Create the NetworkX's Graph and a list of its components
network = networkx.from_pandas_edgelist(weights, "level_0", "level_1", "weight")
components = list(networkx.connected_components(network))
if len(components) == 0:
raise NoNetworkException(
" ".join((
f"EXCEPTION! No connected components in network ({network.number_of_nodes()} nodes):",
self._pkg._name,
self._name,
column.name,
))
)
else:
# Check that the largest connected component has the same size as the total number of nodes, else select the largest component to return as the Graph to be analyzed
largest_component = max(components, key=len)
if len(largest_component) < network.number_of_nodes():
print(
f"WARNING! Unconnected network ({network.number_of_nodes()} nodes):",
self._pkg._name,
self._name,
column.name,
"\n",
f"Largest network component will be used for analysis. Sizes (number of nodes) of all components: {[len(comp) for comp in components]}",
)
network = network.subgraph(largest_component)
return network
[docs]
def analyze(self, elements="edges", metrics="all", cores=1, nodes_dict=Analysis.nodes_dict, edges_dict=Analysis.edges_dict, **kwargs):
r"""Analyze the filtered network
Send the analyses of the filtered network for the passed combinations of
elements-metrics. The individual :external:ref:`Graphs <graph>` saved as private
attributes upon object initialization are analyzed independently with the
:mod:`~AlloViz.AlloViz.Analysis` module (this function calls
:func:`AlloViz.AlloViz.Analysis.analyze`). Results are stored as new instances
of classes from the :mod:`AlloViz.AlloViz.Elements` module, which extend the
:class:`pandas.DataFrame` class.
Parameters
----------
elements : str or list, {"edges", "nodes"}
Network element for which to perform the analysis.
metrics : str or list, default: "all"
Network metrics to compute, which must be keys in the `nodes_dict` or
`edges_dict` dictionaries. Default is "all" and it sends the computation for
all the metrics defined in the corresponding dictionary of the selected
elements in `element`.
cores : int, default: 1
Number of cores to use for parallelization with a `multiprocess` Pool.
Default value only uses 1 core with a custom :class:`AlloViz.utils.dummypool`
that performs computations synchronously.
Other Parameters
----------------
nodes_dict, edges_dict : dict
Optional kwarg(s) of the dictionary(ies) that maps network metrics custom
names (e.g., betweenness centrality, "btw") with their corresponding NetworkX
function (e.g., "networkx.algorithms.centrality.betweenness_centrality").
Functions strings must be written as if they were absolute imports, and must
return a dictionary of edges or nodes, depending on the element dictionary in
which they are. The keys of the dictionaries will be used to name the columns
of the analyzed data that the functions produce. Defaults are
:data:`~AlloViz.AlloViz.Analysis.nodes_dict` and
:data:`~AlloViz.AlloViz.Analysis.edges_dict`.
**kwargs
Other optional keyword arguments that will be passed to the NetworkX analysis
function(s) that is(are) used on the method call in case they need extra
parameters.
"""
# Depending on the desired cores, use a dummypool (synchronous calculations) or a `multiprocess` Pool
# Changing it inside the `utils` module allows to share the same one between modules
if cores > 1:
mypool = Pool(cores)
else:
mypool = utils.dummypool()
utils.pool = mypool
if self._filtdata.size == 0:
print(f"{self._pkg._name} {self._name} is not a connected network (or subnetwork)")
else:
Analysis.analyze(self, elements, metrics, nodes_dict, edges_dict, **kwargs)
# Close the pool
utils.pool.close()
utils.pool.join()
utils.pool = utils.dummypool()