Source code for runClarinet

"""
@author: Yasmine Ahmed
"""

import pandas as pd
import re
import numpy as np
import networkx as nx
import math
import pickle
from community import community_louvain
import matplotlib.pyplot as plt
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import seaborn as sns
import argparse
import os
import time
import logging
from datetime import datetime


def create_eclg(interaction_filename, model_dict):
    """
    Build the Event CoLlaboration Graph (ECLG) from a reading output file in BioRECIPE format.

    A node is an event (e.g., biochemical interaction) written as
    ``regulator->regulated->sign`` where sign is ``+`` or ``-``. Two events are joined by an
    edge when they are attributed to the same paper; the edge attribute ``weight`` counts
    how many times the pair was seen together.

    Parameters
    ----------
    interaction_filename : str
               The path of the reading output file (extracted events)
    model_dict : dict
               Dictionary that holds critical information of each baseline model element

    Returns
    -------
    G : Graph
               Event CoLlaboration Graph

    Notes
    -----
    * The file is split on every ',' character, so quoted fields that contain commas
      are not supported.
    * A row is attributed to a paper when that paper ID is *contained in* the row's
      'Paper IDs' field (substring test, as in the original implementation).
    * Events repeated within one paper are not de-duplicated here, unlike create_eclg_el().
      NOTE(review): confirm whether that difference is intended.
    """
    # FIXME: hard coded here, the 28th column of ReadingOutput file has to be 'Paper IDs'
    Paper_IDs_col = 27
    # FIXME: hard coded here, the 1st, 9th, 17th columns of ReadingOutput file
    # have to be 'Regulator Name', 'Regulated Name' and 'Sign'
    regulator_col = 0
    regulated_col = 8
    interaction_sign_col = 16

    # Parse the file once; the rows are reused for every paper below.
    rows = []
    with open(interaction_filename) as interaction_file:
        for line_number, line in enumerate(interaction_file, start=1):
            if 'Regulator Name' in line:  # header row
                continue
            line = line.strip()
            interaction = re.split(',', line)
            if len(interaction) <= Paper_IDs_col:
                # Blank or truncated line: it has no 'Paper IDs' column to group by.
                if line:
                    logging.warning('Skipping line {} of {}: expected at least {} columns, found {}'.format(
                        line_number, interaction_filename, Paper_IDs_col + 1, len(interaction)))
                continue
            rows.append(interaction)

    # An empty paper ID would be "contained in" every row and merge all events
    # into a single clique, so rows without a paper ID are not treated as a paper.
    papers = [row[Paper_IDs_col] for row in rows if row[Paper_IDs_col] != '']
    Paper_ID_unique = np.unique(papers)
    logging.info('Number of unique paper IDs: {}'.format(str(len(Paper_ID_unique))))

    G = nx.Graph()  # this will contain ECLG nodes and edges
    if len(Paper_ID_unique) == 0:
        return G
    logging.info('Average interactions per paper: {}'.format(str(len(papers) / len(Paper_ID_unique))))

    # Cache of already matched element names, shared by all getVariableName() calls.
    # Matching results depend on the order of the calls, so the iteration order
    # (papers in sorted order, rows in file order) is kept as in the original.
    curr_map = dict()

    for paper_id in Paper_ID_unique:
        # Collect the events attributed to this paper
        my_list = []
        for interaction in rows:
            if paper_id not in interaction[Paper_IDs_col]:
                continue
            if interaction[regulator_col] == '' or interaction[regulated_col] == '':
                continue
            elem11 = getVariableName(model_dict, curr_map, interaction[regulator_col:regulated_col])
            elem22 = getVariableName(model_dict, curr_map, interaction[regulated_col:interaction_sign_col])
            sign = '+' if interaction[interaction_sign_col] == 'positive' else '-'
            my_list.append(elem11 + '->' + elem22 + '->' + sign)

        # Connect every pair of events of this paper
        for n in range(0, len(my_list) - 1):
            for nn in range(n + 1, len(my_list)):
                first, second = my_list[n], my_list[nn]
                if first == second:
                    # No self loops; the event itself still becomes a node.
                    G.add_node(first)
                elif G.has_edge(first, second):
                    G[first][second]['weight'] += 1
                else:
                    G.add_edge(first, second, weight=1)

    return G


def create_eclg_el(interaction_filename, model_dict):
    """
    Build the Event CoLlaboration Graph (ECLG) from an element-based reading output spreadsheet.

    A node is an event (e.g., biochemical interaction) written as
    ``regulator->regulated->sign`` where sign is ``+`` or ``-``. Two events are joined by an
    edge when they are attributed to the same paper; the edge attribute ``weight`` counts
    the papers in which the pair was seen together. The reading output file header must
    contain the following fields: Element Name, Element Type, Element Identifier,
    PosReg Name/Type/ID, NegReg Name/Type/ID and Paper ID.

    Parameters
    ----------
    interaction_filename : str
               The path of the reading output file (extracted events)
    model_dict : dict
               Dictionary that holds critical information of each baseline model element

    Returns
    -------
    G : Graph
               Event CoLlaboration Graph

    Notes
    -----
    A row is attributed to a paper when that paper ID is *contained in* the row's
    'Paper ID' cell (substring test on the cell text, as in the original implementation).
    """
    df = pd.read_excel(interaction_filename)

    def _cell_text(value):
        # Missing cells become '' so that getVariableName() treats them as "no element".
        if pd.isna(value):
            return ''
        return str(value).strip()

    def _element_info(name, element_type, element_id):
        # getVariableName() reads the name at index 0, the type at index 1 and the ID at index 5.
        return [name, element_type, '', '', '', element_id, '', '']

    # Remove rows in which a name cell holds a datetime value
    keep = [
        not (isinstance(name, datetime) or isinstance(pos, datetime) or isinstance(neg, datetime))
        for name, pos, neg in zip(df['Element Name'], df['PosReg Name'], df['NegReg Name'])
    ]
    df = df.loc[np.array(keep, dtype=bool)]

    columns = ['Paper ID',
               'Element Name', 'Element Type', 'Element Identifier',
               'PosReg Name', 'PosReg Type', 'PosReg ID',
               'NegReg Name', 'NegReg Type', 'NegReg ID']
    rows = [tuple(_cell_text(value) for value in record)
            for record in zip(*(df[column] for column in columns))]

    # Rows without a paper ID are left out: '' would be "contained in" every cell.
    Paper_ID_unique = np.unique([row[0] for row in rows if row[0] != ''])

    G = nx.Graph()
    # Cache of already matched element names, shared by all getVariableName() calls.
    # Matching results depend on the order of the calls, so the iteration order
    # (papers in sorted order, rows in sheet order) is kept as in the original.
    curr_map = dict()

    for paper_id in Paper_ID_unique:
        my_list = []
        for (row_paper, ele_name, ele_type, ele_id,
             pos_name, pos_type, pos_id,
             neg_name, neg_type, neg_id) in rows:
            if paper_id not in row_paper:
                continue
            elem = getVariableName(model_dict, curr_map, _element_info(ele_name, ele_type, ele_id))
            Pos = getVariableName(model_dict, curr_map, _element_info(pos_name, pos_type, pos_id))
            Neg = getVariableName(model_dict, curr_map, _element_info(neg_name, neg_type, neg_id))

            for regulator, sign in ((Pos, '+'), (Neg, '-')):
                # '' means missing/invalid name; "nan_ext" is kept as a guard for a literal "nan" name
                if regulator == '' or regulator == 'nan_ext':
                    continue
                for reg in re.split(',', regulator):
                    my_list.append(reg + '->' + elem + '->' + sign)

        # Each event counts once per paper; dict.fromkeys keeps first-seen order so the
        # node/edge insertion order of the graph is the same on every run.
        my_list = list(dict.fromkeys(my_list))
        for n in range(0, len(my_list) - 1):
            for nn in range(n + 1, len(my_list)):
                if G.has_edge(my_list[n], my_list[nn]):
                    G[my_list[n]][my_list[nn]]['weight'] += 1
                else:
                    G.add_edge(my_list[n], my_list[nn], weight=1)
    return G

# Individual assessment (IA)
def node_weighting(G, freqTh, path):
    """
    This function assigns weights to graph nodes using frequency class, and returns a new ECLG after removing less frequent nodes.
    In the meantime, ECLG nodes and their freqClass level, ECLG edges before and after the removal will be saved to specified directory.

    The frequency class of a node is ``floor(0.5 - log2(degree / max_degree))`` and is appended
    to the node name as ``name->freqClass``. Nodes of degree 0 are neither renamed nor reported.

    Parameters
    ----------
    G : undirected graph
        Event CoLlaboration Graph
    freqTh : int
        Frequency class threshold value, events (nodes) having FC greater than this value will be removed
    path : str
        The output directory where the genereted files will be saved
        (files written: freqClass, ECLGbefore.txt, ECLGafter.txt)

    Returns
    -------
    G : undirected graph
        a new ECLG after the removal of less frequent nodes

    """
    threshold = int(freqTh)  # may vary from case study to another (set it =2 for T cell case study)

    # Degree of the most connected node; 0 for a graph without nodes or edges
    freqMostCommonNode = max((degree for _, degree in G.degree), default=0)

    def _freq_class(degree):
        return math.floor(0.5 - np.log2(degree / freqMostCommonNode))

    # Map node names to "name->frequency class" and relabel all of them in one pass
    mapping = {node: node + '->' + str(_freq_class(degree))
               for node, degree in G.degree if degree != 0}
    G = nx.relabel_nodes(G, mapping)

    # ECLG nodes and edges before the removal of less frequent nodes
    nx.write_edgelist(G, os.path.join(path, "ECLGbefore.txt"))

    ebunch = list()  # less frequent nodes that will be removed
    with open(os.path.join(path, "freqClass"), 'w') as output_stream:
        for node, degree in G.degree:
            if degree == 0:
                continue
            freqCLASS = _freq_class(degree)
            if freqCLASS > threshold:
                ebunch.append(node)
            output_stream.write(node + ' ' + str(degree) + ' ' + str(freqCLASS) + '\n')
    G.remove_nodes_from(ebunch)

    # ECLG nodes and edges after the removal of less frequent nodes
    nx.write_edgelist(G, os.path.join(path, "ECLGafter.txt"))

    return G

# Pair assessment (PA)
def edge_weighting(G, path, weightMethod):
    """
    This function assigns weights to graph edges using frequency class (FC) or inverse frequency formula (IF), and returns a weighted ECLG.
    In the meantime, ECLG edges and their weights will be saved to specified directory.

    * 'FC': ``floor(0.5 - log2(w / max_w))`` where ``w`` is the current edge weight and
      ``max_w`` the largest edge weight of the graph.
    * 'IF': ``w * (ln(N / deg(n1)) + ln(N / deg(n2)))`` clipped at 0, where ``N`` is the
      number of nodes of the graph.

    Parameters
    ----------
    G : undirected graph
        Event CoLlaboration Graph; its edge attribute 'weight' is overwritten in place
    path : str
        The output directory where the genereted files will be saved (file written: LSS)
    weightMethod : str
        'FC' or 'IF'; for any other value the weights are left unchanged and LSS stays empty

    Returns
    -------
    G : undirected graph
        ECLG after assigning weights to edges
    """
    fill = os.path.join(path, "LSS")  # This file will be input to get_cluster_info()
    N = G.number_of_nodes()

    with open(fill, 'w') as output_stream:
        if G.number_of_edges() == 0:
            # Nothing to weight; an empty LSS file is still produced.
            return G

        maxWeight = int(max(G[n1][n2]['weight'] for n1, n2 in G.edges))

        if weightMethod not in ('FC', 'IF'):
            logging.warning('Unknown weightMethod {!r}: edge weights left unchanged'.format(weightMethod))
            return G

        for n1, n2 in G.edges:
            w1 = G[n1][n2]['weight']
            if weightMethod == 'FC':
                dd = math.floor(0.5 - np.log2(w1 / maxWeight))
            else:
                p1 = np.log(N / G.degree[n1])
                p2 = np.log(N / G.degree[n2])
                dd = w1 * (p1 + p2)
                if dd < 0:
                    dd = 0
            G[n1][n2]['weight'] = dd
            output_stream.write(n1 + ' ' + n2 + ' ' + str(dd) + '\n')
    return G

def clustering(G, path):
    """
    This function implements three things:
    (1) clusters the ECLG using the community detection algorithm by Blondel et al., and writes a pickle file (grouped_ext)
    containing the grouped (clustered) extensions, specified as nested lists. Each group starts with an integer index,
    followed by one entry [event 1, event 2, edge weight] for every ECLG edge between two events of the group;
    (2) displays the cluster result;
    (3) saves each cluster in a separate file, in both uninterpreted (under GeneratedClusters/) and interpreted manners (under InterpretedClusters/).

    Communities with fewer than two events are discarded. Event names are expected to have
    four '->'-separated parts (regulator, regulated, sign, frequency class).

    Parameters
    ----------
    G : undirected graph
        Event CoLlaboration Graph
    path : str
        The output directory where the genereted files will be saved

    """
    # Clustering
    partition = community_louvain.best_partition(G)

    # Group the nodes by community label once, keeping the order in which partition lists them
    members = {}
    for node, label in partition.items():
        members.setdefault(label, []).append(node)

    centers = {}       # center node -> community label
    communities = {}   # community label -> center node
    G_main_com = G.copy()
    min_nb = 2
    com_edges = list()
    group_num = 1
    for com in set(partition.values()):
        list_nodes = members[com]
        if len(list_nodes) < min_nb:
            G_main_com.remove_nodes_from(list_nodes)
            continue

        # Get center: the node with the highest degree centrality inside the community
        H = G_main_com.subgraph(list_nodes)
        d_c = nx.degree_centrality(H)
        center = max(d_c, key=d_c.get)
        centers[center] = com
        communities[com] = center
        # Print community
        logging.info('Community centered at "{}"(community label is {}) has {} interactions:\n{}\n'.format(center, com, len(list_nodes),list_nodes))

        # Collect the weighted ECLG edges that connect two members of the community
        NODESS = list()
        for ii in range(0, len(list_nodes) - 1):
            for jj in range(ii + 1, len(list_nodes)):
                if G.has_edge(list_nodes[ii], list_nodes[jj]):
                    wi = G[list_nodes[ii]][list_nodes[jj]]['weight']
                    NODESS.append([list_nodes[ii], list_nodes[jj], wi])
        com_edges.append([group_num] + NODESS)
        group_num = group_num + 1

    # all clusters in one pickle file
    with open(os.path.join(path, "grouped_ext"), 'wb') as pickle_file:
        pickle.dump(com_edges, pickle_file)

    # Display graph
    plt.figure(figsize=(13, 5))
    node_size = 30
    pos = nx.spring_layout(G_main_com)
    colors = dict(zip(communities.keys(), sns.color_palette('hls', len(communities.keys()))))
    center_nodes = set(communities.values())
    for com in communities.keys():
        list_nodes = [node for node in members[com] if node not in center_nodes]
        nx.draw_networkx_nodes(G_main_com, pos, list_nodes, node_size = node_size, node_color = colors[com])
        # the community center is drawn larger
        nx.draw_networkx_nodes(G_main_com, pos, list([communities[com]]), node_size = node_size*5, node_color = colors[com])
    nx.draw_networkx_edges(G_main_com, pos, alpha=0.5)
    labels = {center: center for center in centers}  # only centers are labelled
    nx.draw_networkx_labels(G_main_com, pos, labels)
    plt.axis('off')
    plt.show()

    # Save each cluster in a separate file

    thePath = os.path.join(path, "GeneratedClusters")  # Directory containing generated clusters
    os.makedirs(thePath, exist_ok=True)
    clusterFile = os.path.join(thePath, 'GeneratedCluster')  # generated (uninterpreted) clusters

    for e in com_edges:
        written = set()
        with open(clusterFile + str(e[0]), 'w') as output_stream:
            for ee in e[1:]:
                if ee[1] == ee[0]:
                    continue
                line = ee[0] + ' ' + ee[1] + ' ' + str(ee[2]) + '\n'
                if line not in written:
                    output_stream.write(line)
                    written.add(line)

    thePath = os.path.join(path, "InterpretedClusters")
    os.makedirs(thePath, exist_ok=True)
    clusterFile = os.path.join(thePath, 'InterpretedCluster')  # interpreted clusters

    for e in com_edges:
        written = set()
        with open(clusterFile + str(e[0]), 'w') as output_stream:
            for ee in e[1:]:
                s0 = ee[0].split('->')
                s1 = ee[1].split('->')
                if ee[1] == ee[0]:
                    continue
                line0 = s0[0] + ' ' + s0[1] + ' ' + s0[2] + ' ' + s0[3] + '\n'
                line1 = s1[0] + ' ' + s1[1] + ' ' + s1[2] + ' ' + s1[3] + '\n'
                # Both events of the edge are written only when neither has been written before
                if line0 not in written and line1 not in written:
                    output_stream.write(line0)
                    output_stream.write(line1)
                    written.add(line0)
                    written.add(line1)

def get_cluster_info(generated_clu_path, LSS_file, output_path):
    """
    This function returns some basic information about each of these generated clusters as a DataFrame object, as well as saves it as .csv file
    Information includes Cluster_index, Nodes, Edges, Density, AvgPathLength, Coeff, LSS, NodesX, EdgesX, DensityX, AvgPathLength, CoeffX, FreqClass, node_perc.

    The first group of columns describes the generated cluster (nodes are events), the
    columns from NodesX to CoeffX describe the interpreted cluster (nodes are elements,
    edges are events). LSS is the mean edge weight read from the cluster file, FreqClass the
    number of events whose frequency class is 0, and node_perc the percentage of elements
    whose name does not end with "_ext".

    Parameters
    ----------
    generated_clu_path : str
        The directory that contains the genereted clusters
    LSS_file : str
        The path of LSS_file, containing ECLG edges and their weights, generated in edge_weighting().
        Kept for backward compatibility; it is not read, because the edge weights are taken
        from the cluster files themselves.
    output_path : str
        The output directory where ClusterInfoFile.csv will be saved

    Returns
    -------
    cluster_df : pandas.DataFrame()
        DataFrame that contains information for each generated cluster
    """

    def _mean_path_length(graph):
        # Mean of all shortest path lengths, including the 0-length path of every node to itself
        lengths = []
        for source in graph.nodes():
            lengths.extend(dict(nx.single_source_shortest_path_length(graph, source)).values())
        return sum(lengths) / len(lengths)

    entries = os.listdir(generated_clu_path)  # directory of generated clusters
    NewFile = os.path.join(output_path, "ClusterInfoFile.csv")  # output file that contains clusters'info
    # NOTE(review): 'AvgPathLength' appears twice (generated and interpreted cluster);
    # the names are kept as they are because callers may rely on them.
    cluster_df = pd.DataFrame(columns=['Cluster_index', 'Nodes', 'Edges', 'Density', 'AvgPathLength', 'Coeff', 'LSS', 'NodesX', 'EdgesX', 'DensityX', 'AvgPathLength', 'CoeffX', 'FreqClass', 'node_perc'])

    with open(NewFile, "w") as output_stream:
        output_stream.write("Cluster_index, Nodes, Edges, Density, AvgPathLength, Coeff, LSS, NodesX, EdgesX, DensityX, AvgPathLength, CoeffX, FreqClass, node_perc"+'\n')

        for entry in entries:
            if not entry.startswith("GeneratedCluster"):
                continue
            # The cluster index is made of the digits found in the file name
            Cl = ''.join(ch for ch in entry if ch.isdigit())

            G = nx.Graph()   # generated cluster
            G1 = nx.Graph()  # interpreted cluster
            weights = []

            with open(os.path.join(generated_clu_path, entry)) as f:
                for c in f:
                    x = c.strip()
                    if not x:
                        continue
                    xx = x.split(" ")
                    G.add_edge(xx[0], xx[1], weight=xx[2])
                    weights.append(float(xx[2]))
                    # An event is "regulator->regulated->sign->frequency class"
                    for event in (xx[0], xx[1]):
                        parts = event.split('->')
                        G1.add_edge(parts[0], parts[1], weight=parts[3])  # parts[2] is +/- based on type of interaction

            if not weights:
                # Without edges none of the statistics below is defined
                logging.warning('Skipping empty cluster file {}'.format(entry))
                continue

            # compute % node overlap: elements that are in the baseline model (no "_ext" suffix)
            nod = sum(1 for no in G1.nodes() if not no.endswith('_ext'))
            node_perc = 100 * (nod / G1.number_of_nodes())

            # number of events with frequency class 0
            freqCount = sum(1 for n in G.nodes() if str(n.split("->")[3]) == '0')

            Paths = _mean_path_length(G)
            Paths1 = _mean_path_length(G1)

            cluster_df.loc[int(Cl)]= [int(Cl), G.number_of_nodes(), G.number_of_edges(), nx.density(G), Paths, nx.average_clustering(G), sum(weights)/len(weights), G1.number_of_nodes(), G1.number_of_edges(), nx.density(G1), Paths1, nx.average_clustering(G1), freqCount, node_perc]
            output_stream.write(', '.join(str(x) for x in cluster_df.loc[int(Cl)]) + '\n')
    return cluster_df

def merge_clusters(regulators, path, ReturnTh):
    """
    This function records indices of clusters to be merged based on the existence of return paths.
    It generates the grouped_ext_Merged pickle file that contains the merged clusters.

    Every pair of clusters is examined; when the number of return paths found for the pair
    exceeds ReturnTh, the union of both clusters is stored as a new group of the form
    ``[group index, [regulator, regulated, '+' or '-'], ...]``.

    Parameters
    ----------
    regulators : dict
        Contains baseline model elements and corresponding regulator elements
    path : str
        The path of the directory that contains the grouped_ext file
    ReturnTh : int
        A user-defined integer threshold for the number of return paths, beyond which clusters will be merged
    """
    # Merge clusters if there is one or more return paths

    def _cluster_digraph(cluster):
        # Directed graph of the events of one cluster; weight 1 = activation, 0 = inhibition.
        # NOTE(review): only the second event of every [event 1, event 2, weight] entry is
        # used, as in the original code, so an event that only ever appears in first
        # position is left out - confirm whether this is intended.
        graph = nx.DiGraph()
        for e in cluster[1:]:
            ee = e[1].split('->')
            if ee[2] == '+':
                graph.add_edge(ee[0], ee[1], weight=1)
            elif ee[2] == '-':
                graph.add_edge(ee[0], ee[1], weight=0)
        return graph

    G = make_diGraph(regulators)  # baseline model
    threshold = int(ReturnTh)     # threshold for the number of return paths

    # NOTE: pickle can execute arbitrary code while loading; only use a grouped_ext
    # file produced by clustering() or coming from another trusted source.
    with open(os.path.join(path, "grouped_ext"), 'rb') as pickle_file:
        extensions = pickle.load(pickle_file)

    # Build the directed graph of every cluster once instead of once per pair
    cluster_graphs = [_cluster_digraph(cluster) for cluster in extensions]

    com_edges = list()
    group_num = 1
    for ii in range(0, len(extensions)):
        for jj in range(ii + 1, len(extensions)):
            G1 = cluster_graphs[ii]
            G2 = cluster_graphs[jj]

            # Count return paths: for each baseline edge whose source is in the first cluster,
            # follow baseline successors of both edge ends into the second cluster and from
            # there, along an edge of the second cluster, back to a baseline element.
            # NOTE(review): both branches of the original were guarded by "g[0] in G1";
            # that condition is kept unchanged here.
            count = 0
            for g in G.edges:
                if g[0] not in G1:
                    continue
                for start in (g[1], g[0]):
                    for ne in G.successors(start):
                        if ne in G2:
                            for ne1 in G2.successors(ne):
                                if ne1 in G:
                                    count = count + 1

            if count > threshold:
                logging.info('Merge clusters NO.{} and NO.{}'.format(str(ii+1),str(jj+1)))
                Gall = nx.compose(G1, G2)
                NODESS = list()
                for (node1, node2, data) in Gall.edges(data=True):
                    temp = [node1, node2]
                    if data['weight'] == 0:
                        temp.append('-')
                    elif data['weight'] == 1:
                        temp.append('+')
                    NODESS.append(temp)
                com_edges.append([group_num] + NODESS)
                group_num = group_num + 1

    # Merged clusters
    with open(os.path.join(path, "grouped_ext_Merged"), 'wb') as pickle_file:
        pickle.dump(com_edges, pickle_file)

# get_model() and getVariableName() below are inherited from the DySE framework.
# Characters that are valid in variable names, written as the inside of a regex
# character class: ASCII letters, digits, '@', '_' and '/'. Users wrap it in
# '[' ... ']' (match valid characters) or '[^' ... ']' (match invalid ones).
_VALID_CHARS = r'a-zA-Z0-9\@\_\/'
def get_model(model_file: str):
    """
    This function reads the baseline model of BioRECIPES format and returns two useful dictionaries

    Rows without a variable name are skipped. Regulator names are every run of valid
    variable-name characters found in the positive and negative regulation rules.

    Parameters
    ----------
    model_file : str
        The path of the baseline model file

    Returns
    -------
    model_dict : dict
        Dictionary that holds critical information of each baseline model element, keyed by
        variable name: 'name' (str), 'ids' (list of upper-case IDs), 'type' (str) and
        'regulators' (set of variable names)
    regulators : dict
        Contains baseline model elements and corresponding regulator elements

    Raises
    ------
    IndexError
        If one of the required columns cannot be found in the model file
    """

    regulators = dict()
    model_dict = dict()

    # Load the input file containing elements and regulators
    df_model = pd.read_excel(model_file, na_values='NaN', keep_default_na = False)
    # check model format
    if df_model.columns[0].lower() == 'element attributes':
        df_model = df_model.reset_index()
        df_model = df_model.rename(columns=df_model.iloc[1]).drop([0,1]).set_index('#')

    def _columns_containing(keyword):
        # The header labels are returned untouched so they stay valid lookup keys.
        found = [col for col in df_model.columns if keyword in str(col).lower()]
        if not found:
            raise IndexError('Baseline model file has no column containing "{}"'.format(keyword))
        return found

    def _cell(row, column):
        # Cell content as stripped text; missing (NaN) cells become ''.
        value = df_model.loc[row, column]
        if pd.isna(value):
            return ''
        return str(value).strip()

    # When several columns match, the last 'element name' column and the first of
    # every other kind are used (as in the original implementation).
    col_name = _columns_containing('element name')[-1]
    col_ids = _columns_containing('element ids')[0]
    col_type = _columns_containing('element type')[0]
    col_X = _columns_containing('variable')[0]
    col_A = _columns_containing('positive regulation rule')[0]
    col_I = _columns_containing('negative regulation rule')[0]

    name_pattern = re.compile('[' + _VALID_CHARS + ']+')

    for curr_row in df_model.index:
        var_name = _cell(curr_row, col_X)
        if var_name == '':
            continue

        element_name = _cell(curr_row, col_name)
        element_type = _cell(curr_row, col_type)
        # Upper-case, comma-separated IDs; blanks are dropped so that an empty
        # cell gives [] rather than [''] (which would match any element without ID).
        ids = [one_id.strip()
               for one_id in _cell(curr_row, col_ids).upper().split(',')
               if one_id.strip()]

        curr = (name_pattern.findall(_cell(curr_row, col_A))
                + name_pattern.findall(_cell(curr_row, col_I)))

        # returning regulators separately for compatibility with runMarkovCluster
        regulators[var_name] = set(curr)
        model_dict[var_name] = {
            'name' : element_name,
            'ids' : ids,
            'type' : element_type,
            'regulators' : set(curr)}

    return model_dict, regulators

def getVariableName(model_dict, curr_map, ext_element_info):
    """
    A utility function for create_eclg() and create_eclg_el(), which matches the element name from the extracted event to an element in the baseline model

    Every baseline element is scored: 1 when the extracted ID is one of its IDs, otherwise
    0.8 when one of the two names is a prefix of the other (case-insensitive); a scored
    element earns 1 more when its type starts with the extracted type. The first element
    with the highest score wins.

    Parameters
    ----------
    model_dict : dict
        Dictionary that holds critical information of each baseline model element
    curr_map: dict
        Temporary dictionary that contains already matched pairs, keyed by extracted element
        name; it is updated in place with every new match
    ext_element_info: list
        List of information for certain element in the extracted event: the element name at
        index 0, the element type at index 1 and the element ID at index 5

    Returns
    -------
    match : str
        The most likely matched element name in model_dict, to the element represented by ext_element_info; Otherwise, return the extended element name suffix by "_ext".
        An empty string is returned when the name is missing, is not text, or contains invalid characters.
    """

    ext_element_name = ext_element_info[0]

    # Check for valid element name
    if not isinstance(ext_element_name, str) or ext_element_name == '':
        return ''
    if re.search('[^' + _VALID_CHARS + ']+', ext_element_name):
        return ''

    # FIXME: positions are hard-coded: type at index 1, ID at index 5
    ext_element_id = str(ext_element_info[5]).strip().upper()
    ext_element_type = str(ext_element_info[1]).strip().lower()

    if ext_element_name in curr_map:
        return curr_map[ext_element_name]

    ext_name_upper = ext_element_name.upper()

    match = ext_element_name + '_ext'
    confidence = 0.0
    # Iterate all names in the dictionary and find the most likely match
    for key, value in model_dict.items():
        model_name_upper = value['name'].upper()

        curr_conf = 0.0
        # An empty ID or an empty model name carries no information and must not match
        if ext_element_id and ext_element_id in value['ids']:
            curr_conf = 1
        elif model_name_upper and (ext_name_upper.startswith(model_name_upper)
                                   or model_name_upper.startswith(ext_name_upper)):
            curr_conf = 0.8

        if curr_conf > 0 and value['type'].lower().startswith(ext_element_type):
            curr_conf += 1

        if curr_conf > confidence:
            match = key
            confidence = curr_conf
            if curr_conf == 2:  # best possible score, stop searching
                break

    curr_map[ext_element_name] = match
    return match


def make_diGraph(mdldict):
    """
    A utility function for merge_clusters(), this function converts the baseline model into a directed graph.

    Every key of the dictionary becomes a node, and every value listed under a key
    becomes the source of an edge pointing to that key.

    Parameters
    ----------
    mdldict : dict
        Contains baseline model elements and corresponding regulator elements

    Returns
    -------
    G : DiGraph()
        Directed graph of the baseline model
    """
    graph = nx.DiGraph()
    for element, element_regulators in mdldict.items():
        # the element is added explicitly so that it exists even without regulators
        graph.add_node(element)
        graph.add_edges_from((regulator, element) for regulator in element_regulators)
    return graph

def get_args():
    """
    Parse the command line of the CLARINET script.

    Five positional arguments are expected, all kept as strings:
    ReadingOutput, Baseline, out, ReturnTh and FCTh.

    Returns
    -------
    args : argparse.Namespace
        The parsed arguments, one attribute per positional argument
    """
    # (argument name, help text) in command-line order
    positional_arguments = (
        ('ReadingOutput', "Reading output spreadsheet"),
        ('Baseline', "Baseline model in BioRECIPES format"),
        ('out', "Output directory"),
        ('ReturnTh', "Return path threshold"),
        ('FCTh', "Frequency class threshold"),
    )

    cli = argparse.ArgumentParser(description="Network model extension using CLARINET")
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, type=str, help=argument_help)

    return cli.parse_args()


def main():
    """
    Run the whole CLARINET pipeline from the command line.

    Steps: read the baseline model, build the ECLG from the reading output, weight and
    filter its nodes, weight its edges, cluster it, summarize the clusters and finally
    merge clusters connected by return paths. All files are written to the output directory.
    """

    t0 = time.time()

    args = get_args()

    # The pipeline reports its results (paper counts, communities, merged clusters) through
    # logging.info(), which stays invisible at the default WARNING level. basicConfig() does
    # nothing when logging has already been configured.
    logging.basicConfig(level=logging.INFO)

    # Every step writes into the output directory, so make sure it exists.
    os.makedirs(args.out, exist_ok=True)

    # Reading output (RO) .csv file in BioRECIPE format;
    # for the element-based spreadsheet RO format use create_eclg_el instead
    interaction_filename = args.ReadingOutput
    # Baseline model in BioRECIPES tabular format
    model_dict, regulators = get_model(args.Baseline)
    weightMethod = 'FC' # or 'IF'
    G = create_eclg(interaction_filename, model_dict) # or use create_eclg_el
    G = node_weighting(G, args.FCTh, args.out)
    G = edge_weighting(G, args.out, weightMethod)
    clustering(G, args.out)
    # Get cluster information
    get_cluster_info(os.path.join(args.out,"GeneratedClusters"), os.path.join(args.out,"LSS"), args.out)
    merge_clusters(regulators, args.out, args.ReturnTh)

    t1 = time.time()
    total = t1-t0
    print("time to run CLARINET in seconds: " + str(total))


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()