# hin_save_splits.py
import pandas as pd
import networkx as nx
import numpy as np
from bertopic import BERTopic
from tqdm import tqdm
from copy import deepcopy
from sklearn.manifold import TSNE
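
# Builds a heterogeneous information network (HIN) from the compound records, the
# BERTopic topics and the LLaMA document embeddings, and then saves train/test
# splits of selected edge types for the evaluation protocol.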
path = 'path-to-data-repository'
df_file = pd.read_parquet('{}file-name_query03-05.parquet'.format(path))
del df_file['Unnamed: 0']
df_topics = pd.read_parquet('{}topics03-05.parquet'.format(path))
del df_topics['text']
del df_topics['phrases']
# grobid and nougat
df_topics['txt_file_name'] = df_topics['file_name'].str.replace('.pdf', '.txt', regex=False)
# old
# df_topics['txt_file_name'] = df_topics['file_name'].str.replace('.pdf', '.pdf.txt')
del df_topics['file_name']
df_emb = pd.read_parquet(f'{path}df_embs_llama3-1.parquet')
df_topics = df_topics.join(df_emb, on='txt_file_name')
del df_topics['txt_file_name']
# def maintain_shape(x, emb_dim=4608):
# if type(x) is not np.ndarray:
# x = np.zeros(emb_dim)
# elif len(x) == 0:
# x = np.zeros(emb_dim)
# return x
# df_topics['embedding'] = df_topics['embedding'].apply(maintain_shape)
# tsne_reduced = TSNE(n_components = 512, perplexity = 50.0, random_state = 42, method = 'exact').fit_transform(
# np.stack(df_topics['embedding'].values))
# df_topics['embedding'] = [x for x in tsne_reduced]
df_file = df_file.join(df_topics.set_index('doi'), on='doi')
df_file = df_file.fillna('?')
df_emb = df_emb.fillna('?')
df_topics = df_topics.fillna('?')
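
# Build the HIN: one node per doi, one node per feature value or topic name, with
# the document embedding stored as an attribute on the doi node.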
def make_networkx(df, texts_df, topic_model,
                  id_feature='doi', special_attributes={'name': 'smile'}, list_features=[
                      'bioActivity', 'molType', 'collectionSpecie', 'collectionSite', 'collectionType', 'molecularMass', 'monoisotropicMass', 'cLogP', 'tpsa',
                      'numberOfLipinskiViolations', 'numberOfH_bondAcceptors', 'numberOfH_bondDonors', 'numberOfRotableBonds', 'molecularVolume', 'name'
                  ], emb_dim=4096,
                  ):
    def create_edge(value1, value2, group1, group2, node_from):
        # only connect nodes whose values are known ('?' marks a missing value)
        if value1 != '?' and value2 != '?':
            G.add_edge(value1, value2, edge_group=f'{group1}_{group2}')
            G.nodes[value1]['group'] = group1
            G.nodes[value2]['group'] = group2
            G.nodes[value2]['node_from'] = node_from

    def create_attribute(attribute_feature, feature_value, attribute_value):
        if attribute_value != '?':
            try:
                G.nodes[feature_value][attribute_feature] = attribute_value
            except KeyError:
                # the node was never created (its edge was skipped above)
                print('attribute_feature: {}, feature_value: {}, attribute_value: {}'.format(attribute_feature, feature_value, attribute_value))

    G = nx.Graph()
    # edges from the compound records: doi -> each listed feature value
    for _, row in tqdm(df.iterrows()):
        for feature in list_features:
            create_edge(row[id_feature], row[feature], id_feature, feature, 'nubbe')
            if feature in special_attributes:
                create_attribute(special_attributes[feature], row[feature], row[special_attributes[feature]])
    # edges from the extracted texts: doi -> BERTopic topic name, plus the document embedding
    for _, row in texts_df.iterrows():
        for topic in row['topics']:
            create_edge(row[id_feature], topic_model.get_topic_info(topic)['Name'].iloc[0], id_feature, 'topic', 'pdf')
        try:
            if isinstance(row['embedding'], np.ndarray) and len(row['embedding']) > 0:
                G.nodes[row[id_feature]]['embedding'] = row['embedding']
            else:
                G.nodes[row[id_feature]]['embedding'] = np.array([0.0] * emb_dim)
                print(f"doi {row[id_feature]} is NaN")
        except KeyError:
            print('doi {} has no connections'.format(row[id_feature]))
    return G
topic_model = BERTopic.load('{}topic_model03-05'.format(path))
G = make_networkx(df_file, df_topics, topic_model)
# note: nx.write_gpickle/read_gpickle exist only in networkx < 3.0
nx.write_gpickle(G, f"{path}hin_grobid_llama-18-09.gpickle")
## splits
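# disturbed_hin removes a random fraction of the edges of a given type (plus the other
# 'nubbe'-origin edges of the affected doi nodes) and returns the cut graph together
# with train/test edge tables; true_restore then re-inserts a fraction of the hidden
# edges, moving the matching rows from the test split back to the train split.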
def disturbed_hin(G, split=0.6, random_state=None, extra_cut_from='nubbe', edge_group='doi_bioActivity',
                  node_from_feature='node_from', type_feature='edge_group', group_feature='group'):
    """
    G: hin;
    split: fraction of edges to be cut from the hin;
    random_state: seed for the edge shuffling, for reproducibility;
    extra_cut_from: origin whose edges are also cut but not restored;
    edge_group: string with the type of edge to be held out and later restored;
    node_from_feature: node attribute holding the origin of a node;
    type_feature: edge attribute holding the edge type on your hin;
    group_feature: node attribute holding the group of a node.
    """
    def keep_left(x, G):
        # ensure the 'node' column always holds the left-hand side of the edge type (the doi)
        edge_split = x['type'].split('_')
        if G.nodes[x['node']][group_feature] != edge_split[0]:
            x['node'], x['neighbor'] = x['neighbor'], x['node']
        return x

    # prepare data for type counting
    edges = list(G.edges)
    edge_types = [G[edge[0]][edge[1]][type_feature] for edge in edges]
    edges = pd.DataFrame(edges)
    edges = edges.rename(columns={0: 'node', 1: 'neighbor'})
    edges['type'] = edge_types
    edges = edges.apply(keep_left, G=G, axis=1)
    edges_group = edges.groupby(by=['type'], as_index=False).count().reset_index(drop=True)
    # prepare the edges to be cut
    edges = edges.sample(frac=1, random_state=random_state).reset_index(drop=True)
    edges_group = edges_group.rename(columns={'node': 'count', 'neighbor': 'to_cut_count'})
    edges_group['to_cut_count'] = edges_group['to_cut_count'].apply(lambda x: round(x * split))
    train, test = {}, {}
    for _, row in edges_group.iterrows():
        if row['type'] == edge_group:
            train[row['type']] = edges[edges['type'] == row['type']].reset_index(drop=True).loc[row['to_cut_count']:].reset_index(drop=True)
            test[row['type']] = edges[edges['type'] == row['type']].reset_index(drop=True).loc[:row['to_cut_count']-1].reset_index(drop=True)
    G_disturbed = deepcopy(G)
    hidden = {'node': [], 'neighbor_group': []}
    for tc_df in test.values():
        for _, row in tc_df.iterrows():
            neighbors_list = list(G_disturbed.neighbors(row['node']))
            neighbors_hidden = []
            has_cut = False
            for neighbor in neighbors_list:
                if G_disturbed.nodes[neighbor][node_from_feature] == extra_cut_from:
                    has_cut = True
                    neighbors_hidden.append({'neighbor': neighbor, 'edge_group': G_disturbed[row['node']][neighbor][type_feature]})
                    G_disturbed.remove_edge(row['node'], neighbor)
            if has_cut:
                hidden['node'].append(row['node'])
                hidden['neighbor_group'].append(neighbors_hidden)
    return G_disturbed, train, test, pd.DataFrame(hidden)
def true_restore(G, hidden, train, test, percentual=1.0, edge_group='doi_bioActivity', node_feature='node',
                 neighbor_group_feature='neighbor_group', neighbor_feature='neighbor', edge_group_feature='edge_group'):
    """
    Puts back a fraction (percentual) of the hidden edges into the graph and moves the
    corresponding rows from the test split to the train split.
    """
    G_found = deepcopy(G)
    adding_df = hidden.loc[0:round(hidden.shape[0] * percentual)-1]
    remaining_df = hidden.loc[round(hidden.shape[0] * percentual):hidden.shape[0]-1]
    df_train, df_test = train[edge_group], test[edge_group]
    for _, row in adding_df.iterrows():
        df_train = pd.concat([df_train, df_test[df_test[node_feature] == row[node_feature]]])
        df_test = df_test.drop(df_test[df_test[node_feature] == row[node_feature]].index)
        for to_add in row[neighbor_group_feature]:
            # keep the edge attribute name consistent with create_edge ('edge_group')
            G_found.add_edge(row[node_feature], to_add[neighbor_feature], edge_group=to_add[edge_group_feature])
    train[edge_group], test[edge_group] = df_train.reset_index(drop=True), df_test.reset_index(drop=True)
    return G_found, remaining_df.reset_index(drop=True), train, test
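
# For each stage ('1st'..'4th') save the current train/test CSVs and the corresponding
# graph, then restore that stage's fraction of hidden edges before the next stage.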
def execution(G, iteration, edge_group, percentual_to_time):
    G_disturbed, train, test, hidden = disturbed_hin(G, split=0.8, random_state=(1 + iteration), edge_group=edge_group)
    G_found, hidden, train, test = true_restore(G_disturbed, hidden, train, test, percentual=0.0, edge_group=edge_group)
    for key, value in percentual_to_time.items():
        train[edge_group].to_csv(f"{path}new_splits/train_{edge_group}_{iteration}_{key}.csv", index=False)
        test[edge_group].to_csv(f"{path}new_splits/test_{edge_group}_{iteration}_{key}.csv", index=False)
        nx.write_gpickle(G_found, f"{path}new_splits/kg_{edge_group}_{iteration}_{key}.gpickle")
        G_found, hidden, train, test = true_restore(G_found, hidden, train, test, percentual=value, edge_group=edge_group)
edge_groups = ['doi_name', 'doi_bioActivity', 'doi_collectionSpecie', 'doi_collectionSite', 'doi_collectionType']
percentual_to_time = {'1st': 0.3, '2nd': 0.32, '3rd': 0.5, '4th': 0.0}
for edge_group in tqdm(edge_groups):
    for iteration in range(10):
        execution(G, iteration, edge_group, percentual_to_time)
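
# Downstream usage (a minimal sketch, assuming networkx 2.x and the same `path`;
# the file names below match the ones written by execution()):
# train_df = pd.read_csv(f"{path}new_splits/train_doi_bioActivity_0_1st.csv")
# G_split = nx.read_gpickle(f"{path}new_splits/kg_doi_bioActivity_0_1st.gpickle")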