Spaces:

Metin
/

DataMiningProjectDemo

Sleeping

App Files Files Community

Metin committed 16 days ago

Commit

d97a439

1 Parent(s): 39daa82

Initial commit

Browse files

Files changed (36) hide show

input/test_data_with_embeddings.parquet +3 -0
input/train_data_with_embeddings.parquet +3 -0
input/val_data_with_embeddings.parquet +3 -0
models/embedding/gte-multilingual-base/.gitattributes +36 -0
models/embedding/gte-multilingual-base/1_Pooling/config.json +7 -0
models/embedding/gte-multilingual-base/README.md +0 -0
models/embedding/gte-multilingual-base/config.json +43 -0
models/embedding/gte-multilingual-base/model.safetensors +3 -0
models/embedding/gte-multilingual-base/modules.json +20 -0
models/embedding/gte-multilingual-base/scripts/gte_embedding.py +154 -0
models/embedding/gte-multilingual-base/sentence_bert_config.json +4 -0
models/embedding/gte-multilingual-base/special_tokens_map.json +51 -0
models/embedding/gte-multilingual-base/tokenizer.json +3 -0
models/embedding/gte-multilingual-base/tokenizer_config.json +54 -0
models/no_edge_gnn/gnn_classifier_model.pth +3 -0
models/no_edge_gnn/gnn_graph_data.pt +3 -0
models/no_edge_gnn/label_mapping.pt +3 -0
models/no_edge_gnn/title_to_id.pt +3 -0
models/undirected_gnn/gnn_classifier_model.pth +3 -0
models/undirected_gnn/gnn_graph_data.pt +3 -0
models/undirected_gnn/label_mapping.pt +3 -0
models/undirected_gnn/title_to_id.pt +3 -0
src/__pycache__/config.cpython-311.pyc +0 -0
src/__pycache__/embedding.cpython-311.pyc +0 -0
src/__pycache__/gnn.cpython-311.pyc +0 -0
src/__pycache__/heuristic.cpython-311.pyc +0 -0
src/__pycache__/utils.cpython-311.pyc +0 -0
src/__pycache__/visualization.cpython-311.pyc +0 -0
src/config.py +115 -0
src/demo.py +265 -0
src/embedding.py +31 -0
src/gnn.py +164 -0
src/heuristic.py +87 -0
src/streamlit_app.py +0 -40
src/utils.py +140 -0
src/visualization.py +23 -0

input/test_data_with_embeddings.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:039cbc8a7b595f3e10a29e496bd3c44eeba116cb7c6977c86277f05a398e9dcf
+size 6799466

input/train_data_with_embeddings.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e6b545f46a7ab93b8b11092b2bee5ce78973ea9c91d59b8d3e7ff88a7f5beb6
+size 121744699

input/val_data_with_embeddings.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4888805e8ca378c6040a92334ca68c04449d667907c3b758e53b8ef312616fe8
+size 6695324

models/embedding/gte-multilingual-base/.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

models/embedding/gte-multilingual-base/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "word_embedding_dimension": 768,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}

models/embedding/gte-multilingual-base/README.md ADDED Viewed

The diff for this file is too large to render. See raw diff

models/embedding/gte-multilingual-base/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "architectures": [
+    "NewModel",
+    "NewForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+    "AutoModelForMaskedLM": "Alibaba-NLP/new-impl--modeling.NewForMaskedLM",
+    "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+    "AutoModelForMultipleChoice": "Alibaba-NLP/new-impl--modeling.NewForMultipleChoice",
+    "AutoModelForQuestionAnswering": "Alibaba-NLP/new-impl--modeling.NewForQuestionAnswering",
+    "AutoModelForSequenceClassification": "Alibaba-NLP/new-impl--modeling.NewForSequenceClassification",
+    "AutoModelForTokenClassification": "Alibaba-NLP/new-impl--modeling.NewForTokenClassification"
+  },
+  "classifier_dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "layer_norm_type": "layer_norm",
+  "max_position_embeddings": 8192,
+  "model_type": "new",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "num_labels": 1,
+  "pack_qkv": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "rope",
+  "rope_scaling": {
+    "factor": 8.0,
+    "type": "ntk"
+  },
+  "rope_theta": 20000,
+  "torch_dtype": "float16",
+  "transformers_version": "4.39.1",
+  "type_vocab_size": 1,
+  "unpad_inputs": false,
+  "use_memory_efficient_attention": false,
+  "vocab_size": 250048
+}

models/embedding/gte-multilingual-base/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a35a10faa54da7717870af1517c9b41e9bd8e3880bc5a8e9363d4c3c63e9b0
+size 610753338

models/embedding/gte-multilingual-base/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

models/embedding/gte-multilingual-base/scripts/gte_embedding.py ADDED Viewed

	@@ -0,0 +1,154 @@

+# coding=utf-8
+# Copyright 2024 The GTE Team Authors and Alibaba Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+from collections import defaultdict
+from typing import Dict, List, Tuple
+import numpy as np
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+from transformers.utils import is_torch_npu_available
+class GTEEmbeddidng(torch.nn.Module):
+    """Encode texts into dense vectors and sparse token weights with a GTE checkpoint.
+    The dense vector of a text is the model's hidden state at sequence position 0,
+    cut to ``dimension`` features. The sparse representation maps decoded tokens
+    to the ReLU of the token-classification logits.
+    """
+    def __init__(self,
+                 model_name: str = None,
+                 normalized: bool = True,
+                 use_fp16: bool = True,
+                 device: str = None
+                ):
+        """Load the tokenizer and model and move the model to the selected device.
+        Args:
+            model_name: Name or path handed to ``from_pretrained``.
+            normalized: Whether dense vectors are normalized along the last axis.
+            use_fp16: Load the weights as ``torch.float16``. Forced to ``False``
+                when no device is given and the automatic choice falls back to CPU.
+            device: Explicit device string. When empty, CUDA, MPS, NPU and CPU
+                are tried in that order.
+        """
+        super().__init__()
+        self.normalized = normalized
+        if device:
+            # NOTE(review): an explicit device (including "cpu") keeps use_fp16 as
+            # passed; confirm half precision is intended in that case.
+            self.device = torch.device(device)
+        else:
+            if torch.cuda.is_available():
+                self.device = torch.device("cuda")
+            elif torch.backends.mps.is_available():
+                self.device = torch.device("mps")
+            elif is_torch_npu_available():
+                self.device = torch.device("npu")
+            else:
+                self.device = torch.device("cpu")
+                use_fp16 = False
+        self.use_fp16 = use_fp16
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForTokenClassification.from_pretrained(
+            model_name, trust_remote_code=True, torch_dtype=torch.float16 if self.use_fp16 else None
+        )
+        self.vocab_size = self.model.config.vocab_size
+        self.model.to(self.device)
+    def _process_token_weights(self, token_weights: np.ndarray, input_ids: list):
+        """Map one sequence's per-position weights to ``{decoded token: max weight}``.
+        Positions holding the CLS, EOS, PAD or UNK token id and positions with a
+        non-positive weight are skipped.
+        """
+        # convert to dict
+        result = defaultdict(int)
+        unused_tokens = {self.tokenizer.cls_token_id, self.tokenizer.eos_token_id,
+                         self.tokenizer.pad_token_id, self.tokenizer.unk_token_id}
+        # token_weights = np.ceil(token_weights * 100)
+        for w, idx in zip(token_weights, input_ids):
+            if idx not in unused_tokens and w > 0:
+                token = self.tokenizer.decode([int(idx)])
+                # A token may occur several times; keep its largest weight.
+                if w > result[token]:
+                    result[token] = w
+        return result
+    @torch.no_grad()
+    def encode(self,
+               texts: List[str],
+               dimension: int = None,
+               max_length: int = 8192,
+               batch_size: int = 16,
+               return_dense: bool = True,
+               return_sparse: bool = False):
+        """Encode ``texts`` in batches of ``batch_size``.
+        Args:
+            texts: List of strings; a single string is wrapped into a list.
+            dimension: Number of leading features kept from the dense vector.
+                Defaults to the model's ``hidden_size``.
+            max_length: Truncation length handed to the tokenizer.
+            batch_size: Number of texts encoded per forward pass.
+            return_dense: Whether to compute dense vectors.
+            return_sparse: Whether to compute sparse token weights.
+        Returns:
+            A dict with ``"dense_embeddings"`` (all batches concatenated along
+            dim 0, or ``None`` when no dense vector was produced) and
+            ``"token_weights"`` (one token-to-weight mapping per text; empty
+            unless ``return_sparse`` is set).
+        """
+        if dimension is None:
+            dimension = self.model.config.hidden_size
+        if isinstance(texts, str):
+            texts = [texts]
+        all_dense_vecs = []
+        all_token_weights = []
+        for start in range(0, len(texts), batch_size):
+            batch = texts[start: start + batch_size]
+            results = self._encode(batch, dimension, max_length, batch_size, return_dense, return_sparse)
+            if return_dense:
+                all_dense_vecs.append(results['dense_embeddings'])
+            if return_sparse:
+                all_token_weights.extend(results['token_weights'])
+        # torch.cat raises on an empty list, which previously crashed sparse-only
+        # calls (return_dense=False) and calls with no texts.
+        dense_embeddings = torch.cat(all_dense_vecs, dim=0) if all_dense_vecs else None
+        return {
+            "dense_embeddings": dense_embeddings,
+            "token_weights": all_token_weights
+        }
+    @torch.no_grad()
+    def _encode(self,
+                texts: List[str] = None,
+                dimension: int = None,
+                max_length: int = 1024,
+                batch_size: int = 16,
+                return_dense: bool = True,
+                return_sparse: bool = False):
+        """Run one forward pass over ``texts`` and build the requested outputs.
+        Returns a dict that contains ``"dense_embeddings"`` when ``return_dense``
+        is set and ``"token_weights"`` when ``return_sparse`` is set.
+        ``batch_size`` is accepted but not used here.
+        """
+        text_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
+        text_input = {k: v.to(self.model.device) for k,v in text_input.items()}
+        model_out = self.model(**text_input, return_dict=True)
+        output = {}
+        if return_dense:
+            # Hidden state at position 0, truncated to the first `dimension` features.
+            dense_vecs = model_out.last_hidden_state[:, 0, :dimension]
+            if self.normalized:
+                dense_vecs = torch.nn.functional.normalize(dense_vecs, dim=-1)
+            output['dense_embeddings'] = dense_vecs
+        if return_sparse:
+            token_weights = torch.relu(model_out.logits).squeeze(-1)
+            token_weights = list(map(self._process_token_weights, token_weights.detach().cpu().numpy().tolist(),
+                                                    text_input['input_ids'].cpu().numpy().tolist()))
+            output['token_weights'] = token_weights
+        return output
+    def _compute_sparse_scores(self, embs1, embs2):
+        """Return the sum of weight products over tokens present in both mappings."""
+        scores = 0
+        for token, weight in embs1.items():
+            if token in embs2:
+                scores += weight * embs2[token]
+        return scores
+    def compute_sparse_scores(self, embs1, embs2):
+        """Return a NumPy array with the pairwise sparse score of each ``(embs1[i], embs2[i])``."""
+        scores = [self._compute_sparse_scores(emb1, emb2) for emb1, emb2 in zip(embs1, embs2)]
+        return np.array(scores)
+    def compute_dense_scores(self, embs1, embs2):
+        """Return the row-wise dot products of two dense tensors as a NumPy array."""
+        scores = torch.sum(embs1*embs2, dim=-1).cpu().detach().numpy()
+        return scores
+    @torch.no_grad()
+    def compute_scores(self,
+        text_pairs: List[Tuple[str, str]],
+        dimension: int = None,
+        max_length: int = 1024,
+        batch_size: int = 16,
+        dense_weight=1.0,
+        sparse_weight=0.1):
+        """Score each text pair as ``dense * dense_weight + sparse * sparse_weight``.
+        Returns a list with one score per pair; an empty ``text_pairs`` yields ``[]``.
+        """
+        if not text_pairs:
+            # Nothing to encode; avoids running the model pipeline on empty input.
+            return []
+        text1_list = [text_pair[0] for text_pair in text_pairs]
+        text2_list = [text_pair[1] for text_pair in text_pairs]
+        embs1 = self.encode(text1_list, dimension, max_length, batch_size, return_dense=True, return_sparse=True)
+        embs2 = self.encode(text2_list, dimension, max_length, batch_size, return_dense=True, return_sparse=True)
+        scores = self.compute_dense_scores(embs1['dense_embeddings'], embs2['dense_embeddings']) * dense_weight + \
+            self.compute_sparse_scores(embs1['token_weights'], embs2['token_weights']) * sparse_weight
+        scores = scores.tolist()
+        return scores
+if __name__ == '__main__':
+    # Smoke run: embed three sample sentences and print both representations.
+    sample_docs = [
+        "黑龙江离俄罗斯很近",
+        "哈尔滨是中国黑龙江省的省会，位于中国东北",
+        "you are the hero"
+    ]
+    encoder = GTEEmbeddidng('Alibaba-NLP/gte-multilingual-base')
+    print('docs', sample_docs)
+    encoded = encoder.encode(sample_docs, return_dense=True, return_sparse=True)
+    for caption, field in (('dense vecs', 'dense_embeddings'), ('sparse vecs', 'token_weights')):
+        print(caption, encoded[field])

models/embedding/gte-multilingual-base/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}

models/embedding/gte-multilingual-base/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

models/embedding/gte-multilingual-base/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f59925fcb90c92b894cb93e51bb9b4a6105c5c249fe54ce1c704420ac39b81af
+size 17082756

models/embedding/gte-multilingual-base/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 32768,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}

models/no_edge_gnn/gnn_classifier_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7c60400474e5da38560a2146fa9dc997a949c293816f14095120cb3e863cfeb
+size 417848

models/no_edge_gnn/gnn_graph_data.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdce3a0f4ddd0abf8293ae2ddfab06bb4ece1ac0e55d113bc8c28dbad4d77cb8
+size 30912782

models/no_edge_gnn/label_mapping.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd27285c0b3d71f944f4fa2d8096f4a8ddc49c34d8728ccb2a7a82bceccd99ff
+size 1720

models/no_edge_gnn/title_to_id.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c66e0124c82ab11fcc6db287aa5898d8e4745d1a2467b1c5f1d3779729420e3b
+size 281136

models/undirected_gnn/gnn_classifier_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a3756f5565bf83a4ed03713814c60a52fa5f94140b2e0dd493035e1ba955b13
+size 417720

models/undirected_gnn/gnn_graph_data.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f0ebba135027659290f04a40ea2f46bbf7ec219a12331ec78030b6da29633a7
+size 31942670

models/undirected_gnn/label_mapping.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd27285c0b3d71f944f4fa2d8096f4a8ddc49c34d8728ccb2a7a82bceccd99ff
+size 1720

models/undirected_gnn/title_to_id.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c66e0124c82ab11fcc6db287aa5898d8e4745d1a2467b1c5f1d3779729420e3b
+size 281136

src/__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (4.22 kB). View file

src/__pycache__/embedding.cpython-311.pyc ADDED Viewed

Binary file (2.87 kB). View file

src/__pycache__/gnn.cpython-311.pyc ADDED Viewed

Binary file (8.88 kB). View file

src/__pycache__/heuristic.cpython-311.pyc ADDED Viewed

Binary file (3.3 kB). View file

src/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (4.98 kB). View file

src/__pycache__/visualization.cpython-311.pyc ADDED Viewed

Binary file (1.32 kB). View file

src/config.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from pydantic_settings import BaseSettings
+from pathlib import Path
+# This module lives in <repo>/src/, so the repository root is one level above
+# its directory. Deriving the defaults from it replaces absolute paths that
+# only existed on one developer machine.
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
+_MODELS_DIR = _PROJECT_ROOT / "models"
+_UNDIRECTED_GNN_DIR = _MODELS_DIR / "undirected_gnn"
+class Config(BaseSettings):
+    """Application settings: artifact locations and per-topic display styling.
+    All path fields are strings pointing into the repository checkout. The GNN
+    paths default to the ``undirected_gnn`` variant; the demo derives the
+    ``no_edge_gnn`` weights path by replacing that directory name in
+    ``GNN_MODEL_PATH``, so the default must keep containing ``undirected_gnn``.
+    """
+    # Directory of the sentence-embedding checkpoint.
+    EMBEDDING_MODEL_PATH: str = str(_MODELS_DIR / "embedding" / "gte-multilingual-base")
+    # Parquet file with the training articles and their embeddings.
+    TRAINING_DATA_PATH: str = str(_PROJECT_ROOT / "input" / "train_data_with_embeddings.parquet")
+    # Trained GNN weights and the tensors/lookup tables saved next to them.
+    GNN_MODEL_PATH: str = str(_UNDIRECTED_GNN_DIR / "gnn_classifier_model.pth")
+    GNN_GRAPH_DATA_PATH: str = str(_UNDIRECTED_GNN_DIR / "gnn_graph_data.pt")
+    LABEL_MAPPING_PATH: str = str(_UNDIRECTED_GNN_DIR / "label_mapping.pt")
+    TITLE_TO_ID_PATH: str = str(_UNDIRECTED_GNN_DIR / "title_to_id.pt")
+    # Topic label -> icon name used for graph nodes.
+    ICON_MAPPING: dict[str, str] = {
+        'Africa': 'language',
+        'Americas': 'language',
+        'Architecture': 'home',
+        'Asia': 'language',
+        'Biography': 'person',
+        'Biology': 'science',
+        'Business_and_economics': 'account_balance',
+        'Chemistry': 'science',
+        'Computing': 'laptop',
+        'Earth_and_environment': 'cloud',
+        'Education': 'description',
+        'Engineering': 'factory',
+        'Entertainment': 'campaign',
+        'Europe': 'language',
+        'Fashion': 'sell',
+        'Films': 'monitor',
+        'Food_and_drink': 'store',
+        'Geographical': 'place',
+        'History': 'inventory',
+        'Internet_culture': 'alternate_email',
+        'Libraries_&_Information': 'folder',
+        'Linguistics': 'translate',
+        'Literature': 'description',
+        'Mathematics': 'analytics',
+        'Media': 'chat',
+        'Medicine_&_Health': 'medical_services',
+        'Military_and_warfare': 'flag',
+        'Music': 'campaign',
+        'Oceania': 'language',
+        'Performing_arts': 'group',
+        'Philosophy_and_religion': 'assured_workload',
+        'Physics': 'science',
+        'Politics_and_government': 'assured_workload',
+        'STEM': 'science',
+        'Society': 'group',
+        'Space': 'cloud',
+        'Sports': 'group',
+        'Technology': 'smartphone',
+        'Television': 'monitor',
+        'Transportation': 'directions_car',
+        'Video_games': 'smartphone',
+        'Visual_arts': 'description'
+    }
+    # Topic label -> hex color; topics are grouped into six color families.
+    COLOR_MAPPING: dict[str, str] = {
+        # STEM & Natural Sciences -> Emerald (#06d6a0)
+        'Biology': '#06d6a0',
+        'Chemistry': '#06d6a0',
+        'Earth_and_environment': '#06d6a0',
+        'Mathematics': '#06d6a0',
+        'Physics': '#06d6a0',
+        'STEM': '#06d6a0',
+        'Space': '#06d6a0',
+        # Geography & Places -> Ocean Blue (#118ab2)
+        'Africa': '#118ab2',
+        'Americas': '#118ab2',
+        'Asia': '#118ab2',
+        'Europe': '#118ab2',
+        'Oceania': '#118ab2',
+        'Geographical': '#118ab2',
+        # Arts, Entertainment & Culture -> Bubblegum Pink (#ef476f)
+        'Entertainment': '#ef476f',
+        'Fashion': '#ef476f',
+        'Films': '#ef476f',
+        'Music': '#ef476f',
+        'Performing_arts': '#ef476f',
+        'Television': '#ef476f',
+        'Visual_arts': '#ef476f',
+        'Literature': '#ef476f',
+        # Tech, Engineering & Infrastructure -> Dark Teal (#073b4c)
+        'Architecture': '#073b4c',
+        'Computing': '#073b4c',
+        'Engineering': '#073b4c',
+        'Internet_culture': '#073b4c',
+        'Technology': '#073b4c',
+        'Transportation': '#073b4c',
+        'Video_games': '#073b4c',
+        # Society, Humanities & Lifestyle -> Coral Glow (#f78c6b)
+        'Biography': '#f78c6b',
+        'Food_and_drink': '#f78c6b',
+        'Linguistics': '#f78c6b',
+        'Media': '#f78c6b',
+        'Medicine_&_Health': '#f78c6b',
+        'Society': '#f78c6b',
+        'Sports': '#f78c6b',
+        # Institutions, History & Governance -> Royal Gold (#ffd166)
+        'Business_and_economics': '#ffd166',
+        'Education': '#ffd166',
+        'History': '#ffd166',
+        'Libraries_&_Information': '#ffd166',
+        'Military_and_warfare': '#ffd166',
+        'Philosophy_and_religion': '#ffd166',
+        'Politics_and_government': '#ffd166',
+    }
+# Shared settings instance imported by the other modules.
+config = Config()

src/demo.py ADDED Viewed

	@@ -0,0 +1,265 @@

+import time
+import networkx as nx
+import numpy as np
+import pandas as pd
+import streamlit as st
+from src.config import config
+from src.embedding import Embedder
+from src.utils import (create_graph_from_df, gather_neighbors,
+                       get_unique_article_titles)
+from src.heuristic import predict_topic_nth_degree
+from src.gnn import GNNClassifier, load_data, infer_new_node
+from st_link_analysis import EdgeStyle, NodeStyle, st_link_analysis
+from src.visualization import get_edge_styles, get_node_styles
+import torch
+st.set_page_config(
+    page_title="Semantic Article Graph", layout="wide", initial_sidebar_state="expanded"
+)
+def _load_gnn(version, weights_path):
+    """Load graph data, lookup tables and a trained GNNClassifier for one model variant."""
+    graph_data, title_to_id, label_mapping = load_data(version=version)
+    model = GNNClassifier(
+        input_dim=768,
+        hidden_dim=128,
+        layers=2,
+        output_dim=len(label_mapping),
+        dropout_rate=0.5,
+    )
+    # map_location="cpu": this app runs inference on torch.device("cpu"), and a
+    # checkpoint saved from GPU tensors cannot be deserialized on a CPU-only
+    # host without remapping.
+    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
+    # NOTE(review): dropout_rate is 0.5; confirm infer_new_node puts the model
+    # into eval mode before predicting.
+    return model, graph_data, title_to_id, label_mapping
+# One-time setup per session: everything expensive is stored in
+# st.session_state so later script reruns skip this branch.
+if "setup_complete" not in st.session_state:
+    loader = st.empty()
+    with loader.container():
+        st.subheader("🚀 Starting...")
+        with st.status("Loading...", expanded=True) as status:
+            st.write("Initializing Embedding Model...")
+            st.session_state.embedder = Embedder(path=config.EMBEDDING_MODEL_PATH)
+            st.write("Initializing GNN Model (Undirected)...")
+            (
+                st.session_state.undirected_gnn_model,
+                st.session_state.undirected_graph_data,
+                st.session_state.undirected_title_to_id,
+                st.session_state.undirected_label_mapping,
+            ) = _load_gnn("undirected", config.GNN_MODEL_PATH)
+            st.write("Initializing GNN Model (No Edges)...")
+            # The no-edge weights sit in a sibling directory of the configured
+            # undirected weights; only the directory name differs.
+            (
+                st.session_state.no_edge_gnn_model,
+                st.session_state.no_edge_graph_data,
+                st.session_state.no_edge_title_to_id,
+                st.session_state.no_edge_label_mapping,
+            ) = _load_gnn(
+                "no_edge",
+                config.GNN_MODEL_PATH.replace("undirected_gnn", "no_edge_gnn"),
+            )
+            st.write("Reading training data...")
+            training_data = pd.read_parquet(config.TRAINING_DATA_PATH)
+            # SECURITY NOTE(review): eval() executes whatever expression is stored
+            # in the "embedding" column. Acceptable only while the parquet file is
+            # a trusted, repo-shipped artifact; if the values are plain list
+            # literals, ast.literal_eval would be the safe replacement - confirm
+            # the stored format before switching.
+            training_data["embedding"] = training_data["embedding"].apply(lambda x: eval(x))
+            st.session_state.training_data = training_data
+            st.write("Creating graph for visualization...")
+            st.session_state.directed_graph = create_graph_from_df(training_data, directed=True)
+            st.session_state.undirected_graph = create_graph_from_df(training_data, directed=False)
+            status.update(label="Done!", state="complete", expanded=False)
+        time.sleep(0.5)
+    loader.empty()
+    st.session_state.setup_complete = True
+# node_styles = [
+#     NodeStyle("PERSON", "#FF7F3E", "name", "person"),
+#     NodeStyle("POST", "#2A629A", "content", "description"),
+# ]
+# edge_styles = [
+#     EdgeStyle("FOLLOWS", caption="label", directed=True),
+#     EdgeStyle("POSTED", caption="label", directed=True),
+#     EdgeStyle("QUOTES", caption="label", directed=True),
+# ]
+node_styles = get_node_styles()
+edge_styles = get_edge_styles()
+# Titles offered in the "References" selector; computed once per session.
+if "existing_nodes" not in st.session_state:
+    st.session_state.existing_nodes = get_unique_article_titles(st.session_state.training_data)
+# Class names, in the key order of the icon mapping.
+CLASSES = list(config.ICON_MAPPING.keys())
+def get_dummy_probabilities():
+    """Return the ten highest entries of a random Dirichlet score assignment over CLASSES.
+    The result is a DataFrame with the columns "Class" and "Score", ordered by
+    descending "Score".
+    """
+    class_count = len(CLASSES)
+    random_scores = np.random.dirichlet(np.ones(class_count), size=1)[0]
+    table = pd.DataFrame({"Class": CLASSES, "Score": random_scores})
+    ranked = table.sort_values(by="Score", ascending=False)
+    return ranked.head(10)
+# Page body: an input form in the first column and the graph plus
+# classification results in the second column (width ratio 1:2).
+st.title("📄 Semantic Article Graph")
+st.markdown("---")
+col_input, col_vis = st.columns([1, 2], gap="large")
+with col_input:
+    # Section 1: description of the node to classify.
+    st.subheader("1. New Node Details")
+    new_title = st.text_input("Node Title", placeholder="e.g., Istanbul")
+    new_content = st.text_area(
+        "Content", height=150, placeholder="Paste content here..."
+    )
+    references = st.multiselect(
+        "References (Select existing nodes)",
+        options=st.session_state.existing_nodes,
+        help="Search and select multiple papers this node cites.",
+    )
+    st.markdown("---")
+    # Section 2: choice of classifier and its options.
+    st.subheader("2. Methodology Configuration")
+    method = st.selectbox(
+        "Select Classification Method",
+        ["GNN (Graph Neural Network)", "Rule-Based"],
+    )
+    # Defaults used when the selected method does not show the matching control.
+    model_params = {}
+    is_directed = False
+    max_depth = 2
+    if method == "GNN (Graph Neural Network)":
+        # use_edges is only defined in this branch; it is only read in the GNN
+        # branch of the inference code below.
+        use_edges = st.checkbox("Use Graph Edges", value=True)
+    elif method == "Rule-Based":
+        max_depth = st.slider("Max Depth", 1, 3, 1)
+        is_weighted = st.checkbox("Apply Weights", value=True)
+        is_directed = st.checkbox("Use Directed Graph", value=False)
+        # Forwarded as keyword arguments to predict_topic_nth_degree.
+        model_params = {"max_depth": max_depth, "is_weighted": is_weighted}
+    else:
+        st.warning("Please select a valid method.")
+    st.markdown("---")
+    run_inference = st.button(
+        "Add Node & Run Inference", type="primary", width="stretch"
+    )
+with col_vis:
+    if run_inference:
+        # A title is required; content and references may be left empty.
+        if not new_title:
+            st.error("Please enter a title for the node.")
+        else:
+            st.subheader(f"🌐 Graph Neighborhood (k-hop)")
+            with st.spinner("Updating Graph Topology..."):
+                # NOTE(review): fixed one-second pause; it looks cosmetic -
+                # confirm it is needed.
+                time.sleep(1)
+                graph_container = st.container(border=True)
+                with graph_container:
+                    # The directed graph is used only when the Rule-Based
+                    # "Use Directed Graph" box is ticked.
+                    graph = (
+                        st.session_state.directed_graph
+                        if is_directed
+                        else st.session_state.undirected_graph
+                    )
+                    # With the GNN method max_depth keeps its default of 2.
+                    elements = gather_neighbors(
+                        graph, new_title, references, depth=max_depth
+                    )
+                    st_link_analysis(elements, "cose", node_styles, edge_styles)
+                    st.caption(
+                        f"Visualizing neighbors for: **{new_title}** with {len(references)} connections."
+                    )
+            st.markdown("---")
+            st.subheader("📊 Classification Results")
+            with st.spinner(f"Running {method}..."):
+                # NOTE(review): fixed 1.5-second pause; it looks cosmetic -
+                # confirm it is needed.
+                time.sleep(1.5)
+                embedding = st.session_state.embedder.generate_embedding(new_content)
+                if method == "GNN (Graph Neural Network)":
+                    # Pick the artifacts of the variant matching the
+                    # "Use Graph Edges" checkbox.
+                    base_data = st.session_state.undirected_graph_data if use_edges else st.session_state.no_edge_graph_data
+                    title_to_id = st.session_state.undirected_title_to_id if use_edges else st.session_state.no_edge_title_to_id
+                    label_mapping = st.session_state.undirected_label_mapping if use_edges else st.session_state.no_edge_label_mapping
+                    model = st.session_state.undirected_gnn_model if use_edges else st.session_state.no_edge_gnn_model
+                    df_results = infer_new_node(
+                        base_data=base_data,
+                        model=model,
+                        new_embedding=embedding,
+                        referenced_titles=references,
+                        title_to_id=title_to_id,
+                        label_mapping=label_mapping,
+                        device=torch.device("cpu"),
+                        # is_directed is always False on this path, so this is True.
+                        make_undirected_for_new_node=not is_directed,
+                        use_edges=use_edges,
+                    )
+                elif method == "Rule-Based":
+                    graph = (
+                        st.session_state.directed_graph
+                        if is_directed
+                        else st.session_state.undirected_graph
+                    )
+                    df_results = predict_topic_nth_degree(
+                        new_article_title=new_title,
+                        new_article_embedding=embedding,
+                        edges=references,
+                        G=graph,
+                        decay_factor=1.0,
+                        **model_params,
+                    )
+                else:
+                    st.error("Invalid method selected.")
+                    st.stop()
+                # Assumes df_results has "Class" and "Score" columns and is sorted
+                # by descending Score - TODO confirm against infer_new_node and
+                # predict_topic_nth_degree.
+                top_class = df_results.iloc[0]
+                st.success(
+                    f"**Predicted Class:** {top_class['Class']} ({top_class['Score']:.2%})"
+                )
+                st.dataframe(
+                    df_results,
+                    column_config={
+                        "Class": "Class Name",
+                        "Score": st.column_config.ProgressColumn(
+                            "Confidence",
+                            help="The model's confidence score",
+                            format="%.2f",
+                            min_value=0,
+                            max_value=1,
+                        ),
+                    },
+                    hide_index=True,
+                    width="stretch",
+                )
+    else:
+        # Placeholder shown until the button is pressed.
+        st.info(
+            "👈 Enter node details on the left and click 'Add' to see the graph and predictions."
+        )
+        st.markdown(
+            """
+            <div style="height: 600px; border: 2px dashed #ccc; border-radius: 10px;
+            display: flex; align-items: center; justify-content: center; color: #ccc;">
+                Waiting for input...
+            </div>
+            """,
+            unsafe_allow_html=True,
+        )

src/embedding.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import time
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+from transformers import AutoModel, AutoTokenizer
+class Embedder:
+    """Load a Hugging Face encoder and turn text into L2-normalised embeddings.
+    The tokenizer and model are read from ``path``; the model is moved to CUDA
+    when it is available and stays on the CPU otherwise.
+    """
+    def __init__(self, path):
+        """Load tokenizer and model.
+        Args:
+            path: Value handed to ``from_pretrained`` for both the tokenizer
+                and the model.
+        """
+        self.model_name_or_path = path
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+        # trust_remote_code allows the checkpoint to ship its own model code.
+        self.model = AutoModel.from_pretrained(
+            self.model_name_or_path, trust_remote_code=True
+        )
+        self.model.to(self.device)
+    def generate_embedding(self, text, dimension=768):
+        """Embed ``text`` and return the result as a NumPy array.
+        Args:
+            text: Input passed straight to the tokenizer.
+            dimension: Number of leading feature components to keep from the
+                hidden state before normalising. Defaults to 768.
+        Returns:
+            The normalised embedding(s) after ``squeeze()``; size-1 axes are
+            dropped, so a single text yields a 1-D vector.
+        """
+        inputs = self.tokenizer(
+            text, max_length=8192, padding=True, truncation=True, return_tensors="pt"
+        )
+        inputs = {key: value.to(self.device) for key, value in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        # Keep the first token's hidden state of every sequence and truncate the
+        # FEATURE axis. The previous `[:, 0][:dimension]` sliced the batch axis,
+        # which silently dropped rows once more than `dimension` texts were given.
+        embeddings = outputs.last_hidden_state[:, 0, :dimension]
+        normalized_embeddings = F.normalize(embeddings, p=2, dim=1)
+        return normalized_embeddings.squeeze().cpu().numpy()

src/gnn.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import torch
+import torch.nn.functional as F
+from torch_geometric.nn import GCNConv
+from torch_geometric.data import Data
+import pandas as pd
+from src.config import config
+class GNNClassifier(torch.nn.Module):
+    """Two- or three-layer GCN that maps node features to per-class logits.
+    Args:
+        input_dim: Size of each node feature vector.
+        hidden_dim: Width of the hidden GCN layer(s).
+        layers: Number of GCN layers; only 2 and 3 are supported.
+        output_dim: Number of classes (size of the logit vector per node).
+        dropout_rate: Dropout probability applied after each hidden layer.
+    Raises:
+        ValueError: If ``layers`` is neither 2 nor 3.
+    """
+    def __init__(self, input_dim, hidden_dim, layers, output_dim, dropout_rate=0.5):
+        super().__init__()
+        # Fail at construction time: previously an unsupported depth produced a
+        # model without any conv layers that only crashed inside forward().
+        if layers not in (2, 3):
+            raise ValueError(f"layers must be 2 or 3, got {layers!r}")
+        self.dropout_rate = dropout_rate
+        self.hidden_dim = hidden_dim
+        self.layers = layers
+        self.output_dim = output_dim
+        # Attribute names conv1/conv2/conv3 are kept so saved state dicts still load.
+        self.conv1 = GCNConv(input_dim, hidden_dim)
+        if layers == 2:
+            self.conv2 = GCNConv(hidden_dim, output_dim)
+        else:
+            self.conv2 = GCNConv(hidden_dim, hidden_dim)
+            self.conv3 = GCNConv(hidden_dim, output_dim)
+    def forward(self, data):
+        """Return raw logits for every node in ``data`` (reads ``x`` and ``edge_index``)."""
+        x, edge_index = data.x, data.edge_index
+        # Layer 1
+        x = self.conv1(x, edge_index)
+        x = F.relu(x)
+        # Dropout is only active in training mode.
+        x = F.dropout(x, p=self.dropout_rate, training=self.training)
+        # Layer 2 (output layer for the 2-layer variant)
+        x = self.conv2(x, edge_index)
+        if self.layers == 3:
+            x = F.relu(x)
+            x = F.dropout(x, p=self.dropout_rate, training=self.training)
+            x = self.conv3(x, edge_index)
+        return x
+def load_data(version: str = "undirected"):
+    """Load the serialized graph, title-to-id map and label map for one GNN variant.
+    Args:
+        version: ``"undirected"`` reads the paths from ``config`` as they are;
+            ``"no_edge"`` reads the same paths with ``undirected_gnn`` replaced
+            by ``no_edge_gnn``.
+    Returns:
+        Tuple ``(graph_data, title_to_id, label_mapping)``.
+    Raises:
+        ValueError: If ``version`` is not one of the two supported names.
+    """
+    if version not in ("undirected", "no_edge"):
+        raise ValueError(f"Unknown version: {version}")
+    def _resolve(path):
+        # Only the "no_edge" variant lives in a different model directory.
+        if version == "no_edge":
+            return path.replace("undirected_gnn", "no_edge_gnn")
+        return path
+    graph_data = torch.load(_resolve(config.GNN_GRAPH_DATA_PATH))
+    title_to_id = torch.load(_resolve(config.TITLE_TO_ID_PATH))
+    label_mapping = torch.load(_resolve(config.LABEL_MAPPING_PATH))
+    return graph_data, title_to_id, label_mapping
+def infer_new_node(
+    base_data: Data,
+    model: torch.nn.Module,
+    new_embedding,                 # shape (768,) list/np array/torch
+    referenced_titles: list[str],  # titles the user selected
+    title_to_id: dict[str, int],
+    label_mapping: dict[str, int],
+    device: torch.device,
+    make_undirected_for_new_node: bool = True,
+    use_edges: bool = True,
+):
+    """Classify one unseen node by attaching it to the existing graph.
+    The new feature row is appended after the rows of ``base_data.x`` and,
+    when ``use_edges`` is true, connected to every referenced title found in
+    ``title_to_id``. The model is then run on the augmented graph and the
+    class probabilities of the new node are returned.
+    Args:
+        base_data: Graph providing ``x`` and ``edge_index``.
+        model: Classifier returning one row of logits per node; it is put in
+            eval mode and moved to ``device``.
+        new_embedding: Feature vector of the new node (list, NumPy array or tensor).
+        referenced_titles: Titles the new node links to; unknown titles are skipped.
+        title_to_id: Maps a title to its row index in ``base_data.x``.
+        label_mapping: Maps a class name to its logit index.
+        device: Device used for inference.
+        make_undirected_for_new_node: Also add the reverse edge new -> referenced.
+        use_edges: When false the new node is left without any edge.
+    Returns:
+        DataFrame with columns ``Class`` and ``Score`` (probability), sorted by
+        ``Score`` in descending order.
+    """
+    model.eval()
+    model = model.to(device)
+    base_data = base_data.to(device)
+    # --- 1) Append the new node's feature row ---
+    x_old = base_data.x
+    # as_tensor handles lists, arrays and tensors alike; torch.tensor() warns
+    # when it is handed an existing tensor.
+    new_x = torch.as_tensor(new_embedding, dtype=x_old.dtype, device=device).reshape(1, -1)
+    x = torch.cat([x_old, new_x], dim=0)
+    new_id = x.size(0) - 1
+    # --- 2) Attach the node to the referenced titles ---
+    edge_index = base_data.edge_index
+    if use_edges:
+        src_list = []
+        tgt_list = []
+        for title in referenced_titles:
+            old_id = title_to_id.get(title)
+            if old_id is None:
+                continue
+            # old -> new lets the referenced node's features flow into the new node.
+            src_list.append(old_id)
+            tgt_list.append(new_id)
+            if make_undirected_for_new_node:
+                src_list.append(new_id)
+                tgt_list.append(old_id)
+        # With no usable reference the node stays isolated and the prediction
+        # rests on its own features only.
+        if src_list:
+            new_edges = torch.tensor([src_list, tgt_list], dtype=torch.long, device=device)
+            edge_index = torch.cat([edge_index, new_edges], dim=1)
+    # --- 3) Run inference on the augmented graph ---
+    data_aug = Data(x=x, edge_index=edge_index).to(device)
+    with torch.no_grad():
+        logits = model(data_aug)
+        # Only the new node's row is needed; softmax over its class logits.
+        probs = F.softmax(logits[new_id], dim=0).cpu()
+    inv_label_mapping = {v: k for k, v in label_mapping.items()}
+    result_df = pd.DataFrame(
+        [(inv_label_mapping[i], prob.item()) for i, prob in enumerate(probs)],
+        columns=["Class", "Score"],
+    ).sort_values(by="Score", ascending=False)
+    return result_df
+if __name__ == "__main__":
+    # Manual smoke test: classify one hand-written article with the GNN.
+    # NOTE(review): the checkpoint and embedder paths below are absolute paths
+    # on one Windows machine and must be adjusted before running elsewhere.
+    from src.embedding import Embedder
+    # load_data() with no argument loads the "undirected" variant.
+    graph_data, title_to_id, label_mapping = load_data()
+    model = GNNClassifier(input_dim=768, hidden_dim=128, layers=2, output_dim=len(label_mapping), dropout_rate=0.5)
+    model.load_state_dict(torch.load(r"C:\Users\pc\Desktop\Projects\Masters\data_mining\semantic_knowledge_graph\demo\models\gnn\gnn_classifier_model.pth"))
+    new_node_content = "Istanbul Türkiye'nin en büyük şehri ve kültürel başkentidir. Tarih boyunca birçok medeniyete ev sahipliği yapmıştır."
+    embedder = Embedder(path=r"C:\Users\pc\Desktop\Projects\Masters\data_mining\semantic_knowledge_graph\demo\models\embedding\gte-multilingual-base")
+    new_embedding = embedder.generate_embedding(new_node_content)
+    # Titles that are missing from title_to_id are skipped by infer_new_node.
+    referenced_titles = ["forum istanbul", "istanbul film festivali", "akıllı şehir"]
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # use_edges is not passed, so its default (True) applies.
+    result = infer_new_node(
+        base_data=graph_data,
+        model=model,
+        new_embedding=new_embedding,
+        referenced_titles=referenced_titles,
+        title_to_id=title_to_id,
+        label_mapping=label_mapping,
+        device=device,
+        make_undirected_for_new_node=True,
+    )
+    print("Prediction Results for New Node:")
+    print(result)

src/heuristic.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+from typing import List, Any, Optional
+from collections import defaultdict, deque
+def predict_topic_nth_degree(
+    new_article_title: str,
+    new_article_embedding: List[float],
+    edges: List[str],
+    G: Any,
+    max_depth: int = 1,
+    is_weighted: bool = False,
+    decay_factor: float = 1.0,
+) -> Optional[pd.DataFrame]:
+    """Score topics for a new article by voting over its graph neighbourhood.
+    A breadth-first search starts at the referenced articles (depth 1) and
+    walks ``G`` up to ``max_depth`` hops. Every visited node with a truthy
+    ``label`` attribute adds a vote to that label.
+    Args:
+        new_article_title: Title of the new article; it is marked as visited so
+            the search never counts it.
+        new_article_embedding: Embedding of the new article; only read when
+            ``is_weighted`` is true.
+        edges: Titles referenced by the new article. Titles that are not in
+            ``G`` are ignored.
+        G: Graph supporting ``in``, ``G.nodes[name]`` and ``G.neighbors(name)``.
+        max_depth: How many hops to traverse (1 = direct neighbors, 2 = neighbors of neighbors).
+        is_weighted: If true, a vote is the cosine similarity between the new
+            embedding and the node's ``embedding`` attribute; otherwise 1.0.
+        decay_factor: Multiplier for distance. 1.0 = no decay.
+                      0.5 means a neighbor at depth 2 has half the voting power of depth 1.
+    Returns:
+        DataFrame with columns ``Class`` and ``Score`` sorted by ``Score`` in
+        descending order, or ``None`` when no referenced title is in ``G`` or
+        no visited node carries a label.
+    """
+    # 1. Seed the BFS with the "virtual" first hop: the new article is not in
+    # G, so its references are enqueued by hand at depth 1.
+    visited = {new_article_title}
+    queue = deque()
+    for ref in edges:
+        if ref in G and ref not in visited:
+            visited.add(ref)
+            queue.append((ref, 1))
+    if not queue:
+        return None
+    topic_scores = defaultdict(float)
+    # 2. Process BFS
+    while queue:
+        current_node, current_depth = queue.popleft()
+        node_data = G.nodes[current_node]
+        topic = node_data.get("label")
+        if topic:
+            if is_weighted:
+                base_score = cosine_similarity(
+                    [new_article_embedding], [node_data["embedding"]]
+                )[0][0]
+            else:
+                base_score = 1.0
+            # Distance decay: depth 1 keeps the full score, depth 2 is scaled
+            # by decay_factor, depth 3 by decay_factor ** 2, and so on.
+            topic_scores[topic] += base_score * (decay_factor ** (current_depth - 1))
+        # Expand to the next level only while within the depth limit.
+        if current_depth < max_depth:
+            for neighbor in G.neighbors(current_node):
+                if neighbor not in visited:
+                    visited.add(neighbor)
+                    queue.append((neighbor, current_depth + 1))
+    # 3. Rank the topics
+    if not topic_scores:
+        return None
+    result_df = pd.DataFrame(
+        list(topic_scores.items()), columns=["Class", "Score"]
+    ).sort_values(by="Score", ascending=False)
+    return result_df

src/streamlit_app.py DELETED Viewed

@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

src/utils.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import networkx as nx
+import pandas as pd
+def get_unique_article_titles(df: pd.DataFrame) -> list[str]:
+    """Return the distinct ``article_title_processed`` values in ascending order."""
+    titles = df["article_title_processed"].unique().tolist()
+    titles.sort()
+    return titles
+def create_graph_from_df(df, directed: bool = False) -> nx.Graph:
+    """Build the article graph from a dataframe of articles.
+    Every row becomes a node keyed by ``article_title_processed`` with the
+    attributes ``label`` (from ``predicted_topic``) and ``embedding``. Each
+    title listed in ``links_processed`` that is itself a node, and is not the
+    row's own title, becomes an edge article -> reference.
+    Args:
+        df: Frame with the columns ``article_title_processed``,
+            ``predicted_topic``, ``embedding`` and ``links_processed``.
+        directed: Build an ``nx.DiGraph`` so edge direction is kept; otherwise
+            an undirected ``nx.Graph`` is built.
+    Returns:
+        The populated graph (``nx.DiGraph`` is a subclass of ``nx.Graph``).
+    """
+    # The flag used to be ignored: an undirected nx.Graph was always created,
+    # so "directed" graphs silently behaved like undirected ones.
+    G = nx.DiGraph() if directed else nx.Graph()
+    # First pass: register every node so the second pass can test membership.
+    for _, row in df.iterrows():
+        G.add_node(
+            row["article_title_processed"],
+            label=row["predicted_topic"],
+            embedding=row["embedding"],
+        )
+    for _, row in df.iterrows():
+        node_title = row["article_title_processed"]
+        # NOTE(security): eval() executes whatever text is stored in this
+        # column. Only use it on trusted data; if the column always holds a
+        # plain list literal, ast.literal_eval is the safe replacement.
+        references = eval(row["links_processed"])
+        for ref in references:
+            if ref in G and ref != node_title:
+                # An undirected graph needs no explicit reverse edge.
+                G.add_edge(node_title, ref)
+    return G
+def gather_neighbors(
+    graph: nx.DiGraph, node_title: str, references: list[str], depth: int = 1
+):
+    """Return the visualizer payload for a not-yet-stored node and its surroundings.
+    The input graph is left untouched: a copy receives ``node_title`` plus an
+    edge to every reference that already exists in the graph, and the
+    neighbourhood of ``node_title`` within ``depth`` hops is read from that copy.
+    """
+    augmented = graph.copy()
+    augmented.add_node(node_title)
+    for ref in references:
+        # Skip self-references and titles that are not in the graph.
+        if ref == node_title or ref not in augmented:
+            continue
+        augmented.add_edge(node_title, ref)
+    return get_neighbors_for_visualizer(augmented, node_title, depth=depth)
+def get_neighbors_for_visualizer(graph: nx.Graph, start_node, depth=1):
+    """Describe the neighbourhood of ``start_node`` as ``nodes``/``edges`` lists.
+    Args:
+        graph (nx.Graph): The source NetworkX graph.
+        start_node: The title/ID of the node to start from.
+        depth (int): Radius, in hops, of the ego graph taken around ``start_node``.
+    Returns:
+        dict: ``{"nodes": [...], "edges": [...]}``. Each entry is
+        ``{"data": {...}}``; nodes carry ``id``, ``label`` and ``name``, edges
+        carry ``id``, ``label``, ``source`` and ``target``. Both lists are
+        empty when ``start_node`` is not in ``graph``.
+    """
+    if start_node not in graph:
+        return {"nodes": [], "edges": []}
+    ego = nx.ego_graph(graph, start_node, radius=depth)
+    # Titles are replaced by 1-based integer ids in iteration order.
+    id_of = {node: idx for idx, node in enumerate(ego.nodes(), start=1)}
+    nodes_data = [
+        {
+            "data": {
+                "id": idx,
+                # Nodes without a label attribute are shown as "Unknown".
+                "label": ego.nodes[node].get("label", "Unknown"),
+                "name": str(node),
+            }
+        }
+        for node, idx in id_of.items()
+    ]
+    # Edge ids continue after the last node id so the two never collide.
+    first_edge_id = len(id_of) + 1
+    edges_data = [
+        {
+            "data": {
+                "id": edge_id,
+                # Edges without a label attribute default to "CITES".
+                "label": ego.edges[u, v].get("label", "CITES"),
+                "source": id_of[u],
+                "target": id_of[v],
+            }
+        }
+        for edge_id, (u, v) in enumerate(ego.edges(), start=first_edge_id)
+    ]
+    return {"nodes": nodes_data, "edges": edges_data}
+if __name__ == "__main__":
+    # Manual smoke test: build the graph from the training parquet and print
+    # the 2-hop neighbourhood of a made-up article.
+    # NOTE(review): the parquet path is an absolute path on one Windows machine
+    # and must be adjusted before running elsewhere.
+    data = pd.read_parquet(
+        r"C:\Users\pc\Desktop\Projects\Masters\data_mining\semantic_knowledge_graph\demo\input\train_data_with_embeddings.parquet"
+    )
+    # directed is left at its default (False).
+    graph = create_graph_from_df(data)
+    test_title = "Sample Article Title"
+    # References that are not nodes of the graph are ignored by gather_neighbors.
+    test_references = ["finansal matematik", "genel yapay zekâ", "andrej karpathy"]
+    neighbors = gather_neighbors(graph, test_title, test_references, depth=2)
+    # print(f"References for '{test_title}': {test_references}")
+    print(f"Neighbors of '{test_title}': {neighbors}")

src/visualization.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from st_link_analysis import EdgeStyle, NodeStyle
+from src.config import config
+def get_node_styles() -> list[NodeStyle]:
+    """Build one ``NodeStyle`` per class listed in ``config.ICON_MAPPING``.
+    Classes without an entry in ``config.COLOR_MAPPING`` fall back to gray
+    (``#888888``).
+    """
+    return [
+        NodeStyle(
+            label=class_name,
+            color=config.COLOR_MAPPING.get(class_name, "#888888"),
+            icon=icon,
+        )
+        for class_name, icon in config.ICON_MAPPING.items()
+    ]
+def get_edge_styles() -> list[EdgeStyle]:
+    """Return the edge styles for the graph view: a single style labelled ``CITES``."""
+    cites_style = EdgeStyle(label="CITES")
+    return [cites_style]