import pandas as pd
from typing import List, Dict, Tuple

class Vocabulary:
    def __init__(
        self, 
        subset_name: str
    ) -> None:

        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"
        
        # Pandas dataset
        self.df_vocab = None

        # Minimal list of (percentile, freq) tuples used to plot the word distribution graph
        self.histogram = None

        # Load vocabulary dataset
        self.__load()

    def __contains__(
        self, 
        word: str
    ) -> bool:

        return word in self.df_vocab['word'].to_list()

    def __load(
        self
    ) -> None:

        print(f"Preparing {self.subset_name} vocabulary...")

        # --- Load vocab dataset ---
        self.df_vocab = pd.read_json(self.ds_path)

        # --- Create min histogram to plot the word distribution graph ---
        x_values = self.df_vocab['percentile'].to_list()
        y_values = self.df_vocab['freq'].to_list()

        # Drop duplicate (percentile, freq) tuples
        unique_tups = set(zip(x_values, y_values))
        # Keep a single freq per percentile (dict keys are unique)
        unique_tups = dict(unique_tups)

        self.histogram = sorted(
            unique_tups.items(),
            key=lambda tup: tup[0],
            reverse=True
        )
        
    def __getValue(
        self, 
        word: str, 
        feature: str
    ):
        word_id, value = None, None

        if word in self:
            word_id = self.df_vocab['word'].to_list().index(word)
        
        if word_id is not None:
            value = self.df_vocab[feature].to_list()[word_id]

        return value

    def getFreq(
        self,
        word: str
    ) -> int:

        return self.__getValue(word, 'freq')

    def getPercentile(
        self,
        word: str
    ) -> float:

        return self.__getValue(word, 'percentile')

    def getSplits(
        self, 
        word: str
    ) -> List[str]:

        return self.__getValue(word, 'splits')
    
    def getSubsets(
        self, 
        word: str
    ) -> Dict[str, int]:

        return self.__getValue(word, 'in_subset')

    def distribution(
        self
    ) -> Tuple:

        x_values, y_values = zip(*self.histogram)
        return x_values, y_values
     
    def getWordNeighbors(
        self, 
        word: str, 
        n_neighbors: int = 20
    ) -> Tuple:

        word_id = self.df_vocab['word'].to_list().index(word)
        words = self.df_vocab['word'].to_list()
        freqs = self.df_vocab['freq'].to_list()
        l_sorted = list(zip(words, freqs))

        g = l_sorted[max(0, word_id - n_neighbors):word_id]     # neighbors before the word
        e = l_sorted[word_id]                                    # the word itself
        l = l_sorted[word_id + 1:word_id + 1 + n_neighbors]      # neighbors after the word

        dic = dict(g+[e]+l)
        l = [x[0] for x in l]
        g = [x[0] for x in g]

        return dic, l, g
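

if __name__ == "__main__":
    # Minimal usage sketch: assumes a data/train_vocab_v6.zip file exists and
    # that "train" is a valid subset name; both are illustrative assumptions,
    # as is the looked-up word below.
    vocab = Vocabulary("train")

    word = "example"
    if word in vocab:
        print(f"freq:       {vocab.getFreq(word)}")
        print(f"percentile: {vocab.getPercentile(word)}")
        print(f"splits:     {vocab.getSplits(word)}")

    # (percentile, freq) points for plotting the word distribution graph
    percentiles, freqs = vocab.distribution()
    print(f"{len(percentiles)} unique (percentile, freq) points")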