import pandas as pd
from typing import List, Dict, Tuple

class Vocabulary:
    def __init__(
        self, 
        subset_name: str
    ) -> None:

        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"
        
        # Pandas dataset
        self.df_vocab = None

        # Minimal list of (percentile, freq) tuples used to plot the word distribution graph
        self.histogram = None

        # Load vocabulary dataset
        self.__load()

    def __contains__(
        self, 
        word: str
    ) -> bool:

        return word in self.df_vocab['word'].to_list()

    def __load(
        self
    ) -> None:

        print(f"Preparing {self.subset_name} vocabulary...")

        # --- Load vocab dataset ---
        self.df_vocab = pd.read_json(self.ds_path)

        # --- Create min histogram to plot the word distribution graph ---
        x_values = self.df_vocab['percentile'].to_list()
        y_values = self.df_vocab['freq'].to_list()

        # Drop duplicate (percentile, freq) tuples
        unique_tups = set(zip(x_values, y_values))
        # Keep a single freq per percentile (dict keys are unique)
        unique_tups = dict(unique_tups)

        self.histogram = sorted(
            unique_tups.items(),
            key=lambda tup: tup[0],
            reverse=True
        )
        
    def __getValue(
        self, 
        word: str, 
        feature: str
    ):
        word_id, value = None, None

        if word in self:
            word_id = self.df_vocab['word'].to_list().index(word)
        
        if word_id is not None:
            value = self.df_vocab[feature].to_list()[word_id]

        return value

    def getFreq(
        self,
        word: str
    ) -> int:

        return self.__getValue(word, 'freq')

    def getPercentile(
        self,
        word: str
    ) -> float:

        return self.__getValue(word, 'percentile')

    def getSplits(
        self, 
        word: str
    ) -> List[str]:

        return self.__getValue(word, 'splits')
    
    def getSubsets(
        self, 
        word: str
    ) -> Dict[str, int]:

        return self.__getValue(word, 'in_subset')

    def distribution(
        self
    ) -> Tuple:

        x_values, y_values = zip(*self.histogram)
        return x_values, y_values
     
    def getWordNeighbors(
        self, 
        word: str, 
        n_neighbors: int = 20
    ) -> Tuple:

        word_id = self.df_vocab['word'].to_list().index(word)
        words = self.df_vocab['word'].to_list()
        freqs = self.df_vocab['freq'].to_list()
        l_sorted = list(zip(words, freqs))

        g = l_sorted[max(0, word_id - n_neighbors):word_id]     # neighbors before the word
        e = l_sorted[word_id]                                    # the word itself
        l = l_sorted[word_id + 1:word_id + 1 + n_neighbors]      # neighbors after the word

        dic = dict(g+[e]+l)
        l = [x[0] for x in l]
        g = [x[0] for x in g]

        return dic, l, g
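

if __name__ == "__main__":
    # Minimal usage sketch: assumes a data/train_vocab_v6.zip file exists and
    # that "train" is a valid subset name; both are illustrative assumptions,
    # as is the looked-up word below.
    vocab = Vocabulary("train")

    word = "example"
    if word in vocab:
        print(f"freq:       {vocab.getFreq(word)}")
        print(f"percentile: {vocab.getPercentile(word)}")
        print(f"splits:     {vocab.getSplits(word)}")

    # (percentile, freq) points for plotting the word distribution graph
    percentiles, freqs = vocab.distribution()
    print(f"{len(percentiles)} unique (percentile, freq) points")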