Spaces:
Build error
Build error
Commit
·
1c56d55
1
Parent(s):
a3679d8
Upload folder using huggingface_hub
Browse files- .ipynb_checkpoints/app-checkpoint.py +159 -0
- .ipynb_checkpoints/util-checkpoint.py +69 -0
- README.md +3 -9
- app.py +159 -0
- corpus/1.txt +73 -0
- util.py +69 -0
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load our data
import os

def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load all the ``.txt`` files in a directory into one large string.

    Args:
        corpus_directory: Directory to scan (not recursive).
        file_names: Optional collection of file names; when given, only those
            files are read (previously this parameter was silently ignored).

    Returns:
        The concatenated file contents, with a newline appended per file.
    """
    corpus = ""

    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # Only read text files; endswith avoids matching e.g. "notes.txt.bak".
        if not file_name.endswith(".txt"):
            continue

        # Honor an explicit file whitelist when one is supplied.
        if file_names is not None and file_name not in file_names:
            continue

        # Explicit encoding so behavior does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus
# REPLACE WITH YOUR CORPUS DIRECTORY
corpus = load_raw_text(corpus_directory="./corpus")

import re
import util

# NOTE(review): util.strip_accents is never applied here; the token regex
# below whitelists ï/ë/ñ instead — confirm that is intended.

# Normalize case before tokenizing.
corpus = corpus.lower()

# Tokens are runs of lowercase letters plus the Karijona diacritics.
word_regex = r"[a-zïëñ]+"
def tokenize(text: str):
    return re.findall(word_regex, text)

s_tok = tokenize(corpus)

# All unique words in the corpus.
lexicon = set(s_tok)

# Keep only words of a playable length (3-7 letters) for the game.
filtered_lexicon = {word for word in lexicon if 3 <= len(word) <= 7}
import random

def random_scramble(lexicon: set):
    """Pick a random word from the lexicon and return it scrambled.

    Returns a dict with keys 'shuffled' (the scrambled word) and
    'original' (the word that was picked).
    """
    word = random.choice(list(lexicon))

    # Shuffle the word's characters and glue them back into a string.
    letters = list(word)
    random.shuffle(letters)
    return {'shuffled': ''.join(letters), 'original': word}
70 |
+
import gradio as gr
|
71 |
+
from typing import Tuple
|
72 |
+
|
73 |
+
def create_hangman_clue(word, guessed_letters):
    """
    Given a word and the set of guessed letters, build the clue string.

    For instance, if the word is 'apple' and the guessed letters are 'a' and
    'l', the clue is 'a _ _ l _'.
    """
    # join() puts spaces only between cells, so there is no trailing space —
    # the original appended "letter + ' '" and contradicted its own docstring.
    return ' '.join(letter if letter in guessed_letters else '_'
                    for letter in word)
86 |
+
|
87 |
+
|
88 |
+
def pick_new_word(lexicon):
    """Start a fresh game state: a random word, no guesses, six chances."""
    candidates = list(lexicon)
    new_state = {
        'word': random.choice(candidates),
        'guessed_letters': set(),
        'remaining_chances': 6,
    }
    return new_state
96 |
+
|
97 |
+
|
98 |
+
def hangman_game(current_state, guess):
    """Update the current state based on the guess.

    Returns a (state, message) tuple; on win/lose a fresh state is drawn
    from the module-level filtered_lexicon.
    """
    # A guess must be a single, not-yet-tried letter. len(guess) != 1 also
    # rejects the empty string, which previously slipped through ('' is a
    # substring of every word, so it was scored as a correct guess).
    if len(guess) != 1 or guess in current_state['guessed_letters']:
        # Illegal guess, do nothing
        return (current_state, 'Invalid guess')

    current_state['guessed_letters'].add(guess)

    if guess not in current_state['word']:
        # Wrong guess
        current_state['remaining_chances'] -= 1

        if current_state['remaining_chances'] == 0:
            # No more chances! New word
            return (pick_new_word(filtered_lexicon), 'You lose!')
        return (current_state, 'Wrong guess :(')

    # Right guess: if any letter is still hidden, keep playing.
    for letter in current_state['word']:
        if letter not in current_state['guessed_letters']:
            return (current_state, 'Correct guess!')

    # Every letter is revealed: start a new round.
    return (pick_new_word(filtered_lexicon), 'You win!')
129 |
+
|
130 |
+
|
131 |
+
def state_changed(current_state):
    """Project the game state into the three UI fields (clue, guesses, chances)."""
    return (
        create_hangman_clue(current_state['word'], current_state['guessed_letters']),
        current_state['guessed_letters'],
        current_state['remaining_chances'],
    )
136 |
+
|
137 |
+
|
138 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="karijona Hangman") as hangman:
    # Per-session game state (word, guessed letters, remaining chances).
    current_word = gr.State(pick_new_word(filtered_lexicon))

    gr.Markdown("# karijona Hangman")

    with gr.Row():
        current_word_textbox = gr.Textbox(
            label="Clue",
            interactive=False,
            value=create_hangman_clue(current_word.value['word'],
                                      current_word.value['guessed_letters']))
        guessed_letters_textbox = gr.Textbox(label="Guessed letters", interactive=False)
        remaining_chances_textbox = gr.Textbox(label="Remaining chances", interactive=False, value=6)

    guess_textbox = gr.Textbox(label="Guess")
    guess_button = gr.Button(value="Submit")

    output_textbox = gr.Textbox(label="Result", interactive=False)

    # First update the state from the guess, then refresh every widget
    # from the (possibly brand-new) state.
    guess_button.click(fn=hangman_game,
                       inputs=[current_word, guess_textbox],
                       outputs=[current_word, output_textbox])\
        .then(fn=state_changed,
              inputs=[current_word],
              outputs=[current_word_textbox, guessed_letters_textbox, remaining_chances_textbox])

hangman.launch()
.ipynb_checkpoints/util-checkpoint.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import re
import unicodedata

def strip_accents(text: str) -> str:
    """Remove accents from text (NFD-decompose, then drop combining marks)."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed
                   if unicodedata.category(ch) != 'Mn')
10 |
+
|
11 |
+
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load all the ``.txt`` files in a directory into one large string.

    Args:
        corpus_directory: Directory to scan (not recursive).
        file_names: Optional collection of file names; when given, only those
            files are read (previously this parameter was silently ignored).

    Returns:
        The concatenated file contents, with a newline appended per file.
    """
    corpus = ""

    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # Only read text files; endswith avoids matching e.g. "notes.txt.bak".
        if not file_name.endswith(".txt"):
            continue

        # Honor an explicit file whitelist when one is supplied.
        if file_names is not None and file_name not in file_names:
            continue

        # Explicit encoding so behavior does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus
29 |
+
|
30 |
+
|
31 |
+
# NOTE: inside a character class '|' is a literal pipe, not alternation, so
# the original r"[\w|\']+" wrongly kept '|' inside tokens (e.g. "a|b").
word_regex = r"[\w']+"
def tokenize(text):
    """Split text into tokens of word characters and apostrophes."""
    return re.findall(word_regex, text)
34 |
+
|
35 |
+
|
36 |
+
def preprocess(text):
    """Tokenizes and processes text which is already separated by spaces into words. Designed for English punctuation."""
    lowered = strip_accents(text).lower()

    kept = []
    for candidate in lowered.split(" "):
        # Keep tokens that look like words or punctuation.
        # NOTE(review): re.match only anchors at the start, so a token merely
        # *starting* with a word character passes this filter — confirm intent.
        if re.match(r"[\w|\']+|[\.|\,|\?|\!]", candidate):
            kept.append(candidate)
    return kept
49 |
+
|
50 |
+
|
51 |
+
def pad(text: list, num_padding: int):
    """Pads the given text, as a list of strings, with <s> markers between sentences."""
    start_markers = ["<s>"] * num_padding

    # Initial padding before the first sentence.
    padded_text = list(start_markers)

    for token in text:
        padded_text.append(token)
        # After each sentence-final punctuation mark, pad again so the next
        # sentence also starts with num_padding <s> tokens.
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if token in [".", "?", "!"]:
            padded_text.extend(start_markers)

    return padded_text
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji: ⚡
|
4 |
-
colorFrom: red
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.35.2
|
8 |
app_file: app.py
|
9 |
-
|
|
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Karijona_Hangman
|
|
|
|
|
|
|
|
|
|
|
3 |
app_file: app.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 3.34.0
|
6 |
---
|
|
|
|
app.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load our data
import os

def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load all the ``.txt`` files in a directory into one large string.

    file_names, when given, whitelists which files are read (it was
    previously ignored).
    """
    corpus = ""
    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue
        # endswith avoids matching names like "notes.txt.bak".
        if not file_name.endswith(".txt"):
            continue
        if file_names is not None and file_name not in file_names:
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus

# REPLACE WITH YOUR CORPUS DIRECTORY
corpus = load_raw_text(corpus_directory="./corpus")

import re
import util

# NOTE(review): util.strip_accents is never applied; the token regex below
# whitelists ï/ë/ñ instead — confirm that is intended.

corpus = corpus.lower()

# Tokens are runs of lowercase letters plus the Karijona diacritics.
word_regex = r"[a-zïëñ]+"
def tokenize(text: str):
    return re.findall(word_regex, text)

s_tok = tokenize(corpus)

# All unique words in the corpus.
lexicon = set(s_tok)

# Keep only words of a playable length (3-7 letters).
filtered_lexicon = {word for word in lexicon if 3 <= len(word) <= 7}

import random

def random_scramble(lexicon: set):
    """Pick a random word and return {'shuffled': ..., 'original': ...}."""
    word = random.choice(list(lexicon))
    letters = list(word)
    random.shuffle(letters)
    return {'shuffled': ''.join(letters), 'original': word}

import gradio as gr
from typing import Tuple

def create_hangman_clue(word, guessed_letters):
    """
    Given a word and the guessed letters, build the clue string.

    For instance, if the word is 'apple' and the guessed letters are 'a' and
    'l', the clue is 'a _ _ l _'.
    """
    # join() leaves no trailing space, matching the documented format.
    return ' '.join(letter if letter in guessed_letters else '_'
                    for letter in word)

def pick_new_word(lexicon):
    """Start a fresh game state: random word, no guesses, six chances."""
    return {
        'word': random.choice(list(lexicon)),
        'guessed_letters': set(),
        'remaining_chances': 6,
    }

def hangman_game(current_state, guess):
    """Update the current state based on the guess; return (state, message)."""
    # A guess must be a single, not-yet-tried letter. len(guess) != 1 also
    # rejects the empty string, which previously scored as a correct guess
    # because '' is a substring of every word.
    if len(guess) != 1 or guess in current_state['guessed_letters']:
        return (current_state, 'Invalid guess')

    current_state['guessed_letters'].add(guess)

    if guess not in current_state['word']:
        # Wrong guess
        current_state['remaining_chances'] -= 1
        if current_state['remaining_chances'] == 0:
            # No more chances! New word
            return (pick_new_word(filtered_lexicon), 'You lose!')
        return (current_state, 'Wrong guess :(')

    # Right guess: if any letter is still hidden, keep playing.
    for letter in current_state['word']:
        if letter not in current_state['guessed_letters']:
            return (current_state, 'Correct guess!')

    # Every letter revealed: start a new round.
    return (pick_new_word(filtered_lexicon), 'You win!')

def state_changed(current_state):
    """Project the game state into the three UI fields."""
    clue = create_hangman_clue(current_state['word'], current_state['guessed_letters'])
    return (clue, current_state['guessed_letters'], current_state['remaining_chances'])

with gr.Blocks(theme=gr.themes.Soft(), title="karijona Hangman") as hangman:
    current_word = gr.State(pick_new_word(filtered_lexicon))

    gr.Markdown("# karijona Hangman")

    with gr.Row():
        current_word_textbox = gr.Textbox(
            label="Clue", interactive=False,
            value=create_hangman_clue(current_word.value['word'],
                                      current_word.value['guessed_letters']))
        guessed_letters_textbox = gr.Textbox(label="Guessed letters", interactive=False)
        remaining_chances_textbox = gr.Textbox(label="Remaining chances", interactive=False, value=6)

    guess_textbox = gr.Textbox(label="Guess")
    guess_button = gr.Button(value="Submit")

    output_textbox = gr.Textbox(label="Result", interactive=False)

    # Update the state from the guess, then refresh all widgets from it.
    guess_button.click(fn=hangman_game,
                       inputs=[current_word, guess_textbox],
                       outputs=[current_word, output_textbox])\
        .then(fn=state_changed,
              inputs=[current_word],
              outputs=[current_word_textbox, guessed_letters_textbox, remaining_chances_textbox])

hangman.launch()
corpus/1.txt
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Karijona ekarï
|
2 |
+
Karijona ekarï notonaga ëkëimë mono kërïtawënobë. Karijona echiwaketï nïjïnëjïjëbë notonaga karijona ekarï. Irajo jiyachi, kuyugo dïjïrë mëkë karijona ekarïko
|
3 |
+
esetï karama, jiyanakoto, yakauyana, werewereru, majotoyana, kaikusana, machijuyana, saja saja, namororeke nai ërarërë nesejoyanë marë. Irebë nïwowanë toto.
|
4 |
+
— Ëteke manai?
|
5 |
+
— Kure wae (ganë)?
|
6 |
+
— Ëtï mïjëkae?
|
7 |
+
— Wui tïrïyae tujitëjo
|
8 |
+
— Wïtëeja.
|
9 |
+
dëmë nesejoyanë marë, marë, marë. Irëbë niwowanë toto:
|
10 |
+
— Ëteke manatëi?
|
11 |
+
— Aña kuregïñake nai.
|
12 |
+
Ëwï yesetï nai . Yiye esetï nai Ëwï yumu esetï nai . Yakëmijë esetï gërëja nai Yinotï esetï nai Tamu esetï nai . Tïmugëake wae. Mësa mëitïto.
|
13 |
+
Karijona eremirï etase manai?
|
14 |
+
2. Ëjutujë eurukuse manai?
|
15 |
+
3. Yeremirï tae ëtëkëne tïyajoro. 4. Tïyajoro ejaragae echinemae. 5. Tïyajoro etunutëkë.
|
16 |
+
6. Nekë nechinemanë mëitïto.
|
17 |
+
Aime toto nai tïyajoro.
|
18 |
+
26. Itu tawëdoko aime nai.
|
19 |
+
27. Tetunutë ake.
|
20 |
+
28. Tejaragae tïyajoro ekayakarï.
|
21 |
+
29. Akorodoko ejaragasegërë nai.
|
22 |
+
Mësa mëitïto nai.
|
23 |
+
|
24 |
+
Ësanobë meyae?
|
25 |
+
Ësanobë wërichi neyae? Ësanobë mure neyanë? Ësanobë gïrï neyanë?
|
26 |
+
Aña tujitë tërënobë neyae. Ësanobë añamoro meyatëi? Ësanobë mëkamoro neyanë?
|
27 |
+
yitudae
|
28 |
+
mutudae
|
29 |
+
nutudae
|
30 |
+
nutudae
|
31 |
+
mutudatëi
|
32 |
+
nutudanë
|
33 |
+
Ëti jëkë manai? Ëti jëkë nërë nai? Ëti jëkë manatëi?
|
34 |
+
ëwï
|
35 |
+
ëmërë
|
36 |
+
nërë
|
37 |
+
aña
|
38 |
+
añamoro
|
39 |
+
mëkamoro
|
40 |
+
itawarï
|
41 |
+
tujitë
|
42 |
+
mïnë
|
43 |
+
Sëkënërë atakëmicha dëmë terejarïko wae: teñi jaru jëmïrï, serawërë nureimë, kënëkërëne jëmëi. Mëkë iwajotorï dëmë ikucha wae. Nërë nejï. Ëñaotoene ikucha: sëkënërë jëne, serawërë kunañi. Nai gërëja oworï iyatënë.
|
44 |
+
Teñi Sëkënërë Serawërë Kënëtëkërëne Ëñatoene
|
45 |
+
Ëwï ajereme teñi kaikuchi yeku nai. Ëwï ajereme sëkënërë nureimë nai. Ëwï ajereme serawërë kunañi nai. Ëwï ajereme kënëtëkërëne jëmëi nai. Ëwï ajereme eñatoene jaru
|
46 |
+
Ëteke manatëi? Manatëi reke?
|
47 |
+
Kure
|
48 |
+
Kure dïjïrë
|
49 |
+
Kure aña nai
|
50 |
+
Uwareke
|
51 |
+
Uwa
|
52 |
+
Mëjënae?
|
53 |
+
Mëjënuyae
|
54 |
+
Ikucha mïsakae?
|
55 |
+
Meremiruyae?
|
56 |
+
Wëiko
|
57 |
+
Jiji
|
58 |
+
Enuko Enu
|
59 |
+
Jiji
|
60 |
+
Wërï
|
61 |
+
Muchu
|
62 |
+
Tamu
|
63 |
+
Wëiko Weikorï
|
64 |
+
Jiji
|
65 |
+
Akëmijë
|
66 |
+
Yakëmijë
|
67 |
+
Iyarijarï
|
68 |
+
Yarijarï
|
69 |
+
Wajotorï
|
70 |
+
Yiwajotorï
|
71 |
+
Echirï Yechirï
|
72 |
+
Muguru
|
73 |
+
|
util.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import re
import unicodedata

def strip_accents(text: str) -> str:
    """Removes accents from text (NFD-decompose, drop combining marks)."""
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')


def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the ``.txt`` files in a directory into one large string.

    file_names, when given, whitelists which files are read (it was
    previously ignored).
    """
    corpus = ""
    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue
        # endswith avoids matching names like "notes.txt.bak".
        if not file_name.endswith(".txt"):
            continue
        if file_names is not None and file_name not in file_names:
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus


# NOTE: inside a character class '|' is a literal pipe, not alternation, so
# the original r"[\w|\']+" wrongly kept '|' inside tokens (e.g. "a|b").
word_regex = r"[\w']+"
def tokenize(text):
    """Split text into tokens of word characters and apostrophes."""
    return re.findall(word_regex, text)


def preprocess(text):
    """Tokenizes and processes text which is already separated by spaces into words. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()

    tokens = text.split(" ")

    tokens_filtered = []
    for token in tokens:
        # Keep tokens that start with a word character/apostrophe or with
        # punctuation (re.match anchors only at the start of the token).
        if re.match(r"[\w']+|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered


def pad(text: list, num_padding: int):
    """Pads the given text, as a list of strings, with <s> markers between sentences."""
    padded_text = ["<s>"] * num_padding

    for word in text:
        padded_text.append(word)
        # After every sentence-final punctuation mark, pad again.
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if word in [".", "?", "!"]:
            padded_text.extend(["<s>"] * num_padding)

    return padded_text
|