Update README.md
Browse files
README.md
CHANGED
@@ -47,8 +47,27 @@ sequences = [
|
|
47 |
"GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
|
48 |
]
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
# Process the sequences
|
51 |
-
sequences = [tokenizer.bos_token + sequence for sequence in
|
52 |
|
53 |
# Tokenize the sequences
|
54 |
tokenizer.padding_side = "left"
|
|
|
47 |
"GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
|
48 |
]
|
49 |
|
50 |
+
def left_padding(sequence: str, padding_char: str = 'A', multiple: int = 6) -> str:
    """Left-pad ``sequence`` so its length becomes a multiple of ``multiple``.

    Args:
        sequence: The string to pad.
        padding_char: Filler placed in front of the sequence. When it is
            longer than one character it is repeated and cut to exactly the
            number of characters required, so the length guarantee holds.
            An empty filler leaves the sequence unpadded.
        multiple: The returned length is rounded up to a multiple of this.

    Returns:
        ``sequence`` itself when its length is already a multiple of
        ``multiple``; otherwise the filler followed by ``sequence``.
    """
    # -n % m is the distance from n up to the next multiple of m (0 when
    # n is already aligned), which replaces the remainder/subtract dance.
    padding_length = -len(sequence) % multiple
    if padding_length == 0:
        return sequence
    # Slice the repeated filler so a multi-character padding_char cannot
    # overshoot the requested length.
    filler = (padding_char * padding_length)[:padding_length]
    return filler + sequence
|
56 |
+
|
57 |
+
def left_truncation(sequence: str, multiple: int = 6) -> str:
    """Drop leading characters so the length becomes a multiple of ``multiple``.

    Args:
        sequence: The string to truncate.
        multiple: The returned length is rounded down to a multiple of this.

    Returns:
        ``sequence`` itself when its length is already a multiple of
        ``multiple``; otherwise ``sequence`` without its first
        ``len(sequence) % multiple`` characters. A sequence shorter than
        ``multiple`` is therefore reduced to the empty string.
    """
    remainder = len(sequence) % multiple
    # Only slice when there is an excess; aligned input is returned as-is.
    return sequence[remainder:] if remainder else sequence
|
62 |
+
|
63 |
+
# Alternative (disabled): left-pad every sequence up to a multiple of 6
# padded_sequences = [left_padding(seq) for seq in sequences]

# Left-truncate every sequence down to a multiple of 6 (the default `multiple`)
truncated_sequences = [left_truncation(seq) for seq in sequences]

# Process the sequences: prepend the tokenizer's BOS token to each one
sequences = [tokenizer.bos_token + sequence for sequence in truncated_sequences]
|
71 |
|
72 |
# Tokenize the sequences
|
73 |
tokenizer.padding_side = "left"
|