GenerTeam committed on
Commit
d80aeda
·
verified ·
1 Parent(s): 5615914

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +20 -1
README.md CHANGED
@@ -47,8 +47,27 @@ sequences = [
47
  "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
48
  ]
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # Process the sequences
51
- sequences = [tokenizer.bos_token + sequence for sequence in sequences]
52
 
53
  # Tokenize the sequences
54
  tokenizer.padding_side = "left"
 
47
  "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
48
  ]
49
 
50
+ def left_padding(sequence, padding_char='A', multiple=6):
51
+ remainder = len(sequence) % multiple
52
+ if remainder != 0:
53
+ padding_length = multiple - remainder
54
+ return padding_char * padding_length + sequence
55
+ return sequence
56
+
57
+ def left_truncation(sequence, multiple=6):
58
+ remainder = len(sequence) % multiple
59
+ if remainder != 0:
60
+ return sequence[remainder:]
61
+ return sequence
62
+
63
+ # Apply left_padding to all sequences
64
+ # padded_sequences = [left_padding(seq) for seq in sequences]
65
+
66
+ # Apply left_truncation to all sequences
67
+ truncated_sequences = [left_truncation(seq) for seq in sequences]
68
+
69
  # Process the sequences
70
+ sequences = [tokenizer.bos_token + sequence for sequence in truncated_sequences]
71
 
72
  # Tokenize the sequences
73
  tokenizer.padding_side = "left"