#!/bin/bash
#
# Slurm batch job: train a SentencePiece tokenizer model with spm_train on a
# pre-shuffled text corpus. Submit with `sbatch <this file>`.
# The job requests one node, one task and 100 CPUs for that task; stdout and
# stderr go to "<job-name>-<job-id>.log" in the submission directory.
#
#SBATCH --job-name=google_sp
#SBATCH --nodes=1
#SBATCH --cpus-per-task=100
#SBATCH --ntasks-per-node=1
#SBATCH -o %x-%j.log

# -e: stop on the first failing command   -u: unset variables are errors
# -x: trace every command into the log    pipefail: a pipeline fails if any stage fails
set -euxo pipefail

echo "START TIME: $(date)"

# Locally installed SentencePiece: trainer binary plus its shared libraries.
SP_ROOT=/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local
BIN_PATH=${SP_ROOT}/bin/spm_train
# Append the library directory. The ${VAR:+...} form avoids a leading ':' when
# LD_LIBRARY_PATH is unset/empty (an empty entry means "current directory" to
# the dynamic loader) and keeps the expansion safe under `set -u`.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${SP_ROOT}/lib"

# Training corpora.
CORPUS_DIR=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece
INPUT_FILE=${CORPUS_DIR}/shuffle_corpus_59132213.txt
# Not referenced anywhere below; presumably a smaller sample for quick trial
# runs (swap it into --input to use it) - confirm before relying on it.
# shellcheck disable=SC2034
INPUT_FILE_SMALL=${CORPUS_DIR}/shuffle_corpus_1000000.txt

# Training hyper-parameters; each one is passed to the spm_train flag named in
# the trailing comment.
VOCAB_SIZE=40000            # --vocab_size
COV=0.9995                  # --character_coverage
MAX_LENGTH=6                # --max_sentencepiece_length
TYPE=bpe                    # --model_type
SEED=42                     # --random_seed
MAX_SENTENCE_LENGTH=100000  # --max_sentence_length
# NOTE(review): despite its name this value is passed to --input_sentence_size
# (how many sentences the trainer samples), not to a length limit. Confirm that
# 100000 is the intended sample size for this corpus.
MAX_INPUT_LENGTH=100000     # --input_sentence_size

# Follow the CPU count Slurm actually granted so the thread count stays in step
# with --cpus-per-task; fall back to 100 when run outside Slurm.
NUM_THREADS=${SLURM_CPUS_PER_TASK:-100}

# Output files are written as <prefix>.model / <prefix>.vocab in the working
# directory; the prefix encodes the main hyper-parameters.
MODEL_PREFIX=${TYPE}_v${VOCAB_SIZE}_s${SEED}_cov${COV}_max${MAX_LENGTH}

# Fail early with a clear message instead of a late, cryptic trainer error.
if [[ ! -x "${BIN_PATH}" ]]; then
  echo "spm_train not found or not executable: ${BIN_PATH}" >&2
  exit 1
fi
if [[ ! -r "${INPUT_FILE}" ]]; then
  echo "input corpus not readable: ${INPUT_FILE}" >&2
  exit 1
fi

# Build the argument list as an array so every option reaches spm_train as
# exactly one word, with no eval and no re-parsing of a flattened string.
SPM_ARGS=(
  --input="${INPUT_FILE}"
  --vocab_size="${VOCAB_SIZE}"
  --character_coverage="${COV}"
  --max_sentencepiece_length="${MAX_LENGTH}"
  --model_type="${TYPE}"
  --model_prefix="${MODEL_PREFIX}"
  --random_seed="${SEED}"
  --max_sentence_length="${MAX_SENTENCE_LENGTH}"
  --shuffle_input_sentence=true
  --input_sentence_size="${MAX_INPUT_LENGTH}"
  --minloglevel 1
  --num_threads="${NUM_THREADS}"
  --train_extremely_large_corpus=true
)

"${BIN_PATH}" "${SPM_ARGS[@]}"