This repository provides hotchpotch/static-embedding-japanese converted to ONNX and repackaged for immediate use with transformers.js.
- Converted to ONNX, including the special-token exclusion step, so that inference equivalent to sentence-transformers' `StaticEmbedding` class can be run easily with transformers.js.
- Because the `Nmt` normalizer is not supported by transformers.js as of v3.4.2, `Nmt` was removed from the `tokenizer.json` of hotchpotch/xlm-roberta-japanese-tokenizer, and a `config.json` was added.
## Usage
```js
import { AutoModel, AutoTokenizer, matmul } from '@huggingface/transformers';

const model = await AutoModel.from_pretrained(
  "cfsdwe/static-embedding-japanese-for-js",
  { dtype: "q8" } // or "fp32"
);
const tokenizer = await AutoTokenizer.from_pretrained('cfsdwe/static-embedding-japanese-for-js');

const sentences = [
  "カレーはおいしい。",
  "カレースープはそこそこ美味しい。",
  "トマトジュースは好みが分かれる。",
];

// Manually apply processing equivalent to the `Nmt` normalizer
// before passing the text to the tokenizer.
function myNmtNormalizer(text) {
  let normalizedText = text;
  // Strip control characters.
  const controlCharsRegex =
    /[\u{1}-\u{8}\u{B}\u{E}-\u{1F}\u{7F}\u{8F}\u{9F}]/gu;
  normalizedText = normalizedText.replace(controlCharsRegex, "");
  // Map assorted whitespace and format characters to a plain space.
  const mapToSpaceRegex =
    /[\u{0009}\u{000A}\u{000C}\u{000D}\u{1680}\u{200B}-\u{200F}\u{2028}\u{2029}\u{2581}\u{FEFF}\u{FFFD}]/gu;
  normalizedText = normalizedText.replace(mapToSpaceRegex, " ");
  return normalizedText;
}

const sentences_normalized = sentences.map((s) => myNmtNormalizer(s));
const inputs = tokenizer(sentences_normalized, { padding: true, truncation: true });

const { sentence_embedding } = await model(inputs);
const normalized = sentence_embedding.normalize();
const scores = await matmul(normalized, normalized.transpose(1, 0));
console.log(scores.tolist());
// [
//   [1.0000001192092896, 0.7581136226654053, 0.25487640500068665],
//   [0.7581136226654053, 0.9999997019767761, 0.24671493470668793],
//   [0.25487640500068665, 0.24671493470668793, 0.9999983906745911],
// ]
```
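Because the embeddings are L2-normalized before the matmul, each entry of `scores` is a cosine similarity. A minimal sketch (not part of the model card's API; the helper name and the rounded example matrix are illustrative) of turning that matrix into a nearest-neighbor lookup per sentence:

```javascript
// Given a square similarity matrix (e.g. the nested array from
// scores.tolist()), return, for each row, the index of the most
// similar *other* sentence (the diagonal self-similarity is skipped).
function mostSimilar(matrix) {
  return matrix.map((row, i) => {
    let best = -1;
    let bestScore = -Infinity;
    row.forEach((score, j) => {
      if (j !== i && score > bestScore) {
        bestScore = score;
        best = j;
      }
    });
    return best;
  });
}

// Rounded version of the matrix printed above:
const scores = [
  [1.0, 0.758, 0.255],
  [0.758, 1.0, 0.247],
  [0.255, 0.247, 1.0],
];
console.log(mostSimilar(scores)); // [ 1, 0, 0 ]
```

As expected, the two curry sentences pick each other as nearest neighbors, while the tomato-juice sentence is closest to the first curry sentence only by a small margin.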