feat: add docstring to EndpointHandler.__call__ ; when multiple inputs are sent, the output now also contains a token_list k/v pair for easier human inspection

Browse files

Files changed (3) hide show

embed_two_chunks.sh +1 -1
handler.py +31 -10
test_endpoint.py +11 -4

embed_two_chunks.sh CHANGED Viewed

@@ -5,5 +5,5 @@ curl \
   --request POST \
   --url http://localhost:4999 \
   --header 'Content-Type: application/json' \
-  --data '{"inputs": ["Please embed me", "And me too, please!"]}' \
   -w "\n"

   --request POST \
   --url http://localhost:4999 \
   --header 'Content-Type: application/json' \
+  --data '{"inputs": ["Please embed me", "En en en mij ook, alsjeblieft !!!"]}' \
   -w "\n"

handler.py CHANGED Viewed

@@ -7,6 +7,7 @@ import logging
 logger = logging.getLogger(__name__)
 MODEL = "fdurant/colbert-xm-for-inference-api"
 class EndpointHandler():
@@ -18,11 +19,25 @@ class EndpointHandler():
             nbits=2, # The number of bits that each dimension encodes to.
             kmeans_niters=4, # Number of iterations for k-means clustering during quantization.
             nranks=-1, # Number of ranks (processors) to use for distributed computing; -1 uses all available CPUs/GPUs.
-            checkpoint=MODEL,
         )
         self._checkpoint = Checkpoint(self._config.checkpoint, colbert_config=self._config, verbose=3)
     def __call__(self, data: Any) -> List[Dict[str, Any]]:
         inputs = data["inputs"]
         texts = []
         if isinstance(inputs, str):
@@ -35,31 +50,37 @@ class EndpointHandler():
             if len(texts) == 1:
                 # It's a query
-                logger.info(f"Query: {texts}")
                 embedding = self._checkpoint.queryFromText(
                     queries=texts,
                     full_length_search=False,  # Indicates whether to encode the query for a full-length search.
                 )
-                logger.info(f"Query embedding shape: {embedding.shape}")
                 return [
                     {"input": inputs, "query_embedding": embedding.tolist()[0]}
                 ]
             elif len(texts) > 1:
                 # It's a batch of chunks
                 logger.info(f"Batch of chunks: {texts}")
-                embeddings, token_counts = self._checkpoint.docFromText(
                     docs=texts,
                     bsize=self._config.bsize, # Batch size
                     keep_dims=True, # Do NOT flatten the embeddings
                     return_tokens=True, # Return the tokens as well
                 )
-                for text, embedding, token_count in zip(texts, embeddings, token_counts):
-                    logger.info(f"Chunk: {text}")
-                    logger.info(f"Chunk embedding shape: {embedding.shape}")
-                    logger.info(f"Chunk count: {token_count}")
                 return [
-                    {"input": _input, "chunk_embedding": embedding.tolist(), "token_count": token_count.tolist()}
-                    for _input, embedding, token_count in zip(texts, embeddings, token_counts)
                 ]
             else:
                 raise ValueError("No data to process")

 logger = logging.getLogger(__name__)
+# Hardcoded, I know
 MODEL = "fdurant/colbert-xm-for-inference-api"
 class EndpointHandler():
             nbits=2, # The number of bits that each dimension encodes to.
             kmeans_niters=4, # Number of iterations for k-means clustering during quantization.
             nranks=-1, # Number of ranks (processors) to use for distributed computing; -1 uses all available CPUs/GPUs.
+            checkpoint=MODEL, # Path to the model checkpoint.
         )
         self._checkpoint = Checkpoint(self._config.checkpoint, colbert_config=self._config, verbose=3)
     def __call__(self, data: Any) -> List[Dict[str, Any]]:
+        """
+            data args:
+                inputs (:obj: `str` or :obj: `list` of `str`) : A single query string, or a batch (= list) of chunk strings.
+            Return:
+                A :obj:`list` : will be serialized and returned.
+                When the input is a single query string, the returned list will contain a single dictionary with:
+                    - input (:obj: `str`) : The input query.
+                    - query_embedding (:obj: `list`) : The query embedding as a nested list of shape (32, 128), i.e. the first (and only) element of the (1, 32, 128) tensor.
+                When the input is a batch (= list) of chunk strings, the returned list will contain a dictionary for each chunk:
+                    - input (:obj: `str`) : The input chunk.
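+                    - chunk_embedding (:obj: `list`) : The chunk embedding as a nested list of shape (num_tokens, 128), one row per token; num_tokens includes special tokens and padding.
+                    - token_ids (:obj: `list`) : The token ids of the chunk, one per row of chunk_embedding.
+                    - token_list (:obj: `list`) : The token strings corresponding to token_ids, for easier human inspection.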
+        """
         inputs = data["inputs"]
         texts = []
         if isinstance(inputs, str):
             if len(texts) == 1:
                 # It's a query
+                logger.debug(f"Query: {texts}")
                 embedding = self._checkpoint.queryFromText(
                     queries=texts,
                     full_length_search=False,  # Indicates whether to encode the query for a full-length search.
                 )
+                logger.debug(f"Query embedding shape: {embedding.shape}")
                 return [
                     {"input": inputs, "query_embedding": embedding.tolist()[0]}
                 ]
             elif len(texts) > 1:
                 # It's a batch of chunks
                 logger.info(f"Batch of chunks: {texts}")
+                embeddings, token_id_lists = self._checkpoint.docFromText(
                     docs=texts,
                     bsize=self._config.bsize, # Batch size
                     keep_dims=True, # Do NOT flatten the embeddings
                     return_tokens=True, # Return the tokens as well
                 )
+                token_lists = []
+                for text, embedding, token_ids in zip(texts, embeddings, token_id_lists):
+                    logger.debug(f"Chunk: {text}")
+                    logger.debug(f"Chunk embedding shape: {embedding.shape}")
+                    logger.debug(f"Chunk token ids: {token_ids}")
+                    token_list = self._checkpoint.doc_tokenizer.tok.convert_ids_to_tokens(token_ids)
+                    token_lists.append(token_list)
+                    logger.debug(f"Chunk tokens: {token_list}")
+#                    reconstructed_text = self._checkpoint.doc_tokenizer.tok.decode(token_ids)
+#                    logger.debug(f"Reconstructed text with special tokens: {reconstructed_text}")
                 return [
+                    {"input": _input, "chunk_embedding": embedding.tolist(), "token_ids": token_ids.tolist(), "token_list": token_list}
+                    for _input, embedding, token_ids, token_list in zip(texts, embeddings, token_id_lists, token_lists)
                 ]
             else:
                 raise ValueError("No data to process")

test_endpoint.py CHANGED Viewed

@@ -40,7 +40,8 @@ def test_query_returns_expected_result():
 def test_batch_returns_expected_result():
     chunks = ["try me", "try me again and again and again"]
-    expected_token_counts = [11, 11]  # Including start and stop tokens, I presume. Not exactly clear!
     payload = {"inputs": chunks}
     response = requests.request("POST", URL, json=payload, headers=HEADERS)
@@ -56,12 +57,18 @@ def test_batch_returns_expected_result():
         # Check chunk embedding (actually a list of embeddings, one per token in the chunk)
         chunk_embedding = response_chunk.get("chunk_embedding")
-        token_count = response_chunk.get("token_count")
         assert isinstance(chunk_embedding, list)
-        assert len(chunk_embedding) == len(token_count)
-        assert len(token_count) == expected_token_counts[i]
         # Check first of the token embeddings
         first_token_embedding = chunk_embedding[0]
         assert len(first_token_embedding) == 128
         assert all(isinstance(value, float) for value in first_token_embedding)

 def test_batch_returns_expected_result():
     chunks = ["try me", "try me again and again and again"]
+    length_of_longest_chunk = 11  # Including special tokens and padding
+    doc_maxlen=512
     payload = {"inputs": chunks}
     response = requests.request("POST", URL, json=payload, headers=HEADERS)
         # Check chunk embedding (actually a list of embeddings, one per token in the chunk)
         chunk_embedding = response_chunk.get("chunk_embedding")
+        token_ids = response_chunk.get("token_ids")
         assert isinstance(chunk_embedding, list)
+        assert len(chunk_embedding) == len(token_ids)
+        assert len(token_ids) == length_of_longest_chunk
+        assert len(token_ids) <= doc_maxlen
         # Check first of the token embeddings
         first_token_embedding = chunk_embedding[0]
         assert len(first_token_embedding) == 128
         assert all(isinstance(value, float) for value in first_token_embedding)
+        # Check token list
+        token_list = response_chunk.get("token_list")
+        assert len(token_ids) == len(token_list)
+        assert all(isinstance(token, str) for token in token_list)

feat: add docstring to EndpointHandler.__call__ ; when multiple inputs are sent, the output now also contains a token_list k/v pair for easier human inspection

feat: add docstring to EndpointHandler.__call__ ; when multiple inputs are sent, the output now also contains a token_list k/v pair for easier human inspection