Huggingface example usage is not working. #1

onurozcan361 · 2024-03-27T13:35:21Z

Here is my code and output.

# Use of custom modules that automatically detect the language of the passages to index and activate the language-specific adapters accordingly
from custom import CustomIndexer, CustomSearcher 
from colbert.infra import Run, RunConfig

# Minimal reproduction: index a three-passage corpus with the custom ColBERT
# indexer, then issue one query against the resulting index.
# NOTE(review): according to the output logged for this run, indexing estimates
# about 24 embeddings for this corpus while creating 64 partitions, and faiss
# k-means refuses to train with fewer points than clusters. The corpus therefore
# appears too small for the indexing step to complete — confirm with a larger one.
n_gpu: int = 1 # Set your number of available GPUs
experiment: str = "colbert" # Name of the folder where the logs and created indices will be stored
index_name: str = "my_index" # The name of your index, i.e. the name of your vector database
documents: list = ["Ceci est un premier document.", "Voici un second document.", "etc."] # Corpus
# Step 1: Indexing. This step encodes all passages into matrices, stores them on disk, and builds data structures for efficient search.
# The traceback reported for this run is raised during this step, in the k-means training call.
with Run().context(RunConfig(nranks=n_gpu,experiment=experiment)):
    indexer = CustomIndexer(checkpoint="antoinelouis/colbert-xm")

    indexer.index(name=index_name, collection=documents)


# Step 2: Searching. Given the model and index, you can issue queries over the collection to retrieve the top-k passages for each query.
# Reuses the same experiment name and index name as Step 1, so it looks up the index built above.
with Run().context(RunConfig(nranks=n_gpu,experiment=experiment)):
    searcher = CustomSearcher(index=index_name) # You don't need to specify checkpoint again, the model name is stored in the index.
    results = searcher.search(query="Comment effectuer une recherche avec ColBERT ?", k=10)
    # results: tuple of tuples of length k containing ((passage_id, passage_rank, passage_score), ...)

Output:
nranks = 1 num_gpus = 1 device=0
{
"query_token_id": "[unused0]",
"doc_token_id": "[unused1]",
"query_token": "[Q]",
"doc_token": "[D]",
"ncells": null,
"centroid_score_threshold": null,
"ndocs": null,
"load_index_with_mmap": false,
"index_path": null,
"index_bsize": 64,
"nbits": 2,
"kmeans_niters": 4,
"resume": false,
"similarity": "cosine",
"bsize": 64,
"accumsteps": 1,
"lr": 5e-6,
"maxsteps": 200000,
"save_every": null,
"warmup": 20000,
"warmup_bert": null,
"relu": false,
"nway": 2,
"use_ib_negatives": true,
"reranker": false,
"distillation_alpha": 1.0,
"ignore_scores": true,
"model_name": "facebook/xmod-base",
"query_maxlen": 32,
"attend_to_mask_tokens": true,
"interaction": "colbert",
"dim": 128,
"doc_maxlen": 256,
"mask_punctuation": true,
"checkpoint": "antoinelouis/colbert-xm",
"triples": "data/mmarco/tuples.train.scores-ids.2nway.25.6M.jsonl",
"collection": [
"Ceci est un premier document.",
"Voici un second document.",
"etc."
],
"queries": "data/mmarco/english_queries.train.tsv",
"index_name": "my_index",
"overwrite": false,
"root": "/home/user/Desktop/colbert/experiments",
"experiment": "colbert",
"index_root": null,
"name": "2024-03/27/15.00.59",
"rank": 0,
"nranks": 1,
"amp": true,
"gpus": 1,
"avoid_fork_if_possible": false
}
[Mar 27, 16:23:19] [0] # of sampled PIDs = 3 sampled_pids[:3] = [1, 0, 2]
[Mar 27, 16:23:19] [0] #> Encoding 3 passages..
[Mar 27, 16:23:21] [0] avg_doclen_est = 8.333333015441895 len(local_sample) = 3
[Mar 27, 16:23:21] [0] Creating 64 partitions.
[Mar 27, 16:23:21] [0] Estimated 24 embeddings.
[Mar 27, 16:23:21] [0] #> Saving the indexing plan to /home/user/Desktop/colbert/experiments/colbert/indexes/my_index/plan.json ..
Process Process-32:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/colbert/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/home/user/anaconda3/envs/colbert/lib/python3.11/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/infra/launcher.py", line 134, in setup_new_process
return_val = callee(config, *args)
^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 33, in encode
encoder.run(shared_lists)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 68, in run
self.train(shared_lists) # Trains centroids from selected passages
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 232, in train
centroids = self._train_kmeans(sample, shared_lists)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 304, in _train_kmeans
centroids = compute_faiss_kmeans(*args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 507, in compute_faiss_kmeans
kmeans.train(sample)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/faiss/__init__.py", line 1560, in train
clus.train(x, self.index, weights)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/faiss/__init__.py", line 68, in replacement_train
self.train_c(n, swig_ptr(x), index)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/faiss/swigfaiss_avx2.py", line 2286, in train
return _swigfaiss_avx2.Clustering_train(self, n, x, index, x_weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Error in void faiss::Clustering::train_encoded(faiss::Clustering::idx_t, const uint8_t*, const faiss::Index*, faiss::Index&, const float*) at /home/conda/feedstock_root/build_artifacts/faiss-split_1685015639137/work/faiss/Clustering.cpp:277: Error: 'nx >= k' failed: Number of training points (24) should be at least as large as number of clusters (64)

Question 1: I am trying to run the Hugging Face usage example from https://huggingface.co/antoinelouis/colbert-xm, but it fails with the error above. Could you help me solve it?
Question 2: In the config.json file there is a line `"max_position_embeddings": 514`. Why is max_position_embeddings not equal to 512?

The text was updated successfully, but these errors were encountered:

onurozcan361 changed the title ~~Huggingface example usage is not working!~~ Huggingface example usage is not working. Mar 27, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Huggingface example usage is not working. #1

Huggingface example usage is not working. #1

onurozcan361 commented Mar 27, 2024 •

edited

Loading

Huggingface example usage is not working. #1

Huggingface example usage is not working. #1

Comments

onurozcan361 commented Mar 27, 2024 • edited Loading

onurozcan361 commented Mar 27, 2024 •

edited

Loading