```python
# Use of custom modules that automatically detect the language of the passages
# to index and activate the language-specific adapters accordingly
from custom import CustomIndexer, CustomSearcher
from colbert.infra import Run, RunConfig

n_gpu: int = 1  # Set your number of available GPUs
experiment: str = "colbert"  # Name of the folder where the logs and created indices will be stored
index_name: str = "my_index"  # The name of your index, i.e. the name of your vector database
documents: list = ["Ceci est un premier document.", "Voici un second document.", "etc."]  # Corpus

# Step 1: Indexing. This step encodes all passages into matrices, stores them on disk,
# and builds data structures for efficient search.
with Run().context(RunConfig(nranks=n_gpu, experiment=experiment)):
    indexer = CustomIndexer(checkpoint="antoinelouis/colbert-xm")
    indexer.index(name=index_name, collection=documents)

# Step 2: Searching. Given the model and index, you can issue queries over the collection
# to retrieve the top-k passages for each query.
with Run().context(RunConfig(nranks=n_gpu, experiment=experiment)):
    searcher = CustomSearcher(index=index_name)  # No need to specify the checkpoint again; the model name is stored in the index.
    results = searcher.search(query="Comment effectuer une recherche avec ColBERT ?", k=10)
    # results: tuple of tuples of length k containing ((passage_id, passage_rank, passage_score), ...)
```
Output:

```
nranks = 1 num_gpus = 1 device=0
{
"query_token_id": "[unused0]",
"doc_token_id": "[unused1]",
"query_token": "[Q]",
"doc_token": "[D]",
"ncells": null,
"centroid_score_threshold": null,
"ndocs": null,
"load_index_with_mmap": false,
"index_path": null,
"index_bsize": 64,
"nbits": 2,
"kmeans_niters": 4,
"resume": false,
"similarity": "cosine",
"bsize": 64,
"accumsteps": 1,
"lr": 5e-6,
"maxsteps": 200000,
"save_every": null,
"warmup": 20000,
"warmup_bert": null,
"relu": false,
"nway": 2,
"use_ib_negatives": true,
"reranker": false,
"distillation_alpha": 1.0,
"ignore_scores": true,
"model_name": "facebook/xmod-base",
"query_maxlen": 32,
"attend_to_mask_tokens": true,
"interaction": "colbert",
"dim": 128,
"doc_maxlen": 256,
"mask_punctuation": true,
"checkpoint": "antoinelouis/colbert-xm",
"triples": "data/mmarco/tuples.train.scores-ids.2nway.25.6M.jsonl",
"collection": [
"Ceci est un premier document.",
"Voici un second document.",
"etc."
],
"queries": "data/mmarco/english_queries.train.tsv",
"index_name": "my_index",
"overwrite": false,
"root": "/home/user/Desktop/colbert/experiments",
"experiment": "colbert",
"index_root": null,
"name": "2024-03/27/15.00.59",
"rank": 0,
"nranks": 1,
"amp": true,
"gpus": 1,
"avoid_fork_if_possible": false
}
[Mar 27, 16:23:19] [0] # of sampled PIDs = 3 sampled_pids[:3] = [1, 0, 2]
[Mar 27, 16:23:19] [0] #> Encoding 3 passages..
[Mar 27, 16:23:21] [0] avg_doclen_est = 8.333333015441895 len(local_sample) = 3
[Mar 27, 16:23:21] [0] Creating 64 partitions.
[Mar 27, 16:23:21] [0] Estimated 24 embeddings.
[Mar 27, 16:23:21] [0] #> Saving the indexing plan to /home/user/Desktop/colbert/experiments/colbert/indexes/my_index/plan.json ..
Process Process-32:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/colbert/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/home/user/anaconda3/envs/colbert/lib/python3.11/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/infra/launcher.py", line 134, in setup_new_process
return_val = callee(config, *args)
^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 33, in encode
encoder.run(shared_lists)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 68, in run
self.train(shared_lists) # Trains centroids from selected passages
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 232, in train
centroids = self._train_kmeans(sample, shared_lists)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 304, in train_kmeans
centroids = compute_faiss_kmeans(*args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/colbert/indexing/collection_indexer.py", line 507, in compute_faiss_kmeans
kmeans.train(sample)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/faiss/init.py", line 1560, in train
clus.train(x, self.index, weights)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/faiss/init.py", line 68, in replacement_train
self.train_c(n, swig_ptr(x), index)
File "/home/user/anaconda3/envs/colbert/lib/python3.11/site-packages/faiss/swigfaiss_avx2.py", line 2286, in train
return _swigfaiss_avx2.Clustering_train(self, n, x, index, x_weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Error in void faiss::Clustering::train_encoded(faiss::Clustering::idx_t, const uint8_t*, const faiss::Index*, faiss::Index&, const float*) at /home/conda/feedstock_root/build_artifacts/faiss-split_1685015639137/work/faiss/Clustering.cpp:277: Error: 'nx >= k' failed: Number of training points (24) should be at least as large as number of clusters (64)
```
Question-1: I am trying to run the Hugging Face usage example from https://huggingface.co/antoinelouis/colbert-xm, but it gives the error above. Could you help me solve it?
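For context, here is how I read the numbers in the log. If I understand `colbert/indexing/collection_indexer.py` correctly, the number of k-means partitions is derived from the estimated total embedding count, so even a 3-passage corpus (about 8 tokens per passage, hence the estimated 24 embeddings) still produces 64 partitions, and faiss then refuses to train 64 centroids from 24 points. A rough sketch of that arithmetic (the formula below is my reading of the source, not something I have verified):

```python
import math

# My reading of CollectionIndexer.setup() in colbert/indexing/collection_indexer.py:
# the partition count grows with the square root of the estimated embedding count.
num_passages = 3
avg_doclen_est = 8.33  # from the log: avg_doclen_est = 8.333...
num_embeddings_est = num_passages * avg_doclen_est  # ~25; the log reports 24

num_partitions = int(2 ** math.floor(math.log2(16 * math.sqrt(num_embeddings_est))))
print(num_partitions)  # 64 -> faiss k-means then needs at least 64 training points
```

If that reading is right, the toy 3-document corpus is simply too small to index, and the workaround on my side would be to index a collection with enough passages that the embedding count exceeds the partition count.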
Question-2: In the config.json file there is the line `"max_position_embeddings": 514`. Why is max_position_embeddings not equal to 512?
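My own guess on Question-2, in case it helps: X-MOD follows the RoBERTa convention where position ids start at padding_idx + 1 instead of 0, so 512 usable positions need an embedding table of 512 + padding_idx + 1 = 514 rows. A minimal sketch of that convention (mirroring my understanding of transformers' `create_position_ids_from_input_ids`; treat the exact code as an assumption):

```python
import torch

def create_position_ids(input_ids: torch.Tensor, padding_idx: int = 1) -> torch.Tensor:
    # Non-pad tokens get positions padding_idx + 1, padding_idx + 2, ...;
    # pad tokens keep position padding_idx itself.
    mask = input_ids.ne(padding_idx).int()
    return (torch.cumsum(mask, dim=1) * mask + padding_idx).long()

tokens = torch.tensor([[0, 100, 200, 2, 1, 1]])  # <s> w1 w2 </s> <pad> <pad>
print(create_position_ids(tokens))  # tensor([[2, 3, 4, 5, 1, 1]])
# With padding_idx = 1, a 512-token sequence ends at position 513,
# so the position-embedding table needs 514 rows.
```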