Row sampling option added.

refresh-bio · Oct 3, 2024 · 074b294 · 074b294
1 parent 7f72108
commit 074b294
Show file tree

Hide file tree

Showing 30 changed files with 634 additions and 179 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -87,7 +87,11 @@ jobs:
 
     - name: distance
       run: | 
-        ./kmer-db distance jaccard min max cosine mash k18.csv
+        ./kmer-db distance jaccard k18.csv k18.csv.jaccard
+        ./kmer-db distance min k18.csv k18.csv.min
+        ./kmer-db distance max k18.csv k18.csv.max
+        ./kmer-db distance cosine k18.csv k18.csv.cosine
+        ./kmer-db distance mash k18.csv k18.csv.mash
         cmp k18.csv.jaccard ${INPUT_DIR}/k18.csv.jaccard
         cmp k18.csv.min ${INPUT_DIR}/k18.csv.min
         cmp k18.csv.max ${INPUT_DIR}/k18.csv.max
@@ -128,7 +132,7 @@ jobs:
     - name: minhash (default k, fraction 0.1) + build + all2all 
       run: |
         ./kmer-db minhash -f 0.1 ${INPUT_DIR}/seqs.list
-        ./kmer-db build -from-minhash -k 25 ${INPUT_DIR}/seqs.list k18.minhash.db
+        ./kmer-db build -from-minhash ${INPUT_DIR}/seqs.list k18.minhash.db
         ./kmer-db all2all k18.minhash.db k18.minhash.csv
         cmp k18.minhash.csv ${INPUT_DIR}/k18.frac.csv
         

diff --git a/README.md b/README.md
@@ -71,16 +71,16 @@ echo $OUTPUT/k18.parts2.db >> $OUTPUT/db.list
 
 # 1. Installation
 
-Kmer-db comes with a set of [precompiled binaries](https://github.com/refresh-bio/kmer-db/releases) for Linux, OS X, and Windows. 
+Kmer-db comes with a set of [precompiled binaries](https://github.com/refresh-bio/kmer-db/releases) for Linux, macOS, and Windows. 
 The software is also available on [Bioconda](https://anaconda.org/bioconda/kmer-db):
 ```
 conda install -c bioconda kmer-db
 ```
 For detailed instructions on how to set up Bioconda, please refer to the [Bioconda manual](https://bioconda.github.io/user/install.html#install-conda).
 Kmer-db can also be built from the sources distributed as:
 
-* MAKE project (G++ 11 tested) for Linux and OS X,
-* Visual Studio 2015 solution for Windows.
+* MAKE project (C++-20-compatible compiler required, e.g., g++-11) for Linux and macOS,
+* Visual Studio 2022 solution for Windows.
 
 
 ## Vector extensions
@@ -155,15 +155,15 @@ Parameters:
 
 Dense computations - recommended when the distance matrix contains few zeros. Output can be stored in the dense or sparse form (`-sparse` switch).
 
-`kmer-db all2all [-buffer <size_mb>] [-t <threads>] [-sparse [-min <th>]* [-max <th>]* ] <database> <common_table>`
+`kmer-db all2all [-buffer <size_mb>] [-t <threads>] [-sparse [-min [<criterion>:]<value>]* [-max [<criterion>:]<value>]* ] <database> <common_table>`
 
 Sparse computations - recommended when the distance matrix contains many zeros. Output matrix is always in the sparse form:
 
-`kmer-db all2all-sp [-buffer <size_mb>] [-t <threads>] [-min <th>]* [-max <th>]* <database> <common_table>`
+`kmer-db all2all-sp [-buffer <size_mb>] [-t <threads>] [-min [<criterion>:]<value>]* [-max [<criterion>:]<value>]* [-sample-rows [<criterion>:]<count>] <database> <common_table>`
 
 Sparse computations, partial databases - use when the distance matrix contains many zeros and there are multiple partial databases. Output matrix is always in the sparse form:
 
-`kmer-db all2all-parts [-buffer <size_mb>] [-t <threads>] [-min <th>]* [-max <th>]* <db_list> <common_table>`
+`kmer-db all2all-parts [-buffer <size_mb>] [-t <threads>] [-min [<criterion>:]<value>]* [-max [<criterion>:]<value>]* [-sample-rows [<criterion>:]<count>] <db_list> <common_table>`
 
 Parameters:
 * `database` (input) - k-mer database file created by `build` mode,
@@ -172,14 +172,15 @@ Parameters:
 * `-buffer <size_mb>` - size of cache buffer in megabytes; use L3 size for Intel CPUs and L2 for AMD for best performance; default: 8,
 * `-t <threads>` - number of threads (default: number of available cores),
 * `-sparse` - stores output matrix in a sparse form (always on in `all2all-sp` and `all2all-parts` modes),
-* `-min <th>` - minimum output filtering, 
-* `-max <th>` - maximum output filtering.
+* `-min [<criterion>:]<value>` - retains elements with `criterion` greater than or equal to `value` (see details below), 
+* `-max [<criterion>:]<value>` - retains elements with `criterion` lower than or equal to `value` (see details below),
+* `-sample-rows [<criterion>:]<count>` - retains `count` elements in every row using one of the strategies: (i) random selection (no `criterion`); (ii) the best elements with respect to `criterion`.
 
-Filtering threshold `<th>` has the form `<[criterion:]value>` with `criterion` being `num-kmers` (number of common k-mers) or one of the distance/similarity measures (`jaccard`, `min`, `max`, `cosine`, `mash`, `ani`, `ani-shorder`, see 2.3 for definitions). If no `criterion` is specified, `num-kmers` is used by default. Multiple filters can be combined. 
+`criterion` can be `num-kmers` (number of common k-mers) or one of the distance/similarity measures: `jaccard`, `min`, `max`, `cosine`, `mash`, `ani`, `ani-shorter` (see 2.3 for definitions). No `criterion` indicates `num-kmers` (filtering) or random element selection (sampling). Multiple filters can be combined. 
 
 ### New samples against the database:
 
-`kmer-db new2all [-multisample-fasta | -from-kmers | -from-minhash] [-t <threads>]  [-sparse [-min <th>]* [-max <th>]* ] <database> <sample_list> <common_table>`
+`kmer-db new2all [-multisample-fasta | -from-kmers | -from-minhash] [-t <threads>]  [-sparse [-min [<criterion>:]<value>]* [-max [<criterion>:]<value>]* ] <database> <sample_list> <common_table>`
 
 Parameters:
 * `database` (input) - k-mer database file created by `build` mode,
@@ -188,11 +189,10 @@ Parameters:
 * `-multisample-fasta` / `-from-kmers` / `-from-minhash` - see `build` mode for details,
 * `-t <threads>` - number of threads (default: number of available cores),
 * `-sparse` - stores output matrix in a sparse form,
-* `-min <th>` - minimum output filtering, 
-* `-max <th>` - maximum output filtering.
-
-Filtering threshold `<th>` has the form `<[criterion:]value>` with `criterion` being `num-kmers` (number of common k-mers) or one of the distance/similarity measures (`jaccard`, `min`, `max`, `cosine`, `mash`, `ani`, `ani-shorder`, see 2.3 for definitions). If no `criterion` is specified, `num-kmers` is used by default. Multiple filters can be combined.
+* `-min [<criterion>:]<value>` - retains elements with `criterion` greater than or equal to `value` (see details below), 
+* `-max [<criterion>:]<value>` - retains elements with `criterion` lower than or equal to `value` (see details below),
 
+`criterion` can be `num-kmers` (number of common k-mers) or one of the distance/similarity measures: `jaccard`, `min`, `max`, `cosine`, `mash`, `ani`, `ani-shorter` (see 2.3 for definitions). No `criterion` indicates `num-kmers`. Multiple filters can be combined.
 
 ### Single sample against the database:
 
@@ -202,7 +202,7 @@ The meaning of the parameters is the same as in `new2all` mode, but instead of s
 
 ### Output format
 
-Modes `all2all`, `new2all`, and `one2all` produce a comma-separated table with numbers of common k-mers. The table is by default stored in a dense form:
+Modes `all2all`, `all2all-sp`, `all2all-parts`, `new2all`, and `one2all` produce a comma-separated table with numbers of common k-mers. For `all2all`, `new2all`, and `one2all` modes, the table is by default stored in a dense form:
 
 | 									| 								| 					| 					|		|			|	
 | :---: 							| :---: 						| :---: 			| :---:				| :---:	|  :---:	| 
@@ -221,9 +221,7 @@ where:
 * &#124;*a*&#124; - number of k-mers in sample *a*,
 * &#124;*a &cap; b*&#124; - number of k-mers common for samples *a* and *b*.
 
-For performance reasons, `all2all` mode produces a lower triangular matrix.
-
-When `-sparse` switch is specified, the table is stored in a sparse form. In particular, zeros are omitted while non-zero elements are represented as pairs (*column_id*: *value*) with 1-based column indexing. Thus, rows may have different number of elements, e.g.:
+When the `-sparse` switch is specified or the `all2all-sp` or `all2all-parts` mode is used, the table is stored in a sparse form. In particular, zeros are omitted while non-zero elements are represented as pairs (*column_id*: *value*) with 1-based column indexing. Thus, rows may have different numbers of elements, e.g.:
 
 | 									| 								| 					| 				|		|			|	
 | :---: 							| :---: 						| :---: 			| :---:			| :---:	|  :---:	| 
@@ -234,10 +232,13 @@ When `-sparse` switch is specified, the table is stored in a sparse form. In par
 | *q<sub>2</sub>* 					| &#124;*q<sub>2</sub>*&#124;	| ||||
 | ... 								| ...							| ... ||||
 | *q<sub>m</sub>* 					| &#124;*q<sub>m</sub>*&#124;	| *i<sub>m1</sub>*: &#124;*q<sub>m</sub> &cap; s<sub>i<sub>m1</sub></sub>*&#124;	| |||
+
+For performance reasons, `all2all`, `all2all-sp`, and `all2all-parts` modes produce a lower triangular matrix.
+
 
  ## 2.3. Calculating similarities or distances
 
-`kmer-db distance <measure> [-sparse [-min <th>]* [-max <th>]* ] <common_table> <output_table>`
+`kmer-db distance <measure> [-sparse [-min [<criterion>:]<value>]* [-max [<criterion>:]<value>]* ] <common_table> <output_table>`
 
 Parameters:
 * `measure` - name of the similarity/distance measure to be calculated; can be one of the following: 
@@ -252,12 +253,11 @@ Parameters:
 * `output_table` (output) - file containing table with calculated distance measure,  
 * `-phylip-out` - store output distance matrix in a Phylip format,
 * `-sparse` - outputs a sparse matrix (only for dense input matrices - sparse inputs always produce sparse outputs),
-* `-min <th>` - minimum output filtering, 
-* `-max <th>` - maximum output filtering.
-
-Filtering threshold `<th>` has the form `<[criterion:]value>` with `criterion` being `num-kmers` (number of common k-mers) or one of the distance/similarity measures (`jaccard`, `min`, `max`, `cosine`, `mash`, `ani`, `ani-shorder`, see 2.3 for definitions). If no `criterion` is specified, `measure` argument is used by default. Multiple filters can be combined.
-
+* `-min [<criterion>:]<value>` - retains elements with `criterion` greater than or equal to `value` (see details below), 
+* `-max [<criterion>:]<value>` - retains elements with `criterion` lower than or equal to `value` (see details below),
 
+`criterion` can be `num-kmers` (number of common k-mers) or one of the distance/similarity measures: `jaccard`, `min`, `max`, `cosine`, `mash`, `ani`, `ani-shorter` (see 2.3 for definitions). If no `criterion` is specified, `measure` argument is used by default. Multiple filters can be combined.
+
         
 ## 2.4. Storing minhashed k-mers
 

diff --git a/libs/mimalloc b/libs/mimalloc
diff --git a/libs/refresh/active_thread_pool/lib/active_thread_pool.h b/libs/refresh/active_thread_pool/lib/active_thread_pool.h
@@ -77,15 +77,16 @@ namespace refresh
 		void inc()
 		{
 			no_working.fetch_add(1);
-			is_operating.test_and_set();
+//			is_operating.test_and_set();
 		}
 
 		void dec()
 		{
-			if (no_working.fetch_sub(1) == 1)
+			no_working.fetch_sub(1);
+/*			if (no_working.fetch_sub(1) == 1)
 				is_operating.clear();
 
-			is_operating.notify_all();
+			is_operating.notify_all();*/
 		}
 
 		void set_exception(std::exception_ptr ptr)
@@ -114,20 +115,23 @@ namespace refresh
 			return exception_ptr != nullptr;
 		}
 
-		void wait()
+/*		void wait()
 		{
+			// !!! FIXME: can work improperly
 			is_operating.wait(true);
-		}
+		}*/
 
 		void heavy_wait()
 		{
-			while (is_operating.test())
+//			while (is_operating.test())
+			while (no_working.load() != 0)
 				;
 		}
 
 		void busy_wait()
 		{
-			while (is_operating.test())
+//			while (is_operating.test())
+			while (no_working.load() != 0)
 				refresh::utils::noop();
 		}
 
@@ -231,27 +235,23 @@ namespace refresh
 
 							if (pool_ptr)
 								pool_ptr->dec();
+							thread_state.store(thread_state_waiting);
 						}
 						catch (const std::exception &e)
 						{
 							if (pool_ptr)
 							{
-								pool_ptr->dec();
-
 								std::cerr << "Exception: " + std::string(e.what()) + "\n";
 								fflush(stderr);
 
 								pool_ptr->set_exception(std::current_exception());
-
+								pool_ptr->dec();
+								thread_state.store(thread_state_waiting);
+
 								return;
 							}
 						}
 
-						{
-//							auto tmp = thread_state_working;
-//							thread_state.compare_exchange_strong(tmp, thread_state_waiting);
-							thread_state.store(thread_state_waiting);
-						}
 						break;
 					}
 				}

diff --git a/libs/refresh/compression/lib/file_wrapper.h b/libs/refresh/compression/lib/file_wrapper.h
@@ -145,17 +145,30 @@ namespace refresh
 	{
 		std::string file_name;
 		size_t io_buffer_size;
-		FILE* file = nullptr;
-		bool test_extension = true;
+		FILE* file;
+		bool test_extension;
+
+		void _open()
+		{
+			file = fopen(file_name.c_str(), "rb");
+
+			if (!file)
+				return;
+
+			setvbuf(file, nullptr, _IOFBF, io_buffer_size);
+			buffer_released = true;
+		}
 
 	public:
 		stream_in_file(const std::string& file_name, size_t io_buffer_size = 16 << 20, size_t buffer_size = 8 << 20, bool test_extension = true) :
 			stream_in_buffered(buffer_size),
-			file_name(file_name),
-			io_buffer_size(io_buffer_size),
-			test_extension(test_extension)
+			file_name{ file_name },
+			io_buffer_size{ io_buffer_size },
+			file{ nullptr },
+			test_extension{test_extension}
 		{
-			open(file_name);
+//			open(file_name);
+			_open();
 		}
 
 		virtual ~stream_in_file()
@@ -170,16 +183,9 @@ namespace refresh
 			if (file)
 				close();
 
-			file = fopen(file_name.c_str(), "rb");
+			_open();
 
-			if (!file)
-				return false;
-
-			setvbuf(file, nullptr, _IOFBF, io_buffer_size);
-
-			buffer_released = true;
-
-			return true;
+			return file != nullptr;
 		}
 
 		virtual bool close()