From fef7a2152c4d69b86852e49a09e6b79281a75527 Mon Sep 17 00:00:00 2001 From: dwelch-spike <53876192+dwelch-spike@users.noreply.github.com> Date: Mon, 5 Aug 2024 12:22:17 -0700 Subject: [PATCH] feat: vec-270 support separate index and vector data namespaces (#50) * feat: vec-270 support separate data and index namespaces in the quote search example * feat: support separate index and vector data namespaces in the image search example * chore: change docker example aerospike config to use separate index and vector namespaces * feat: add avs_index_set and avs_index_namespace to image and quote example configs --- docker/config/aerospike.conf | 38 +++++++--- .../templates/deployment.yaml | 4 + prism-image-search/README.md | 5 +- .../aerospike/aerospike-single-namespace.conf | 74 +++++++++++++++++++ .../aerospike/etc/aerospike/aerospike.conf | 19 ++++- prism-image-search/docker-compose.yml | 7 ++ prism-image-search/prism/config.py | 2 + prism-image-search/prism/indexer.py | 1 + quote-semantic-search/README.md | 5 +- .../aerospike/aerospike-single-namespace.conf | 74 +++++++++++++++++++ .../aerospike/etc/aerospike/aerospike.conf | 16 +++- quote-semantic-search/docker-compose.yml | 7 ++ quote-semantic-search/quote-search/config.py | 2 + quote-semantic-search/quote-search/indexer.py | 1 + 14 files changed, 236 insertions(+), 19 deletions(-) create mode 100644 prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf create mode 100644 quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf diff --git a/docker/config/aerospike.conf b/docker/config/aerospike.conf index 661282e..a1fa910 100644 --- a/docker/config/aerospike.conf +++ b/docker/config/aerospike.conf @@ -1,7 +1,7 @@ # Aerospike database configuration file for use with systemd. 
service { - cluster-name prism-demo + cluster-name docker-demo proto-fd-max 15000 } @@ -25,9 +25,9 @@ network { } heartbeat { - address any - mode mesh - port 3002 + mode multicast + multicast-group 239.1.99.222 + port 9918 # To use unicast-mesh heartbeats, remove the 3 lines above, and see # aerospike_mesh.conf for alternative. @@ -45,13 +45,23 @@ network { } } -namespace test { +namespace avs-index { replication-factor 1 nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/index.dat + filesize 8G + } +} +namespace avs-data { + replication-factor 2 + nsup-period 60 + storage-engine device { - file /opt/aerospike/data/test.dat - filesize 5G + file /opt/aerospike/data/data.dat + filesize 8G } } @@ -59,8 +69,16 @@ namespace avs-meta { replication-factor 1 nsup-period 100 - storage-engine device { - file /opt/aerospike/data/avs-meta.dat - filesize 5G + storage-engine memory { + data-size 1G } + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. 
+# } } + diff --git a/kubernetes/helm/quote-semantic-search/templates/deployment.yaml b/kubernetes/helm/quote-semantic-search/templates/deployment.yaml index e5ed839..f2a0b54 100644 --- a/kubernetes/helm/quote-semantic-search/templates/deployment.yaml +++ b/kubernetes/helm/quote-semantic-search/templates/deployment.yaml @@ -69,6 +69,10 @@ spec: value: {{ .Values.quoteSearchConfig.avsNamespace | quote }} - name: AVS_SET value: {{ .Values.quoteSearchConfig.avsSet | quote }} + - name: AVS_INDEX_NAMESPACE + value: {{ .Values.quoteSearchConfig.avsIndexNamespace | quote }} + - name: AVS_INDEX_SET + value: {{ .Values.quoteSearchConfig.avsIndexSet | quote }} - name: AVS_VERIFY_TLS value: {{ .Values.quoteSearchConfig.avsVerifyTls | quote }} - name: AVS_MAX_RESULTS diff --git a/prism-image-search/README.md b/prism-image-search/README.md index a212147..05570b9 100644 --- a/prism-image-search/README.md +++ b/prism-image-search/README.md @@ -110,7 +110,10 @@ If not set defaults are used. | AVS_HOST | localhost | AVS server seed host | | AVS_PORT | 5000 | AVS server seed host port | | AVS_ADVERTISED_LISTENER| | An optional advertised listener to use if configured on the AVS server | -| AVS_NAMESPACE | test | The aerospike namespace for storing the image records and index | +| AVS_NAMESPACE | test | The Aerospike namespace for storing the image records | +| AVS_SET | image-data | The Aerospike set for storing the image records | +| AVS_INDEX_NAMESPACE | test | The Aerospike namespace for storing the HNSW index | +| AVS_INDEX_SET | image-index | The Aerospike set for storing the HNSW index | | AVS_INDEX_NAME | prism-image-search | The name of the index | | AVS_MAX_RESULTS | 20 | Maximum number of vector search results to return | | AVS_IS_LOADBALANCER | False | If true, the first seed address will be treated as a load balancer node.``` diff --git a/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf 
b/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf new file mode 100644 index 0000000..5057850 --- /dev/null +++ b/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf @@ -0,0 +1,74 @@ +# Aerospike database configuration file for use with systemd. + +service { + cluster-name prism-demo + proto-fd-max 15000 +} + + +logging { + file /var/log/aerospike/aerospike.log { + context any info + } + + # Send log messages to stdout + console { + context any info + context query critical + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 1 + nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/index.dat + filesize 16G + } +} + +namespace avs-meta { + replication-factor 1 + nsup-period 100 + + storage-engine memory { + data-size 1G + } + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. 
+# } +} + diff --git a/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike.conf b/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike.conf index 0f01b0a..636d6aa 100644 --- a/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike.conf +++ b/prism-image-search/container-volumes/aerospike/etc/aerospike/aerospike.conf @@ -45,19 +45,30 @@ network { } } -namespace test { +namespace avs-index { replication-factor 1 nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/index.dat + filesize 8G + } +} - storage-engine memory { - data-size 2G +namespace avs-data { + replication-factor 2 + nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/data.dat + filesize 8G } } namespace avs-meta { replication-factor 1 nsup-period 100 - + storage-engine memory { data-size 1G } diff --git a/prism-image-search/docker-compose.yml b/prism-image-search/docker-compose.yml index 1ddedb3..09b5078 100644 --- a/prism-image-search/docker-compose.yml +++ b/prism-image-search/docker-compose.yml @@ -10,6 +10,8 @@ services: command: - "--config-file" - "/opt/aerospike/etc/aerospike/aerospike.conf" + # use this line to store all index and vector data in the default namespace (test) + # - "/opt/aerospike/etc/aerospike/aerospike-single-namespace.conf" healthcheck: # test: [ "CMD", "asinfo", "-U", "admin", "-P", "admin", "-p", "3000", "-v", "build" ] test: [ "CMD", "asinfo", "-p", "3000", "-v", "build" ] @@ -44,6 +46,11 @@ services: AVS_PORT: "5000" APP_NUM_QUOTES: "5000" GRPC_DNS_RESOLVER: native + # comment out the following lines to use the default namespace (test) to store all index and vector data + AVS_NAMESPACE: avs-data + AVS_SET: image-data + AVS_INDEX_NAMESPACE: avs-index + AVS_INDEX_SET: image-index volumes: - ./container-volumes/prism/images:/prism/static/images/data diff --git a/prism-image-search/prism/config.py b/prism-image-search/prism/config.py index 2dd4b53..8bb3407 100644 --- 
a/prism-image-search/prism/config.py +++ b/prism-image-search/prism/config.py @@ -25,6 +25,8 @@ class Config(object): AVS_INDEX_NAME = os.environ.get("AVS_INDEX_NAME") or "prism-image-search" AVS_NAMESPACE = os.environ.get("AVS_NAMESPACE") or "test" AVS_SET = os.environ.get("AVS_SET") or "image-data" + AVS_INDEX_NAMESPACE = os.environ.get("AVS_INDEX_NAMESPACE") or "test" + AVS_INDEX_SET = os.environ.get("AVS_INDEX_SET") or "image-index" AVS_VERIFY_TLS = get_bool_env("VERIFY_TLS", True) AVS_MAX_RESULTS = int(os.environ.get("AVS_MAX_RESULTS") or 20) MAX_CONTENT_LENGTH = int(os.environ.get("MAX_CONTENT_LENGTH") or 10485760) diff --git a/prism-image-search/prism/indexer.py b/prism-image-search/prism/indexer.py index 4fe9552..498356f 100644 --- a/prism-image-search/prism/indexer.py +++ b/prism-image-search/prism/indexer.py @@ -40,6 +40,7 @@ def create_index(): vector_field="image_embedding", dimensions=MODEL_DIM, vector_distance_metric=types.VectorDistanceMetric.COSINE, + index_storage=types.IndexStorage(namespace=Config.AVS_INDEX_NAMESPACE, set_name=Config.AVS_INDEX_SET), ) except Exception as e: logger.critical("Failed to connect to avs client %s", str(e)) diff --git a/quote-semantic-search/README.md b/quote-semantic-search/README.md index 79dc580..f9cb5f2 100644 --- a/quote-semantic-search/README.md +++ b/quote-semantic-search/README.md @@ -101,7 +101,10 @@ If not set defaults are used. 
| AVS_HOST | localhost | AVS server seed host | | AVS_PORT | 5000 | AVS server seed host port | | AVS_ADVERTISED_LISTENER| | An optional advertised listener to use if configured on the AVS server | -| AVS_NAMESPACE | test | The aerospike namespace for storing the image records and index | +| AVS_NAMESPACE | test | The Aerospike namespace for storing the quote records | +| AVS_SET | quote-data | The Aerospike set for storing the quote records | +| AVS_INDEX_NAMESPACE | test | The Aerospike namespace for storing the HNSW index | +| AVS_INDEX_SET | quote-index | The Aerospike set for storing the HNSW index | | AVS_INDEX_NAME | quote-search | The name of the index | | AVS_MAX_RESULTS | 20 | Maximum number of vector search results to return | | AVS_IS_LOADBALANCER | False | If true, the first seed address will be treated as a load balancer node.``` diff --git a/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf b/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf new file mode 100644 index 0000000..8a01f56 --- /dev/null +++ b/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike-single-namespace.conf @@ -0,0 +1,74 @@ +# Aerospike database configuration file for use with systemd. + +service { + cluster-name quote-demo + proto-fd-max 15000 +} + + +logging { + file /var/log/aerospike/aerospike.log { + context any info + } + + # Send log messages to stdout + console { + context any info + context query critical + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. 
+ + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 1 + nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/test.dat + filesize 16G + } +} + +namespace avs-meta { + replication-factor 1 + nsup-period 100 + + storage-engine memory { + data-size 1G + } + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} + diff --git a/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike.conf b/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike.conf index 8a01f56..08e127b 100644 --- a/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike.conf +++ b/quote-semantic-search/container-volumes/aerospike/etc/aerospike/aerospike.conf @@ -45,13 +45,23 @@ network { } } -namespace test { +namespace avs-index { replication-factor 1 nsup-period 60 storage-engine device { - file /opt/aerospike/data/test.dat - filesize 16G + file /opt/aerospike/data/index.dat + filesize 8G + } +} + +namespace avs-data { + replication-factor 2 + nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/data.dat + filesize 8G } } diff --git a/quote-semantic-search/docker-compose.yml b/quote-semantic-search/docker-compose.yml index 5cb143f..6931747 100644 --- a/quote-semantic-search/docker-compose.yml +++ b/quote-semantic-search/docker-compose.yml @@ -10,6 +10,8 @@ services: command: - "--config-file" - "/opt/aerospike/etc/aerospike/aerospike.conf" + # use this line to store all index and vector data in the default namespace (test) + # - "/opt/aerospike/etc/aerospike/aerospike-single-namespace.conf" healthcheck: # test: [ "CMD", "asinfo", "-U", "admin", "-P", "admin", "-p", "3000", "-v", "build" ] test: [ "CMD", "asinfo", "-p", "3000", "-v", 
"build" ] @@ -48,6 +50,11 @@ services: AVS_PORT: "5000" APP_NUM_QUOTES: "5000" GRPC_DNS_RESOLVER: native + # comment out the following lines to use the default namespace (test) to store all index and vector data + AVS_NAMESPACE: avs-data + AVS_SET: quote-data + AVS_INDEX_NAMESPACE: avs-index + AVS_INDEX_SET: quote-index networks: avs-demo: {} \ No newline at end of file diff --git a/quote-semantic-search/quote-search/config.py b/quote-semantic-search/quote-search/config.py index 7b4f378..a50fb25 100644 --- a/quote-semantic-search/quote-search/config.py +++ b/quote-semantic-search/quote-search/config.py @@ -23,6 +23,8 @@ class Config(object): AVS_INDEX_NAME = os.environ.get("AVS_INDEX_NAME") or "quote-semantic-search" AVS_NAMESPACE = os.environ.get("AVS_NAMESPACE") or "test" AVS_SET = os.environ.get("AVS_SET") or "quote-data" + AVS_INDEX_NAMESPACE = os.environ.get("AVS_INDEX_NAMESPACE") or "test" + AVS_INDEX_SET = os.environ.get("AVS_INDEX_SET") or "quote-index" AVS_VERIFY_TLS = get_bool_env("VERIFY_TLS", True) AVS_MAX_RESULTS = int(os.environ.get("AVS_MAX_RESULTS") or 5) INDEXER_PARALLELISM = int(os.environ.get("APP_INDEXER_PARALLELISM") or 1) diff --git a/quote-semantic-search/quote-search/indexer.py b/quote-semantic-search/quote-search/indexer.py index 3e9b849..471845a 100644 --- a/quote-semantic-search/quote-search/indexer.py +++ b/quote-semantic-search/quote-search/indexer.py @@ -53,6 +53,7 @@ def create_index(): vector_field="quote_embedding", dimensions=MODEL_DIM, vector_distance_metric=types.VectorDistanceMetric.COSINE, + index_storage=types.IndexStorage(namespace=Config.AVS_INDEX_NAMESPACE, set_name=Config.AVS_INDEX_SET), ) index_created = True