# runtime_config.yaml

# Its contents configure the TGIS server & caikit.
# List of JVM options; empty here, so none are set by this file.
jvm_options: []

# Runtime section: names the library and sets model-loading and batching options.
runtime:
  # Library named for the runtime (caikit_nlp).
  library: caikit_nlp
  # NOTE(review): presumably lets local models be loaded on first use instead of
  # up front — confirm against the caikit runtime documentation.
  lazy_load_local_models: true
  batching:
    # Batching entry keyed by "standalone-model".
    standalone-model:
      # Currently 0; presumably 0 leaves batching off — confirm.
      size: 0 # Set to batch size for batching

# Model management section: declares the model finders and initializers.
model_management:
  # Two named finders are declared: "default" and "remote_tgis".
  finders:
    default:
      type: LOCAL
    remote_tgis:
      type: TGIS-AUTO
      config:
        # NOTE(review): presumably makes the finder check the TGIS connection
        # before reporting a model — confirm against the finder's documentation.
        test_connection: true
  # A single initializer, "default", of type LOCAL.
  initializers:
    default:
      type: LOCAL
      config:
        # List of backends; only one entry (TGIS) is configured here.
        backend_priority:
          - type: TGIS
            config:
              # Settings under "local".
              # NOTE(review): load_timeout and health_poll_delay are presumably
              # in seconds; null ports presumably mean "use the backend's
              # defaults" — confirm.
              local:
                load_timeout: 120
                grpc_port: null
                http_port: null
                health_poll_delay: 1.0
              # Per-model entries keyed by model name; each gives a host:port
              # hostname and a prompt_dir.
              remote_models:
                flan-t5-xl:
                  hostname: localhost:8033
                  prompt_dir: tgis_prompts
                llama-70b:
                  hostname: localhost:8034
                  prompt_dir: tgis_prompts

              # Connection settings that are not tied to a specific model name.
              # NOTE(review): "{model_id}" in hostname looks like a placeholder
              # that is filled in per model, and "foo" looks like a sample
              # value — confirm. No CA/client certificate or key files are
              # configured (all null).
              connection:
                hostname: "foo.{model_id}:1234"
                ca_cert_file: null
                client_cert_file: null
                client_key_file: null

# Config used only in EmbeddingModule. Set values here, or use environment
# variables such as EMBEDDING_RETRIES=32.
embedding:
  # Allow models with remote code.
  trust_remote_code: false
  # Number of times to retry on error. Most deployments should use 0 retries.
  retries: 0
  # Batch size for encode(). If <= 0 or invalid, the sentence-transformers
  # default is used.
  batch_size: 0
  # When true (the default), implicit truncation (with truncate_input_tokens=0)
  # throws an error; set to false to disable that error.
  implicit_truncation_errors: true
  # Attempt to optimize with PyTorch compile().
  pt2_compile: false
  # Use IPEX optimize. Works best when used with autocast (bfloat16) below.
  ipex: false
  # Use autocast in encode() with its default dtype (bfloat16).
  autocast: false
  # For testing, set device to "mps" on macOS or "xpu" for IPEX GPU.
  # Otherwise, the default (empty string) does automatic checks for a CUDA GPU
  # and falls back to CPU.
  device: ""