diff --git a/.github/workflows/autodocs.yml b/.github/workflows/autodocs.yml
new file mode 100644
index 00000000000..a981c09cb39
--- /dev/null
+++ b/.github/workflows/autodocs.yml
@@ -0,0 +1,21 @@
+name: Automatic Documentation for Launcher
+
+on:
+  pull_request:
+
+jobs:
+  update_docs:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Install Launcher
+        id: install-launcher
+        run: cargo install --git https://github.com/${{ github.repository }} --branch ${{ github.head_ref }} text-generation-launcher
+
+      - name: Check launcher Docs are up-to-date
+        run: |
+          echo text-generation-launcher --help
+          python update_doc.py --check
diff --git a/Cargo.lock b/Cargo.lock
index 0f3b39de385..8fa7b726665 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,9 +4,9 @@ version = 3
[[package]]
name = "addr2line"
-version = "0.20.0"
+version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
+checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
dependencies = [
"gimli",
]
@@ -17,17 +17,6 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
-[[package]]
-name = "aes"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2"
-dependencies = [
- "cfg-if",
- "cipher",
- "cpufeatures",
-]
-
[[package]]
name = "ahash"
version = "0.8.3"
@@ -50,33 +39,32 @@ dependencies = [
[[package]]
name = "aho-corasick"
-version = "1.0.3"
+version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86b8f9420f797f2d9e935edf629310eb938a0d839f984e25327f3c7eed22300c"
+checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
-version = "0.3.2"
+version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
+checksum = "b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
- "is-terminal",
"utf8parse",
]
[[package]]
name = "anstyle"
-version = "1.0.1"
+version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd"
+checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46"
[[package]]
name = "anstyle-parse"
@@ -98,9 +86,9 @@ dependencies = [
[[package]]
name = "anstyle-wincon"
-version = "1.0.2"
+version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c"
+checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd"
dependencies = [
"anstyle",
"windows-sys 0.48.0",
@@ -108,9 +96,9 @@ dependencies = [
[[package]]
name = "anyhow"
-version = "1.0.72"
+version = "1.0.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854"
+checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
[[package]]
name = "arc-swap"
@@ -125,7 +113,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93b21a03b7c21702a0110f9f8d228763a533570deb376119042dabf33c37a01a"
dependencies = [
"futures-io",
- "rustls",
+ "rustls 0.20.9",
"webpki",
]
@@ -148,7 +136,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -159,7 +147,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -170,9 +158,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "average"
-version = "0.13.1"
+version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "843ec791d3f24503bbf72bbd5e49a3ab4dbb4bcd0a8ef6b0c908efa73caa27b1"
+checksum = "6d804c74bb2d66e9b7047658d21af0f1c937d7d2466410cbf1aed3b0c04048d4"
dependencies = [
"easy-cast",
"float-ord",
@@ -242,25 +230,27 @@ dependencies = [
[[package]]
name = "axum-tracing-opentelemetry"
-version = "0.10.0"
+version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "164b95427e83b79583c7699a72b4a6b485a12bbdef5b5c054ee5ff2296d82f52"
+checksum = "06985105829f176e9a3f113b1c71cc24e08f600ef0df4e70cd90d144f889e19f"
dependencies = [
"axum",
- "futures",
+ "futures-core",
+ "futures-util",
"http",
- "opentelemetry 0.18.0",
+ "opentelemetry",
+ "pin-project-lite",
"tower",
- "tower-http 0.3.5",
"tracing",
- "tracing-opentelemetry 0.18.0",
+ "tracing-opentelemetry",
+ "tracing-opentelemetry-instrumentation-sdk",
]
[[package]]
name = "backtrace"
-version = "0.3.68"
+version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
+checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
dependencies = [
"addr2line",
"cc",
@@ -279,15 +269,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "base64"
-version = "0.21.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d"
-
-[[package]]
-name = "base64ct"
-version = "1.6.0"
+version = "0.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2"
[[package]]
name = "bitflags"
@@ -312,9 +296,9 @@ dependencies = [
[[package]]
name = "bumpalo"
-version = "3.13.0"
+version = "3.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
[[package]]
name = "bytecount"
@@ -330,52 +314,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "bytes"
-version = "1.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
-
-[[package]]
-name = "bzip2"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
-dependencies = [
- "bzip2-sys",
- "libc",
-]
-
-[[package]]
-name = "bzip2-sys"
-version = "0.1.11+1.0.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
-]
-
-[[package]]
-name = "cached-path"
-version = "0.6.1"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "097968e38f1319207f057d0f4d76452e4f4f847a5de61c5215379f297fa034f3"
-dependencies = [
- "flate2",
- "fs2",
- "glob",
- "indicatif 0.16.2",
- "log",
- "rand",
- "reqwest",
- "serde",
- "serde_json",
- "sha2",
- "tar",
- "tempfile",
- "thiserror",
- "zip",
-]
+checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
[[package]]
name = "cassowary"
@@ -385,11 +326,10 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
[[package]]
name = "cc"
-version = "1.0.82"
+version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01"
+checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
dependencies = [
- "jobserver",
"libc",
]
@@ -399,32 +339,21 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
-[[package]]
-name = "cipher"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
-dependencies = [
- "crypto-common",
- "inout",
-]
-
[[package]]
name = "clap"
-version = "4.3.21"
+version = "4.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c27cdf28c0f604ba3f512b0c9a409f8de8513e4816705deb0498b627e7c3a3fd"
+checksum = "824956d0dca8334758a5b7f7e50518d66ea319330cbceedcf76905c2f6ab30e3"
dependencies = [
"clap_builder",
"clap_derive",
- "once_cell",
]
[[package]]
name = "clap_builder"
-version = "4.3.21"
+version = "4.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08a9f1ab5e9f01a9b81f202e8562eb9a10de70abf9eaeac1be465c28b75aa4aa"
+checksum = "122ec64120a49b4563ccaedcbea7818d069ed8e9aa6d829b82d8a4128936b2ab"
dependencies = [
"anstream",
"anstyle",
@@ -434,21 +363,21 @@ dependencies = [
[[package]]
name = "clap_derive"
-version = "4.3.12"
+version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050"
+checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873"
dependencies = [
"heck",
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
name = "clap_lex"
-version = "0.5.0"
+version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
+checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961"
[[package]]
name = "colorchoice"
@@ -469,12 +398,6 @@ dependencies = [
"windows-sys 0.45.0",
]
-[[package]]
-name = "constant_time_eq"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
-
[[package]]
name = "core-foundation"
version = "0.9.3"
@@ -539,7 +462,7 @@ dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
- "memoffset 0.9.0",
+ "memoffset",
"scopeguard",
]
@@ -554,11 +477,11 @@ dependencies = [
[[package]]
name = "crossterm"
-version = "0.26.1"
+version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13"
+checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.0",
"crossterm_winapi",
"libc",
"mio",
@@ -589,9 +512,9 @@ dependencies = [
[[package]]
name = "ctrlc"
-version = "3.4.0"
+version = "3.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a011bbe2c35ce9c1f143b7af6f94f29a167beb4cd1d29e6740ce836f723120e"
+checksum = "82e95fbd621905b854affdc67943b043a0fbb6ed7385fd5a25650d19a8a6cfdf"
dependencies = [
"nix",
"windows-sys 0.48.0",
@@ -632,24 +555,11 @@ dependencies = [
"syn 1.0.109",
]
-[[package]]
-name = "dashmap"
-version = "5.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
-dependencies = [
- "cfg-if",
- "hashbrown 0.14.0",
- "lock_api",
- "once_cell",
- "parking_lot_core",
-]
-
[[package]]
name = "deranged"
-version = "0.3.7"
+version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7684a49fb1af197853ef7b2ee694bc1f5b4179556f1e5710e1760c5db6f5e929"
+checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946"
[[package]]
name = "derive_builder"
@@ -690,7 +600,6 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
- "subtle",
]
[[package]]
@@ -699,7 +608,16 @@ version = "4.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059"
dependencies = [
- "dirs-sys",
+ "dirs-sys 0.3.7",
+]
+
+[[package]]
+name = "dirs"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
+dependencies = [
+ "dirs-sys 0.4.1",
]
[[package]]
@@ -713,11 +631,23 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "dirs-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users",
+ "windows-sys 0.48.0",
+]
+
[[package]]
name = "easy-cast"
-version = "0.4.4"
+version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bd102ee8c418348759919b83b81cdbdc933ffe29740b903df448b4bafaa348e"
+checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6"
dependencies = [
"libm",
]
@@ -736,9 +666,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "encoding_rs"
-version = "0.8.32"
+version = "0.8.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1"
dependencies = [
"cfg-if",
]
@@ -751,9 +681,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
-version = "0.3.2"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f"
+checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd"
dependencies = [
"errno-dragonfly",
"libc",
@@ -781,21 +711,9 @@ dependencies = [
[[package]]
name = "fastrand"
-version = "2.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
-
-[[package]]
-name = "filetime"
-version = "0.2.22"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0"
-dependencies = [
- "cfg-if",
- "libc",
- "redox_syscall 0.3.5",
- "windows-sys 0.48.0",
-]
+checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "fixedbitset"
@@ -805,9 +723,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
[[package]]
name = "flate2"
-version = "1.0.26"
+version = "1.0.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010"
dependencies = [
"crc32fast",
"miniz_oxide",
@@ -827,14 +745,13 @@ checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853"
[[package]]
name = "flume"
-version = "0.10.14"
+version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577"
+checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181"
dependencies = [
"futures-core",
"futures-sink",
"nanorand",
- "pin-project",
"spin 0.9.8",
]
@@ -868,16 +785,6 @@ dependencies = [
"percent-encoding",
]
-[[package]]
-name = "fs2"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
-dependencies = [
- "libc",
- "winapi",
-]
-
[[package]]
name = "futures"
version = "0.3.28"
@@ -934,7 +841,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -1001,31 +908,25 @@ dependencies = [
[[package]]
name = "gimli"
-version = "0.27.3"
+version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e"
-
-[[package]]
-name = "glob"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
+checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0"
[[package]]
name = "grpc-metadata"
version = "0.1.0"
dependencies = [
- "opentelemetry 0.19.0",
- "tonic 0.9.2",
+ "opentelemetry",
+ "tonic 0.10.1",
"tracing",
- "tracing-opentelemetry 0.19.0",
+ "tracing-opentelemetry",
]
[[package]]
name = "h2"
-version = "0.3.20"
+version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97ec8491ebaf99c8eaa73058b045fe58073cd6be7f596ac993ced0b0a0c01049"
+checksum = "91fc23aa11be92976ef4729127f1a74adf36d8436f7816b185d18df956790833"
dependencies = [
"bytes",
"fnv",
@@ -1069,17 +970,50 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
-version = "0.3.2"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
-name = "hmac"
-version = "0.12.1"
+name = "hf-hub"
+version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+checksum = "ca0a2d78bbd80fdb804b213f675d261eb87060571e1c389bcad8c24add066cf9"
dependencies = [
- "digest",
+ "dirs 5.0.1",
+ "indicatif 0.17.7",
+ "native-tls",
+ "rand",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "ureq",
+]
+
+[[package]]
+name = "hf-hub"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9caf63dfcf84b1b62532c836035516f5b475b06c906da6f16990d22847aba216"
+dependencies = [
+ "dirs 5.0.1",
+ "indicatif 0.17.7",
+ "log",
+ "native-tls",
+ "rand",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "ureq",
+]
+
+[[package]]
+name = "home"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb"
+dependencies = [
+ "windows-sys 0.48.0",
]
[[package]]
@@ -1233,42 +1167,51 @@ dependencies = [
[[package]]
name = "indicatif"
-version = "0.16.2"
+version = "0.17.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b"
+checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25"
dependencies = [
"console",
- "lazy_static",
+ "instant",
"number_prefix 0.4.0",
- "regex",
+ "portable-atomic",
+ "unicode-width",
]
[[package]]
-name = "inout"
-version = "0.1.3"
+name = "indoc"
+version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
-dependencies = [
- "generic-array",
-]
+checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8"
[[package]]
-name = "ipnet"
-version = "2.8.0"
+name = "init-tracing-opentelemetry"
+version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
+checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17"
+dependencies = [
+ "opentelemetry",
+ "opentelemetry-otlp",
+ "thiserror",
+ "tracing",
+ "tracing-opentelemetry",
+]
[[package]]
-name = "is-terminal"
-version = "0.4.9"
+name = "instant"
+version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
+checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
- "hermit-abi",
- "rustix",
- "windows-sys 0.48.0",
+ "cfg-if",
]
+[[package]]
+name = "ipnet"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
+
[[package]]
name = "itertools"
version = "0.8.2"
@@ -1297,19 +1240,19 @@ dependencies = [
]
[[package]]
-name = "itoa"
-version = "1.0.9"
+name = "itertools"
+version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
[[package]]
-name = "jobserver"
-version = "0.1.26"
+name = "itoa"
+version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
-dependencies = [
- "libc",
-]
+checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
[[package]]
name = "js-sys"
@@ -1328,9 +1271,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
-version = "0.2.147"
+version = "0.2.148"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b"
[[package]]
name = "libm"
@@ -1340,9 +1283,9 @@ checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
[[package]]
name = "linux-raw-sys"
-version = "0.4.5"
+version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
+checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128"
[[package]]
name = "lock_api"
@@ -1402,24 +1345,15 @@ dependencies = [
[[package]]
name = "matchit"
-version = "0.7.2"
+version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed1202b2a6f884ae56f04cff409ab315c5ce26b5e58d7412e484f01fd52f52ef"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "memchr"
-version = "2.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
-
-[[package]]
-name = "memoffset"
-version = "0.7.1"
+version = "2.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
-dependencies = [
- "autocfg",
-]
+checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
[[package]]
name = "memoffset"
@@ -1447,7 +1381,7 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a4964177ddfdab1e3a2b37aec7cf320e14169abb0ed73999f558136409178d5"
dependencies = [
- "base64 0.21.2",
+ "base64 0.21.4",
"hyper",
"indexmap 1.9.3",
"ipnet",
@@ -1467,7 +1401,7 @@ checksum = "ddece26afd34c31585c74a4db0630c376df271c285d682d1e55012197830b6df"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -1546,7 +1480,7 @@ checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -1603,9 +1537,9 @@ dependencies = [
[[package]]
name = "ngrok"
-version = "0.12.4"
+version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87e211f407b0a084f720823a00c956aeab2c15dfe7a61760d93227bbaf048026"
+checksum = "1454b1edbc5f2c8ff3242c237cb84388b50eced8eb26b4204e49698ed6511784"
dependencies = [
"arc-swap",
"async-rustls",
@@ -1634,16 +1568,13 @@ dependencies = [
[[package]]
name = "nix"
-version = "0.26.2"
+version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
+checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.0",
"cfg-if",
"libc",
- "memoffset 0.7.1",
- "pin-utils",
- "static_assertions",
]
[[package]]
@@ -1724,9 +1655,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "object"
-version = "0.31.1"
+version = "0.32.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1"
+checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0"
dependencies = [
"memchr",
]
@@ -1761,11 +1692,11 @@ dependencies = [
[[package]]
name = "openssl"
-version = "0.10.56"
+version = "0.10.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "729b745ad4a5575dd06a3e1af1414bd330ee561c01b3899eb584baeaa8def17e"
+checksum = "bac25ee399abb46215765b1cb35bc0212377e58a061560d8b29b024fd0430e7c"
dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.0",
"cfg-if",
"foreign-types",
"libc",
@@ -1782,7 +1713,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -1793,9 +1724,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
-version = "0.9.91"
+version = "0.9.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "866b5f16f90776b9bb8dc1e1802ac6f0513de3a7a7465867bfbc563dc737faac"
+checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d"
dependencies = [
"cc",
"libc",
@@ -1805,81 +1736,76 @@ dependencies = [
[[package]]
name = "opentelemetry"
-version = "0.18.0"
+version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
+checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
dependencies = [
- "opentelemetry_api 0.18.0",
- "opentelemetry_sdk 0.18.0",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
]
[[package]]
-name = "opentelemetry"
-version = "0.19.0"
+name = "opentelemetry-http"
+version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
dependencies = [
- "opentelemetry_api 0.19.0",
- "opentelemetry_sdk 0.19.0",
+ "async-trait",
+ "bytes",
+ "http",
+ "opentelemetry_api",
]
[[package]]
name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
dependencies = [
"async-trait",
- "futures",
- "futures-util",
+ "futures-core",
"http",
- "opentelemetry 0.19.0",
"opentelemetry-proto",
- "prost",
+ "opentelemetry-semantic-conventions",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
+ "prost 0.11.9",
"thiserror",
"tokio",
- "tonic 0.8.3",
+ "tonic 0.9.2",
]
[[package]]
name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
dependencies = [
- "futures",
- "futures-util",
- "opentelemetry 0.19.0",
- "prost",
- "tonic 0.8.3",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
+ "prost 0.11.9",
+ "tonic 0.9.2",
]
[[package]]
-name = "opentelemetry_api"
-version = "0.18.0"
+name = "opentelemetry-semantic-conventions"
+version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
+checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
dependencies = [
- "fnv",
- "futures-channel",
- "futures-util",
- "indexmap 1.9.3",
- "js-sys",
- "once_cell",
- "pin-project-lite",
- "thiserror",
+ "opentelemetry",
]
[[package]]
name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
dependencies = [
- "fnv",
"futures-channel",
"futures-util",
"indexmap 1.9.3",
+ "js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
@@ -1888,46 +1814,40 @@ dependencies = [
[[package]]
name = "opentelemetry_sdk"
-version = "0.18.0"
+version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
+checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
dependencies = [
"async-trait",
"crossbeam-channel",
- "dashmap",
- "fnv",
"futures-channel",
"futures-executor",
"futures-util",
"once_cell",
- "opentelemetry_api 0.18.0",
+ "opentelemetry_api",
+ "ordered-float",
"percent-encoding",
"rand",
+ "regex",
+ "serde_json",
"thiserror",
"tokio",
"tokio-stream",
]
[[package]]
-name = "opentelemetry_sdk"
-version = "0.19.0"
+name = "option-ext"
+version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
+[[package]]
+name = "ordered-float"
+version = "3.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06"
dependencies = [
- "async-trait",
- "crossbeam-channel",
- "dashmap",
- "fnv",
- "futures-channel",
- "futures-executor",
- "futures-util",
- "once_cell",
- "opentelemetry_api 0.19.0",
- "percent-encoding",
- "rand",
- "thiserror",
- "tokio",
- "tokio-stream",
+ "num-traits",
]
[[package]]
@@ -1938,9 +1858,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "papergrid"
-version = "0.9.1"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae7891b22598926e4398790c8fe6447930c72a67d36d983a49d6ce682ce83290"
+checksum = "a2ccbe15f2b6db62f9a9871642746427e297b0ceb85f9a7f1ee5ff47d184d0c8"
dependencies = [
"bytecount",
"fnv",
@@ -1967,18 +1887,7 @@ dependencies = [
"libc",
"redox_syscall 0.3.5",
"smallvec",
- "windows-targets 0.48.1",
-]
-
-[[package]]
-name = "password-hash"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700"
-dependencies = [
- "base64ct",
- "rand_core",
- "subtle",
+ "windows-targets 0.48.5",
]
[[package]]
@@ -1987,18 +1896,6 @@ version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
-[[package]]
-name = "pbkdf2"
-version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
-dependencies = [
- "digest",
- "hmac",
- "password-hash",
- "sha2",
-]
-
[[package]]
name = "percent-encoding"
version = "2.3.0"
@@ -2007,12 +1904,12 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
[[package]]
name = "petgraph"
-version = "0.6.3"
+version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
+checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9"
dependencies = [
"fixedbitset",
- "indexmap 1.9.3",
+ "indexmap 2.0.0",
]
[[package]]
@@ -2032,14 +1929,14 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
name = "pin-project-lite"
-version = "0.2.12"
+version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cc1b0bf1727a77a54b6654e7b5f1af8604923edc8b81885f8ec92f9e3f0a05"
+checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
[[package]]
name = "pin-utils"
@@ -2055,9 +1952,9 @@ checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "portable-atomic"
-version = "1.4.2"
+version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f32154ba0af3a075eefa1eda8bb414ee928f62303a54ea85b8d6638ff1a6ee9e"
+checksum = "31114a898e107c51bb1609ffaf55a0e011cf6a4d7f1170d0015a165082c0338b"
[[package]]
name = "ppv-lite86"
@@ -2067,12 +1964,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "prettyplease"
-version = "0.1.25"
+version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86"
+checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d"
dependencies = [
"proc-macro2",
- "syn 1.0.109",
+ "syn 2.0.37",
]
[[package]]
@@ -2101,9 +1998,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328"
dependencies = [
"unicode-ident",
]
@@ -2115,27 +2012,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
dependencies = [
"bytes",
- "prost-derive",
+ "prost-derive 0.11.9",
+]
+
+[[package]]
+name = "prost"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4fdd22f3b9c31b53c060df4a0613a1c7f062d4115a2b984dd15b1858f7e340d"
+dependencies = [
+ "bytes",
+ "prost-derive 0.12.1",
]
[[package]]
name = "prost-build"
-version = "0.11.9"
+version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
+checksum = "8bdf592881d821b83d471f8af290226c8d51402259e9bb5be7f9f8bdebbb11ac"
dependencies = [
"bytes",
"heck",
- "itertools 0.10.5",
- "lazy_static",
+ "itertools 0.11.0",
"log",
"multimap",
+ "once_cell",
"petgraph",
"prettyplease",
- "prost",
+ "prost 0.12.1",
"prost-types",
"regex",
- "syn 1.0.109",
+ "syn 2.0.37",
"tempfile",
"which",
]
@@ -2153,13 +2060,26 @@ dependencies = [
"syn 1.0.109",
]
+[[package]]
+name = "prost-derive"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "265baba7fabd416cf5078179f7d2cbeca4ce7a9041111900675ea7c4cb8a4c32"
+dependencies = [
+ "anyhow",
+ "itertools 0.11.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.37",
+]
+
[[package]]
name = "prost-types"
-version = "0.11.9"
+version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13"
+checksum = "e081b29f63d83a4bc75cfc9f3fe424f9156cf92d8a4f0c9407cce9a1b67327cf"
dependencies = [
- "prost",
+ "prost 0.12.1",
]
[[package]]
@@ -2180,9 +2100,9 @@ dependencies = [
[[package]]
name = "quote"
-version = "1.0.32"
+version = "1.0.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
dependencies = [
"proc-macro2",
]
@@ -2219,13 +2139,17 @@ dependencies = [
[[package]]
name = "ratatui"
-version = "0.20.1"
+version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcc0d032bccba900ee32151ec0265667535c230169f5a011154cdcd984e16829"
+checksum = "2e2e4cd95294a85c3b4446e63ef054eea43e0205b1fd60120c16b74ff7ff96ad"
dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.0",
"cassowary",
"crossterm",
+ "indoc",
+ "itertools 0.11.0",
+ "paste",
+ "strum",
"unicode-segmentation",
"unicode-width",
]
@@ -2241,9 +2165,9 @@ dependencies = [
[[package]]
name = "rayon"
-version = "1.7.0"
+version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
+checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1"
dependencies = [
"either",
"rayon-core",
@@ -2262,14 +2186,12 @@ dependencies = [
[[package]]
name = "rayon-core"
-version = "1.11.0"
+version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
+checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed"
dependencies = [
- "crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
- "num_cpus",
]
[[package]]
@@ -2303,14 +2225,14 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.9.3"
+version = "1.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a"
+checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
dependencies = [
- "aho-corasick 1.0.3",
+ "aho-corasick 1.1.1",
"memchr",
- "regex-automata 0.3.6",
- "regex-syntax 0.7.4",
+ "regex-automata 0.3.8",
+ "regex-syntax 0.7.5",
]
[[package]]
@@ -2324,13 +2246,13 @@ dependencies = [
[[package]]
name = "regex-automata"
-version = "0.3.6"
+version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69"
+checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
dependencies = [
- "aho-corasick 1.0.3",
+ "aho-corasick 1.1.1",
"memchr",
- "regex-syntax 0.7.4",
+ "regex-syntax 0.7.5",
]
[[package]]
@@ -2341,17 +2263,17 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
-version = "0.7.4"
+version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
+checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
[[package]]
name = "reqwest"
-version = "0.11.18"
+version = "0.11.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55"
+checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1"
dependencies = [
- "base64 0.21.2",
+ "base64 0.21.4",
"bytes",
"encoding_rs",
"futures-core",
@@ -2418,7 +2340,7 @@ dependencies = [
"quote",
"rust-embed-utils",
"shellexpand",
- "syn 2.0.28",
+ "syn 2.0.37",
"walkdir",
]
@@ -2449,9 +2371,9 @@ dependencies = [
[[package]]
name = "rustix"
-version = "0.38.8"
+version = "0.38.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f"
+checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f"
dependencies = [
"bitflags 2.4.0",
"errno",
@@ -2462,9 +2384,9 @@ dependencies = [
[[package]]
name = "rustls"
-version = "0.20.8"
+version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f"
+checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99"
dependencies = [
"log",
"ring",
@@ -2472,13 +2394,45 @@ dependencies = [
"webpki",
]
+[[package]]
+name = "rustls"
+version = "0.21.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd8d6c9f025a446bc4d18ad9632e69aec8f287aa84499ee335599fabd20c3fd8"
+dependencies = [
+ "log",
+ "ring",
+ "rustls-webpki 0.101.6",
+ "sct",
+]
+
[[package]]
name = "rustls-pemfile"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2"
dependencies = [
- "base64 0.21.2",
+ "base64 0.21.4",
+]
+
+[[package]]
+name = "rustls-webpki"
+version = "0.100.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f6a5fc258f1c1276dfe3016516945546e2d5383911efc0fc4f1cdc5df3a4ae3"
+dependencies = [
+ "ring",
+ "untrusted",
+]
+
+[[package]]
+name = "rustls-webpki"
+version = "0.101.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c7d5dece342910d9ba34d259310cae3e0154b873b35408b787b59bce53d34fe"
+dependencies = [
+ "ring",
+ "untrusted",
]
[[package]]
@@ -2552,35 +2506,35 @@ dependencies = [
[[package]]
name = "semver"
-version = "1.0.18"
+version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918"
+checksum = "ad977052201c6de01a8ef2aa3378c4bd23217a056337d1d6da40468d267a4fb0"
[[package]]
name = "serde"
-version = "1.0.183"
+version = "1.0.188"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
+checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
-version = "1.0.183"
+version = "1.0.188"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
+checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
name = "serde_json"
-version = "1.0.104"
+version = "1.0.107"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c"
+checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65"
dependencies = [
"itoa",
"ryu",
@@ -2609,22 +2563,11 @@ dependencies = [
"serde",
]
-[[package]]
-name = "sha1"
-version = "0.10.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3"
-dependencies = [
- "cfg-if",
- "cpufeatures",
- "digest",
-]
-
[[package]]
name = "sha2"
-version = "0.10.7"
+version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8"
+checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
dependencies = [
"cfg-if",
"cpufeatures",
@@ -2646,7 +2589,7 @@ version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ccc8076840c4da029af4f87e4e8daeb0fca6b87bbb02e10cb60b791450e11e4"
dependencies = [
- "dirs",
+ "dirs 4.0.0",
]
[[package]]
@@ -2687,9 +2630,9 @@ checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1"
[[package]]
name = "slab"
-version = "0.4.8"
+version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
+checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
dependencies = [
"autocfg",
]
@@ -2705,9 +2648,9 @@ dependencies = [
[[package]]
name = "smallvec"
-version = "1.11.0"
+version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
+checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a"
[[package]]
name = "socket2"
@@ -2721,9 +2664,9 @@ dependencies = [
[[package]]
name = "socket2"
-version = "0.5.3"
+version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
+checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e"
dependencies = [
"libc",
"windows-sys 0.48.0",
@@ -2756,12 +2699,6 @@ dependencies = [
"unicode-segmentation",
]
-[[package]]
-name = "static_assertions"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
-
[[package]]
name = "strsim"
version = "0.10.0"
@@ -2769,10 +2706,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
-name = "subtle"
-version = "2.5.0"
+name = "strum"
+version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
+checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.25.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad8d03b598d3d0fff69bf533ee3ef19b8eeb342729596df84bcc7e1f96ec4059"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn 2.0.37",
+]
[[package]]
name = "syn"
@@ -2787,9 +2740,9 @@ dependencies = [
[[package]]
name = "syn"
-version = "2.0.28"
+version = "2.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
+checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8"
dependencies = [
"proc-macro2",
"quote",
@@ -2804,9 +2757,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
[[package]]
name = "sysinfo"
-version = "0.29.8"
+version = "0.29.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d10ed79c22663a35a255d289a7fdcb43559fc77ff15df5ce6c341809e7867528"
+checksum = "0a18d114d420ada3a891e6bc8e96a2023402203296a47cdd65083377dad18ba5"
dependencies = [
"cfg-if",
"core-foundation-sys",
@@ -2818,9 +2771,9 @@ dependencies = [
[[package]]
name = "tabled"
-version = "0.12.2"
+version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce69a5028cd9576063ec1f48edb2c75339fd835e6094ef3e05b3a079bf594a6"
+checksum = "dfe9c3632da101aba5131ed63f9eed38665f8b3c68703a6bb18124835c1a5d22"
dependencies = [
"papergrid",
"tabled_derive",
@@ -2840,22 +2793,11 @@ dependencies = [
"syn 1.0.109",
]
-[[package]]
-name = "tar"
-version = "0.4.40"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
-dependencies = [
- "filetime",
- "libc",
- "xattr",
-]
-
[[package]]
name = "tempfile"
-version = "3.7.1"
+version = "3.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
+checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
dependencies = [
"cfg-if",
"fastrand",
@@ -2866,12 +2808,13 @@ dependencies = [
[[package]]
name = "text-generation-benchmark"
-version = "1.0.1"
+version = "1.1.0"
dependencies = [
"average",
"clap",
"crossterm",
"float-ord",
+ "hf-hub 0.3.1",
"ratatui",
"serde",
"serde_json",
@@ -2886,15 +2829,15 @@ dependencies = [
[[package]]
name = "text-generation-client"
-version = "1.0.1"
+version = "1.1.0"
dependencies = [
"futures",
"grpc-metadata",
- "prost",
+ "prost 0.12.1",
"prost-build",
"thiserror",
"tokio",
- "tonic 0.9.2",
+ "tonic 0.10.1",
"tonic-build",
"tower",
"tracing",
@@ -2902,7 +2845,7 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
-version = "1.0.1"
+version = "1.1.0"
dependencies = [
"clap",
"ctrlc",
@@ -2918,7 +2861,7 @@ dependencies = [
[[package]]
name = "text-generation-router"
-version = "1.0.1"
+version = "1.1.0"
dependencies = [
"async-stream",
"axum",
@@ -2926,11 +2869,13 @@ dependencies = [
"clap",
"flume",
"futures",
+ "hf-hub 0.3.1",
+ "init-tracing-opentelemetry",
"metrics",
"metrics-exporter-prometheus",
"ngrok",
"nohash-hasher",
- "opentelemetry 0.19.0",
+ "opentelemetry",
"opentelemetry-otlp",
"rand",
"reqwest",
@@ -2940,9 +2885,9 @@ dependencies = [
"thiserror",
"tokenizers",
"tokio",
- "tower-http 0.4.3",
+ "tower-http",
"tracing",
- "tracing-opentelemetry 0.19.0",
+ "tracing-opentelemetry",
"tracing-subscriber",
"utoipa",
"utoipa-swagger-ui",
@@ -2951,22 +2896,22 @@ dependencies = [
[[package]]
name = "thiserror"
-version = "1.0.44"
+version = "1.0.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90"
+checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.44"
+version = "1.0.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
+checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -2981,9 +2926,9 @@ dependencies = [
[[package]]
name = "time"
-version = "0.3.25"
+version = "0.3.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0fdd63d58b18d663fbdf70e049f00a22c8e42be082203be7f26589213cd75ea"
+checksum = "426f806f4089c493dcac0d24c29c01e2c38baf8e30f1b716ee37e83d200b18fe"
dependencies = [
"deranged",
"itoa",
@@ -2996,15 +2941,15 @@ dependencies = [
[[package]]
name = "time-core"
-version = "0.1.1"
+version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
-version = "0.2.11"
+version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb71511c991639bb078fd5bf97757e03914361c48100d52878b8e52b46fb92cd"
+checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20"
dependencies = [
"time-core",
]
@@ -3026,17 +2971,16 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokenizers"
-version = "0.13.4"
+version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aea68938177975ab09da68552b720eac941779ff386baceaf77e0f5f9cea645f"
+checksum = "12b515a66453a4d68f03398054f7204fd0dde6b93d3f20ea90b08025ab49b499"
dependencies = [
"aho-corasick 0.7.20",
- "cached-path",
"clap",
"derive_builder",
- "dirs",
"esaxx-rs",
"getrandom",
+ "hf-hub 0.2.0",
"indicatif 0.15.0",
"itertools 0.9.0",
"lazy_static",
@@ -3049,8 +2993,7 @@ dependencies = [
"rayon",
"rayon-cond",
"regex",
- "regex-syntax 0.7.4",
- "reqwest",
+ "regex-syntax 0.7.5",
"serde",
"serde_json",
"spm_precompiled",
@@ -3062,9 +3005,9 @@ dependencies = [
[[package]]
name = "tokio"
-version = "1.31.0"
+version = "1.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40de3a2ba249dcb097e01be5e67a5ff53cf250397715a071a81543e8a832a920"
+checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9"
dependencies = [
"backtrace",
"bytes",
@@ -3074,7 +3017,7 @@ dependencies = [
"parking_lot",
"pin-project-lite",
"signal-hook-registry",
- "socket2 0.5.3",
+ "socket2 0.5.4",
"tokio-macros",
"windows-sys 0.48.0",
]
@@ -3097,7 +3040,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -3134,9 +3077,9 @@ dependencies = [
[[package]]
name = "tokio-util"
-version = "0.7.8"
+version = "0.7.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d"
dependencies = [
"bytes",
"futures-core",
@@ -3149,14 +3092,13 @@ dependencies = [
[[package]]
name = "tonic"
-version = "0.8.3"
+version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb"
+checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a"
dependencies = [
- "async-stream",
"async-trait",
"axum",
- "base64 0.13.1",
+ "base64 0.21.4",
"bytes",
"futures-core",
"futures-util",
@@ -3167,30 +3109,26 @@ dependencies = [
"hyper-timeout",
"percent-encoding",
"pin-project",
- "prost",
- "prost-derive",
+ "prost 0.11.9",
"tokio",
"tokio-stream",
- "tokio-util",
"tower",
"tower-layer",
"tower-service",
"tracing",
- "tracing-futures",
]
[[package]]
name = "tonic"
-version = "0.9.2"
+version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a"
+checksum = "14c00bc15e49625f3d2f20b17082601e5e17cf27ead69e805174026c194b6664"
dependencies = [
+ "async-stream",
"async-trait",
"axum",
- "base64 0.21.2",
+ "base64 0.21.4",
"bytes",
- "futures-core",
- "futures-util",
"h2",
"http",
"http-body",
@@ -3198,7 +3136,7 @@ dependencies = [
"hyper-timeout",
"percent-encoding",
"pin-project",
- "prost",
+ "prost 0.12.1",
"tokio",
"tokio-stream",
"tower",
@@ -3209,15 +3147,15 @@ dependencies = [
[[package]]
name = "tonic-build"
-version = "0.9.2"
+version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07"
+checksum = "c9d37bb15da06ae9bb945963066baca6561b505af93a52e949a85d28558459a2"
dependencies = [
"prettyplease",
"proc-macro2",
"prost-build",
"quote",
- "syn 1.0.109",
+ "syn 2.0.37",
]
[[package]]
@@ -3242,28 +3180,9 @@ dependencies = [
[[package]]
name = "tower-http"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858"
-dependencies = [
- "bitflags 1.3.2",
- "bytes",
- "futures-core",
- "futures-util",
- "http",
- "http-body",
- "http-range-header",
- "pin-project-lite",
- "tower-layer",
- "tower-service",
- "tracing",
-]
-
-[[package]]
-name = "tower-http"
-version = "0.4.3"
+version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55ae70283aba8d2a8b411c695c437fe25b8b5e44e23e780662002fc72fb47a82"
+checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140"
dependencies = [
"bitflags 2.4.0",
"bytes",
@@ -3310,7 +3229,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -3323,16 +3242,6 @@ dependencies = [
"valuable",
]
-[[package]]
-name = "tracing-futures"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2"
-dependencies = [
- "pin-project",
- "tracing",
-]
-
[[package]]
name = "tracing-log"
version = "0.1.3"
@@ -3346,12 +3255,14 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
-version = "0.18.0"
+version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
+checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8"
dependencies = [
"once_cell",
- "opentelemetry 0.18.0",
+ "opentelemetry",
+ "opentelemetry_sdk",
+ "smallvec",
"tracing",
"tracing-core",
"tracing-log",
@@ -3359,17 +3270,16 @@ dependencies = [
]
[[package]]
-name = "tracing-opentelemetry"
-version = "0.19.0"
+name = "tracing-opentelemetry-instrumentation-sdk"
+version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "752ddd669b14a08036a89045e4ac4497ff7ce254e11386647751f36d7c7849ea"
dependencies = [
- "once_cell",
- "opentelemetry 0.19.0",
+ "http",
+ "opentelemetry-http",
+ "opentelemetry_api",
"tracing",
- "tracing-core",
- "tracing-log",
- "tracing-subscriber",
+ "tracing-opentelemetry",
]
[[package]]
@@ -3411,15 +3321,15 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
[[package]]
name = "typenum"
-version = "1.16.0"
+version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
+checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
[[package]]
name = "unicase"
-version = "2.6.0"
+version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
dependencies = [
"version_check",
]
@@ -3432,9 +3342,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
[[package]]
name = "unicode-ident"
-version = "1.0.11"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "unicode-normalization"
@@ -3462,9 +3372,9 @@ checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
-version = "0.1.10"
+version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "unicode_categories"
@@ -3478,11 +3388,30 @@ version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
+[[package]]
+name = "ureq"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9"
+dependencies = [
+ "base64 0.21.4",
+ "flate2",
+ "log",
+ "native-tls",
+ "once_cell",
+ "rustls 0.21.7",
+ "rustls-webpki 0.100.3",
+ "serde",
+ "serde_json",
+ "url",
+ "webpki-roots",
+]
+
[[package]]
name = "url"
-version = "2.4.0"
+version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb"
+checksum = "143b538f18257fac9cad154828a57c6bf5157e1aa604d4816b5995bf6de87ae5"
dependencies = [
"form_urlencoded",
"idna",
@@ -3503,9 +3432,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "utoipa"
-version = "3.4.4"
+version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de634b7f8178c9c246c88ea251f3a0215c9a4d80778db2d7bd4423a78b5170ec"
+checksum = "d82b1bc5417102a73e8464c686eef947bdfb99fcdfc0a4f228e81afa9526470a"
dependencies = [
"indexmap 2.0.0",
"serde",
@@ -3515,15 +3444,15 @@ dependencies = [
[[package]]
name = "utoipa-gen"
-version = "3.4.5"
+version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fcba79cb3e5020d9bcc8313cd5aadaf51d6d54a6b3fd08c3d0360ae6b3c83d0"
+checksum = "05d96dcd6fc96f3df9b3280ef480770af1b7c5d14bc55192baa9b067976d920c"
dependencies = [
"proc-macro-error",
"proc-macro2",
"quote",
"regex",
- "syn 2.0.28",
+ "syn 2.0.37",
]
[[package]]
@@ -3556,9 +3485,9 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vergen"
-version = "8.2.4"
+version = "8.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbc5ad0d9d26b2c49a5ab7da76c3e79d3ee37e7821799f8223fcb8f2f391a2e7"
+checksum = "85e7dc29b3c54a2ea67ef4f953d5ec0c4085035c0ae2d325be1c0d2144bd9f16"
dependencies = [
"anyhow",
"rustc_version",
@@ -3575,9 +3504,9 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "walkdir"
-version = "2.3.3"
+version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee"
dependencies = [
"same-file",
"winapi-util",
@@ -3619,7 +3548,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
"wasm-bindgen-shared",
]
@@ -3653,7 +3582,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.28",
+ "syn 2.0.37",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@@ -3676,23 +3605,33 @@ dependencies = [
[[package]]
name = "webpki"
-version = "0.22.0"
+version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd"
+checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e"
dependencies = [
"ring",
"untrusted",
]
+[[package]]
+name = "webpki-roots"
+version = "0.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
+dependencies = [
+ "rustls-webpki 0.100.3",
+]
+
[[package]]
name = "which"
-version = "4.4.0"
+version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269"
+checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
- "libc",
+ "home",
"once_cell",
+ "rustix",
]
[[package]]
@@ -3713,9 +3652,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
-version = "0.1.5"
+version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
dependencies = [
"winapi",
]
@@ -3741,7 +3680,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
- "windows-targets 0.48.1",
+ "windows-targets 0.48.5",
]
[[package]]
@@ -3761,17 +3700,17 @@ dependencies = [
[[package]]
name = "windows-targets"
-version = "0.48.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
- "windows_aarch64_gnullvm 0.48.0",
- "windows_aarch64_msvc 0.48.0",
- "windows_i686_gnu 0.48.0",
- "windows_i686_msvc 0.48.0",
- "windows_x86_64_gnu 0.48.0",
- "windows_x86_64_gnullvm 0.48.0",
- "windows_x86_64_msvc 0.48.0",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
]
[[package]]
@@ -3782,9 +3721,9 @@ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_msvc"
@@ -3794,9 +3733,9 @@ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_i686_gnu"
@@ -3806,9 +3745,9 @@ checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
[[package]]
name = "windows_i686_gnu"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_msvc"
@@ -3818,9 +3757,9 @@ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
[[package]]
name = "windows_i686_msvc"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_x86_64_gnu"
@@ -3830,9 +3769,9 @@ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnullvm"
@@ -3842,9 +3781,9 @@ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_msvc"
@@ -3854,26 +3793,18 @@ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.48.0"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "winreg"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d"
-dependencies = [
- "winapi",
-]
-
-[[package]]
-name = "xattr"
-version = "1.0.1"
+version = "0.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4686009f71ff3e5c4dbcf1a282d0a44db3f021ba69350cd42086b3e5f1c6985"
+checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
dependencies = [
- "libc",
+ "cfg-if",
+ "windows-sys 0.48.0",
]
[[package]]
@@ -3882,46 +3813,8 @@ version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
dependencies = [
- "aes",
"byteorder",
- "bzip2",
- "constant_time_eq",
"crc32fast",
"crossbeam-utils",
"flate2",
- "hmac",
- "pbkdf2",
- "sha1",
- "time",
- "zstd",
-]
-
-[[package]]
-name = "zstd"
-version = "0.11.2+zstd.1.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
-dependencies = [
- "zstd-safe",
-]
-
-[[package]]
-name = "zstd-safe"
-version = "5.0.2+zstd.1.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
-dependencies = [
- "libc",
- "zstd-sys",
-]
-
-[[package]]
-name = "zstd-sys"
-version = "2.0.8+zstd.1.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
]
diff --git a/Cargo.toml b/Cargo.toml
index 9f526b270cb..9ca1e6d2cba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,7 @@ members = [
]
[workspace.package]
-version = "1.0.3"
+version = "1.1.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
diff --git a/Dockerfile b/Dockerfile
index 45e304c4f33..9c15f023e3d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -111,22 +111,29 @@ RUN make build-flash-attention-v2
# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder
-
WORKDIR /usr/src
-
COPY server/exllama_kernels/ .
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+# Build Transformers awq kernels
+FROM kernel-builder as awq-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-awq Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+# Build eetq kernels
+FROM kernel-builder as eetq-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-eetq Makefile
# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
-
WORKDIR /usr/src
-
COPY server/custom_kernels/ .
-
# Build specific version of transformers
RUN python setup.py build
@@ -158,6 +165,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
libssl-dev \
ca-certificates \
make \
+ curl \
&& rm -rf /var/lib/apt/lists/*
# Copy conda with PyTorch installed
@@ -175,6 +183,10 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
diff --git a/README.md b/README.md
index de1ca05e3ec..7785b1c6f0d 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,16 @@
-
![image](https://github.com/huggingface/text-generation-inference/assets/3841370/38ba1531-ea0d-4851-b31a-a6d4ddc944b0)
-
# Text Generation Inference
-
-
A Rust, Python and gRPC server for text generation inference. Used in production at [HuggingFace](https://huggingface.co)
to power Hugging Chat, the Inference API and Inference Endpoint.
-
-
## Table of contents
-
- [Get Started](#get-started)
- [API Documentation](#api-documentation)
- [Using a private or gated model](#using-a-private-or-gated-model)
@@ -31,9 +24,7 @@ to power Hugging Chat, the Inference API and Inference Endpoint.
- [Quantization](#quantization)
- [Develop](#develop)
- [Testing](#testing)
-
Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:
-
- Simple launcher to serve most popular LLMs
- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
- Tensor Parallelism for faster inference on multiple GPUs
@@ -48,89 +39,75 @@ Text Generation Inference (TGI) is a toolkit for deploying and serving Large Lan
- Log probabilities
- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance
-
-
## Get Started
-
+### Docker
For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:
-
```shell
model=tiiuae/falcon-7b-instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
```
-
And then you can make requests like
-
```bash
curl 127.0.0.1:8080/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
-
**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`. Please note that CPU is not the intended platform for this project, so performance might be subpar.
-
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
text-generation-launcher --help
```
-
### API documentation
-
-You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The
-Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
-
+You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
+The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
+### Using a private or gated model
+You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+`text-generation-inference`. This allows you to gain access to protected resources.
+For example, if you want to serve the gated Llama V2 model variants:
+1. Go to https://huggingface.co/settings/tokens
+2. Copy your cli READ token
+3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+or with Docker:
+```shell
+model=meta-llama/Llama-2-7b-chat-hf
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+token=<your cli READ token>
+docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
+```
### A note on Shared Memory (shm)
-
[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
`PyTorch` to do distributed training/inference. `text-generation-inference` makes
use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
-
In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
peer-to-peer using NVLink or PCI is not possible.
-
To allow the container to use 1G of Shared Memory and support SHM sharing, we add `--shm-size 1g` on the above command.
-
If you are running `text-generation-inference` inside `Kubernetes`, you can also add Shared Memory to the container by
creating a volume with:
-
```yaml
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
```
-
and mounting it to `/dev/shm`.
-
Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
this will impact performance.
-
### Distributed Tracing
-
`text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
-
### Local install
-
You can also opt to install `text-generation-inference` locally.
-
First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda`:
-
```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
-
conda create -n text-generation-inference python=3.9
conda activate text-generation-inference
```
-
You may also need to install Protoc.
-
On Linux:
-
```shell
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
@@ -138,74 +115,46 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```
-
On MacOS, using Homebrew:
-
```shell
brew install protobuf
```
-
Then run:
-
```shell
BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
make run-falcon-7b-instruct
```
-
**Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
-
```shell
sudo apt-get install libssl-dev gcc -y
```
-
### CUDA Kernels
-
The custom CUDA kernels are only tested on NVIDIA A100s. If you have any installation or runtime issues, you can remove
the kernels by using the `DISABLE_CUSTOM_KERNELS=True` environment variable.
-
Be aware that the official Docker image has them enabled by default.
-
## Optimized architectures
-
TGI works out of the box to serve optimized models in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).
-
Other architectures are supported on a best-effort basis using:
-
`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
-
or
-
`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
-
-
-
## Run Falcon
-
### Run
-
```shell
make run-falcon-7b-instruct
```
-
### Quantization
-
You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
-
```shell
make run-falcon-7b-instruct-quantize
```
-
4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
-
## Develop
-
```shell
make server-dev
make router-dev
```
-
## Testing
-
```shell
# python
make python-server-tests
@@ -216,4 +165,4 @@ make python-tests
make rust-tests
# integration tests
make integration-tests
-```
+```
\ No newline at end of file
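
For context on the "Optimized architectures" fallback mentioned in the README hunk above, here is a minimal sketch of what `device_map="auto"` loading looks like with plain Transformers. It is only an illustration: the model id `bigscience/bloom-560m` is a stand-in for whatever checkpoint you serve, and the snippet assumes `transformers` and `accelerate` are installed.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; substitute any causal-LM model id from the Hub.
model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" asks accelerate to place the weights across the available devices.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("What is Deep Learning?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
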
diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml
index 67e04f0a791..2dd2e64dd07 100644
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@@ -14,18 +14,19 @@ name = "text-generation-benchmark"
path = "src/main.rs"
[dependencies]
-average = "0.13"
-clap = { version = "4.1.4", features = ["derive", "env"] }
-crossterm = "0.26"
+average = "0.14"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+crossterm = "0.27"
float-ord = "0.3.2"
-serde = {version = "1.0.142", features = ["derive"]}
+serde = {version = "1.0.188", features = ["derive"]}
serde_json = "1.0"
-tabled = "0.12.0"
+tabled = "0.14.0"
text-generation-client = { path = "../router/client" }
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
+thiserror = "1.0.48"
+tokenizers = { version = "0.14.0", features = ["http"] }
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
+tui = {package = "ratatui", version = "0.23", default-features = false, features = ["crossterm"]}
tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+hf-hub = "0.3.1"
diff --git a/benchmark/src/app.rs b/benchmark/src/app.rs
index 6a9881fbd50..49654c1b099 100644
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@@ -6,7 +6,7 @@ use tokio::sync::mpsc;
use tui::backend::Backend;
use tui::layout::{Alignment, Constraint, Direction, Layout};
use tui::style::{Color, Modifier, Style};
-use tui::text::{Span, Spans};
+use tui::text::{Line, Span};
use tui::widgets::{
Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
};
@@ -244,7 +244,7 @@ impl App {
.batch_size
.iter()
.map(|b| {
- Spans::from(vec![Span::styled(
+ Line::from(vec![Span::styled(
format!("Batch: {b}"),
Style::default().fg(Color::White),
)])
@@ -468,7 +468,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
// Latency p50/p90/p99 texts
let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
for (i, (name, value)) in latency_percentiles.iter().enumerate() {
- let span = Spans::from(vec![Span::styled(
+ let span = Line::from(vec![Span::styled(
format!("{name}: {value:.2} ms"),
Style::default().fg(colors[i]),
)]);
@@ -483,16 +483,16 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
}
/// Average/High/Low spans
-fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
+fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Line<'a>> {
vec![
- Spans::from(vec![Span::styled(
+ Line::from(vec![Span::styled(
format!(
"Average: {:.2} {unit}",
data.iter().sum::<f64>() / data.len() as f64
),
Style::default().fg(Color::LightBlue),
)]),
- Spans::from(vec![Span::styled(
+ Line::from(vec![Span::styled(
format!(
"Lowest: {:.2} {unit}",
data.iter()
@@ -501,7 +501,7 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
),
Style::default().fg(Color::Reset),
)]),
- Spans::from(vec![Span::styled(
+ Line::from(vec![Span::styled(
format!(
"Highest: {:.2} {unit}",
data.iter()
diff --git a/benchmark/src/lib.rs b/benchmark/src/lib.rs
index 433c6f67f1f..1875652c0b0 100644
--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@@ -33,7 +33,7 @@ pub async fn run(
watermark: bool,
do_sample: bool,
client: ShardedClient,
-) -> Result<(), crossterm::ErrorKind> {
+) -> Result<(), std::io::Error> {
let parameters = NextTokenChooserParameters {
temperature: temperature.unwrap_or(1.0),
top_k: top_k.unwrap_or(0),
diff --git a/clients/python/README.md b/clients/python/README.md
index 4e0e564cbbc..82f3ee0c446 100644
--- a/clients/python/README.md
+++ b/clients/python/README.md
@@ -140,6 +140,8 @@ class Parameters:
watermark: bool
# Get decoder input token logprobs and ids
decoder_input_details: bool
+ # Return the N most likely tokens at each step
+ top_n_tokens: Optional[int]
# Decoder input tokens
class InputToken:
@@ -189,6 +191,8 @@ class BestOfSequence:
prefill: List[InputToken]
# Generated tokens
tokens: List[Token]
+ # Most likely tokens
+ top_tokens: Optional[List[List[Token]]]
# `generate` details
@@ -203,6 +207,8 @@ class Details:
prefill: List[InputToken]
# Generated tokens
tokens: List[Token]
+ # Most likely tokens
+ top_tokens: Optional[List[List[Token]]]
# Additional sequences when using the `best_of` parameter
best_of_sequences: Optional[List[BestOfSequence]]
@@ -229,6 +235,8 @@ class StreamDetails:
class StreamResponse:
# Generated token
token: Token
+ # Most likely tokens
+ top_tokens: Optional[List[Token]]
# Complete generated text
# Only available when the generation is finished
generated_text: Optional[str]
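
The new `top_n_tokens` parameter and `top_tokens` fields documented above can be exercised end to end with the Python client. A minimal sketch, assuming a TGI server is already listening on 127.0.0.1:8080; the prompt and `top_n_tokens=5` are arbitrary example values:

```python
from text_generation import Client

client = Client("http://127.0.0.1:8080")

response = client.generate(
    "What is Deep Learning?",
    max_new_tokens=20,
    top_n_tokens=5,  # request the 5 most likely tokens at every generation step
)
print(response.generated_text)

# details.top_tokens holds one list of candidate Tokens per generated position;
# the per-token equivalent on StreamResponse is populated when streaming.
if response.details.top_tokens:
    for position, candidates in enumerate(response.details.top_tokens):
        print(position, [(token.text, token.logprob) for token in candidates])
```

When `top_n_tokens` is left unset, the `top_tokens` fields stay empty, so existing callers are unaffected.
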
diff --git a/clients/python/poetry.lock b/clients/python/poetry.lock
index e038ad9b478..2d4e45d2517 100644
--- a/clients/python/poetry.lock
+++ b/clients/python/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -124,6 +124,20 @@ files = [
[package.dependencies]
frozenlist = ">=1.1.0"
+[[package]]
+name = "annotated-types"
+version = "0.5.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "annotated_types-0.5.0-py3-none-any.whl", hash = "sha256:58da39888f92c276ad970249761ebea80ba544b77acddaa1a4d6cf78287d45fd"},
+ {file = "annotated_types-0.5.0.tar.gz", hash = "sha256:47cdc3490d9ac1506ce92c7aaa76c579dc3509ff11e098fc867e5130ab7be802"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""}
+
[[package]]
name = "async-timeout"
version = "4.0.3"
@@ -693,55 +707,140 @@ files = [
[[package]]
name = "pydantic"
-version = "1.10.12"
-description = "Data validation and settings management using python type hints"
+version = "2.4.2"
+description = "Data validation using Python type hints"
optional = false
python-versions = ">=3.7"
files = [
- {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"},
- {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"},
- {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"},
- {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"},
- {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"},
- {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"},
- {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"},
- {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"},
- {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"},
- {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"},
- {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"},
- {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"},
- {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"},
- {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"},
- {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"},
- {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"},
- {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"},
- {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"},
- {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"},
- {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"},
- {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"},
- {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"},
- {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"},
- {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"},
- {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"},
- {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"},
- {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"},
- {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"},
- {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"},
- {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"},
- {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"},
- {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"},
- {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"},
- {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"},
- {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"},
- {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"},
+ {file = "pydantic-2.4.2-py3-none-any.whl", hash = "sha256:bc3ddf669d234f4220e6e1c4d96b061abe0998185a8d7855c0126782b7abc8c1"},
+ {file = "pydantic-2.4.2.tar.gz", hash = "sha256:94f336138093a5d7f426aac732dcfe7ab4eb4da243c88f891d65deb4a2556ee7"},
]
[package.dependencies]
-typing-extensions = ">=4.2.0"
+annotated-types = ">=0.4.0"
+pydantic-core = "2.10.1"
+typing-extensions = ">=4.6.1"
[package.extras]
-dotenv = ["python-dotenv (>=0.10.4)"]
-email = ["email-validator (>=1.0.3)"]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.10.1"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "pydantic_core-2.10.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:d64728ee14e667ba27c66314b7d880b8eeb050e58ffc5fec3b7a109f8cddbd63"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:48525933fea744a3e7464c19bfede85df4aba79ce90c60b94d8b6e1eddd67096"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef337945bbd76cce390d1b2496ccf9f90b1c1242a3a7bc242ca4a9fc5993427a"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1392e0638af203cee360495fd2cfdd6054711f2db5175b6e9c3c461b76f5175"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0675ba5d22de54d07bccde38997e780044dcfa9a71aac9fd7d4d7a1d2e3e65f7"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:128552af70a64660f21cb0eb4876cbdadf1a1f9d5de820fed6421fa8de07c893"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f6e6aed5818c264412ac0598b581a002a9f050cb2637a84979859e70197aa9e"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ecaac27da855b8d73f92123e5f03612b04c5632fd0a476e469dfc47cd37d6b2e"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b3c01c2fb081fced3bbb3da78510693dc7121bb893a1f0f5f4b48013201f362e"},
+ {file = "pydantic_core-2.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:92f675fefa977625105708492850bcbc1182bfc3e997f8eecb866d1927c98ae6"},
+ {file = "pydantic_core-2.10.1-cp310-none-win32.whl", hash = "sha256:420a692b547736a8d8703c39ea935ab5d8f0d2573f8f123b0a294e49a73f214b"},
+ {file = "pydantic_core-2.10.1-cp310-none-win_amd64.whl", hash = "sha256:0880e239827b4b5b3e2ce05e6b766a7414e5f5aedc4523be6b68cfbc7f61c5d0"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:073d4a470b195d2b2245d0343569aac7e979d3a0dcce6c7d2af6d8a920ad0bea"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:600d04a7b342363058b9190d4e929a8e2e715c5682a70cc37d5ded1e0dd370b4"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39215d809470f4c8d1881758575b2abfb80174a9e8daf8f33b1d4379357e417c"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eeb3d3d6b399ffe55f9a04e09e635554012f1980696d6b0aca3e6cf42a17a03b"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a7902bf75779bc12ccfc508bfb7a4c47063f748ea3de87135d433a4cca7a2f"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3625578b6010c65964d177626fde80cf60d7f2e297d56b925cb5cdeda6e9925a"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:caa48fc31fc7243e50188197b5f0c4228956f97b954f76da157aae7f67269ae8"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:07ec6d7d929ae9c68f716195ce15e745b3e8fa122fc67698ac6498d802ed0fa4"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e6f31a17acede6a8cd1ae2d123ce04d8cca74056c9d456075f4f6f85de055607"},
+ {file = "pydantic_core-2.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8f1ebca515a03e5654f88411420fea6380fc841d1bea08effb28184e3d4899f"},
+ {file = "pydantic_core-2.10.1-cp311-none-win32.whl", hash = "sha256:6db2eb9654a85ada248afa5a6db5ff1cf0f7b16043a6b070adc4a5be68c716d6"},
+ {file = "pydantic_core-2.10.1-cp311-none-win_amd64.whl", hash = "sha256:4a5be350f922430997f240d25f8219f93b0c81e15f7b30b868b2fddfc2d05f27"},
+ {file = "pydantic_core-2.10.1-cp311-none-win_arm64.whl", hash = "sha256:5fdb39f67c779b183b0c853cd6b45f7db84b84e0571b3ef1c89cdb1dfc367325"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:b1f22a9ab44de5f082216270552aa54259db20189e68fc12484873d926426921"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8572cadbf4cfa95fb4187775b5ade2eaa93511f07947b38f4cd67cf10783b118"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db9a28c063c7c00844ae42a80203eb6d2d6bbb97070cfa00194dff40e6f545ab"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e2a35baa428181cb2270a15864ec6286822d3576f2ed0f4cd7f0c1708472aff"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05560ab976012bf40f25d5225a58bfa649bb897b87192a36c6fef1ab132540d7"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d6495008733c7521a89422d7a68efa0a0122c99a5861f06020ef5b1f51f9ba7c"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ac492c686defc8e6133e3a2d9eaf5261b3df26b8ae97450c1647286750b901"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8282bab177a9a3081fd3d0a0175a07a1e2bfb7fcbbd949519ea0980f8a07144d"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:aafdb89fdeb5fe165043896817eccd6434aee124d5ee9b354f92cd574ba5e78f"},
+ {file = "pydantic_core-2.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f6defd966ca3b187ec6c366604e9296f585021d922e666b99c47e78738b5666c"},
+ {file = "pydantic_core-2.10.1-cp312-none-win32.whl", hash = "sha256:7c4d1894fe112b0864c1fa75dffa045720a194b227bed12f4be7f6045b25209f"},
+ {file = "pydantic_core-2.10.1-cp312-none-win_amd64.whl", hash = "sha256:5994985da903d0b8a08e4935c46ed8daf5be1cf217489e673910951dc533d430"},
+ {file = "pydantic_core-2.10.1-cp312-none-win_arm64.whl", hash = "sha256:0d8a8adef23d86d8eceed3e32e9cca8879c7481c183f84ed1a8edc7df073af94"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:9badf8d45171d92387410b04639d73811b785b5161ecadabf056ea14d62d4ede"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:ebedb45b9feb7258fac0a268a3f6bec0a2ea4d9558f3d6f813f02ff3a6dc6698"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfe1090245c078720d250d19cb05d67e21a9cd7c257698ef139bc41cf6c27b4f"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e357571bb0efd65fd55f18db0a2fb0ed89d0bb1d41d906b138f088933ae618bb"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b3dcd587b69bbf54fc04ca157c2323b8911033e827fffaecf0cafa5a892a0904"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c120c9ce3b163b985a3b966bb701114beb1da4b0468b9b236fc754783d85aa3"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15d6bca84ffc966cc9976b09a18cf9543ed4d4ecbd97e7086f9ce9327ea48891"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5cabb9710f09d5d2e9e2748c3e3e20d991a4c5f96ed8f1132518f54ab2967221"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:82f55187a5bebae7d81d35b1e9aaea5e169d44819789837cdd4720d768c55d15"},
+ {file = "pydantic_core-2.10.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1d40f55222b233e98e3921df7811c27567f0e1a4411b93d4c5c0f4ce131bc42f"},
+ {file = "pydantic_core-2.10.1-cp37-none-win32.whl", hash = "sha256:14e09ff0b8fe6e46b93d36a878f6e4a3a98ba5303c76bb8e716f4878a3bee92c"},
+ {file = "pydantic_core-2.10.1-cp37-none-win_amd64.whl", hash = "sha256:1396e81b83516b9d5c9e26a924fa69164156c148c717131f54f586485ac3c15e"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:6835451b57c1b467b95ffb03a38bb75b52fb4dc2762bb1d9dbed8de31ea7d0fc"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b00bc4619f60c853556b35f83731bd817f989cba3e97dc792bb8c97941b8053a"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fa467fd300a6f046bdb248d40cd015b21b7576c168a6bb20aa22e595c8ffcdd"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d99277877daf2efe074eae6338453a4ed54a2d93fb4678ddfe1209a0c93a2468"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa7db7558607afeccb33c0e4bf1c9a9a835e26599e76af6fe2fcea45904083a6"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aad7bd686363d1ce4ee930ad39f14e1673248373f4a9d74d2b9554f06199fb58"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:443fed67d33aa85357464f297e3d26e570267d1af6fef1c21ca50921d2976302"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:042462d8d6ba707fd3ce9649e7bf268633a41018d6a998fb5fbacb7e928a183e"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ecdbde46235f3d560b18be0cb706c8e8ad1b965e5c13bbba7450c86064e96561"},
+ {file = "pydantic_core-2.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ed550ed05540c03f0e69e6d74ad58d026de61b9eaebebbaaf8873e585cbb18de"},
+ {file = "pydantic_core-2.10.1-cp38-none-win32.whl", hash = "sha256:8cdbbd92154db2fec4ec973d45c565e767ddc20aa6dbaf50142676484cbff8ee"},
+ {file = "pydantic_core-2.10.1-cp38-none-win_amd64.whl", hash = "sha256:9f6f3e2598604956480f6c8aa24a3384dbf6509fe995d97f6ca6103bb8c2534e"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:655f8f4c8d6a5963c9a0687793da37b9b681d9ad06f29438a3b2326d4e6b7970"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e570ffeb2170e116a5b17e83f19911020ac79d19c96f320cbfa1fa96b470185b"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64322bfa13e44c6c30c518729ef08fda6026b96d5c0be724b3c4ae4da939f875"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:485a91abe3a07c3a8d1e082ba29254eea3e2bb13cbbd4351ea4e5a21912cc9b0"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7c2b8eb9fc872e68b46eeaf835e86bccc3a58ba57d0eedc109cbb14177be531"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5cb87bdc2e5f620693148b5f8f842d293cae46c5f15a1b1bf7ceeed324a740c"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25bd966103890ccfa028841a8f30cebcf5875eeac8c4bde4fe221364c92f0c9a"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f323306d0556351735b54acbf82904fe30a27b6a7147153cbe6e19aaaa2aa429"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0c27f38dc4fbf07b358b2bc90edf35e82d1703e22ff2efa4af4ad5de1b3833e7"},
+ {file = "pydantic_core-2.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f1365e032a477c1430cfe0cf2856679529a2331426f8081172c4a74186f1d595"},
+ {file = "pydantic_core-2.10.1-cp39-none-win32.whl", hash = "sha256:a1c311fd06ab3b10805abb72109f01a134019739bd3286b8ae1bc2fc4e50c07a"},
+ {file = "pydantic_core-2.10.1-cp39-none-win_amd64.whl", hash = "sha256:ae8a8843b11dc0b03b57b52793e391f0122e740de3df1474814c700d2622950a"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d43002441932f9a9ea5d6f9efaa2e21458221a3a4b417a14027a1d530201ef1b"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fcb83175cc4936a5425dde3356f079ae03c0802bbdf8ff82c035f8a54b333521"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:962ed72424bf1f72334e2f1e61b68f16c0e596f024ca7ac5daf229f7c26e4208"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cf5bb4dd67f20f3bbc1209ef572a259027c49e5ff694fa56bed62959b41e1f9"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e544246b859f17373bed915182ab841b80849ed9cf23f1f07b73b7c58baee5fb"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c0877239307b7e69d025b73774e88e86ce82f6ba6adf98f41069d5b0b78bd1bf"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:53df009d1e1ba40f696f8995683e067e3967101d4bb4ea6f667931b7d4a01357"},
+ {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a1254357f7e4c82e77c348dabf2d55f1d14d19d91ff025004775e70a6ef40ada"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:524ff0ca3baea164d6d93a32c58ac79eca9f6cf713586fdc0adb66a8cdeab96a"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f0ac9fb8608dbc6eaf17956bf623c9119b4db7dbb511650910a82e261e6600f"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:320f14bd4542a04ab23747ff2c8a778bde727158b606e2661349557f0770711e"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:63974d168b6233b4ed6a0046296803cb13c56637a7b8106564ab575926572a55"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:417243bf599ba1f1fef2bb8c543ceb918676954734e2dcb82bf162ae9d7bd514"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dda81e5ec82485155a19d9624cfcca9be88a405e2857354e5b089c2a982144b2"},
+ {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:14cfbb00959259e15d684505263d5a21732b31248a5dd4941f73a3be233865b9"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:631cb7415225954fdcc2a024119101946793e5923f6c4d73a5914d27eb3d3a05"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:bec7dd208a4182e99c5b6c501ce0b1f49de2802448d4056091f8e630b28e9a52"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:149b8a07712f45b332faee1a2258d8ef1fb4a36f88c0c17cb687f205c5dc6e7d"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d966c47f9dd73c2d32a809d2be529112d509321c5310ebf54076812e6ecd884"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7eb037106f5c6b3b0b864ad226b0b7ab58157124161d48e4b30c4a43fef8bc4b"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:154ea7c52e32dce13065dbb20a4a6f0cc012b4f667ac90d648d36b12007fa9f7"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e562617a45b5a9da5be4abe72b971d4f00bf8555eb29bb91ec2ef2be348cd132"},
+ {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f23b55eb5464468f9e0e9a9935ce3ed2a870608d5f534025cd5536bca25b1402"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:e9121b4009339b0f751955baf4543a0bfd6bc3f8188f8056b1a25a2d45099934"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0523aeb76e03f753b58be33b26540880bac5aa54422e4462404c432230543f33"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e0e2959ef5d5b8dc9ef21e1a305a21a36e254e6a34432d00c72a92fdc5ecda5"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da01bec0a26befab4898ed83b362993c844b9a607a86add78604186297eb047e"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f2e9072d71c1f6cfc79a36d4484c82823c560e6f5599c43c1ca6b5cdbd54f881"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f36a3489d9e28fe4b67be9992a23029c3cec0babc3bd9afb39f49844a8c721c5"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f64f82cc3443149292b32387086d02a6c7fb39b8781563e0ca7b8d7d9cf72bd7"},
+ {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b4a6db486ac8e99ae696e09efc8b2b9fea67b63c8f88ba7a1a16c24a057a0776"},
+ {file = "pydantic_core-2.10.1.tar.gz", hash = "sha256:0f8682dbdd2f67f8e1edddcbffcc29f60a6182b4901c367fc8c1c40d30bb0a82"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pytest"
@@ -816,6 +915,7 @@ files = [
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+ {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@@ -823,8 +923,15 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+ {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+ {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+ {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+ {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+ {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+ {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+ {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@@ -841,6 +948,7 @@ files = [
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+ {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@@ -848,6 +956,7 @@ files = [
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+ {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@@ -929,13 +1038,13 @@ files = [
[[package]]
name = "urllib3"
-version = "2.0.4"
+version = "2.0.5"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.7"
files = [
- {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"},
- {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"},
+ {file = "urllib3-2.0.5-py3-none-any.whl", hash = "sha256:ef16afa8ba34a1f989db38e1dbbe0c302e4289a47856990d0682e374563ce35e"},
+ {file = "urllib3-2.0.5.tar.gz", hash = "sha256:13abf37382ea2ce6fb744d4dad67838eec857c9f4f57009891805e0b5e123594"},
]
[package.extras]
@@ -1050,4 +1159,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
[metadata]
lock-version = "2.0"
python-versions = "^3.7"
-content-hash = "0db2f97d52c557dd7f90c55b4ad5bbe308c957c5f7f99fec53c57e0a13822cb4"
+content-hash = "b7fab8703967f2616ea59a98a437cd30f97f0c8d2a06e399d688814a2a2c64f8"
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 915ac7aa187..4fe6e8b0cb6 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation"
-version = "0.6.0"
+version = "0.6.1"
description = "Hugging Face Text Generation Python Client"
license = "Apache-2.0"
authors = ["Olivier Dehaene "]
diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py
index 015613c23f7..0bf80f8ced7 100644
--- a/clients/python/text_generation/client.py
+++ b/clients/python/text_generation/client.py
@@ -137,7 +137,7 @@ def generate(
typical_p=typical_p,
watermark=watermark,
decoder_input_details=decoder_input_details,
- top_n_tokens=top_n_tokens
+ top_n_tokens=top_n_tokens,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
@@ -482,7 +482,6 @@ async def generate_stream(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(self.base_url, json=request.dict()) as resp:
-
if resp.status != 200:
raise parse_error(resp.status, await resp.json())
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 20083b19b11..aa02d8d8c94 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -40,7 +40,7 @@ class Parameters(BaseModel):
# Get decoder input token logprobs and ids
decoder_input_details: bool = False
# Return the N most likely tokens at each step
- top_n_tokens: Optional[int]
+ top_n_tokens: Optional[int] = None
@validator("best_of")
def valid_best_of(cls, field_value, values):
@@ -133,7 +133,9 @@ def valid_best_of_stream(cls, field_value, values):
and parameters.best_of > 1
and field_value
):
- raise ValidationError("`best_of` != 1 is not supported when `stream` == True")
+ raise ValidationError(
+ "`best_of` != 1 is not supported when `stream` == True"
+ )
return field_value
@@ -186,7 +188,7 @@ class BestOfSequence(BaseModel):
# Generated tokens
tokens: List[Token]
# Most likely tokens
- top_tokens: Optional[List[List[Token]]]
+ top_tokens: Optional[List[List[Token]]] = None
# `generate` details
@@ -202,7 +204,7 @@ class Details(BaseModel):
# Generated tokens
tokens: List[Token]
# Most likely tokens
- top_tokens: Optional[List[List[Token]]]
+ top_tokens: Optional[List[List[Token]]] = None
# Additional sequences when using the `best_of` parameter
best_of_sequences: Optional[List[BestOfSequence]] = None
@@ -230,7 +232,7 @@ class StreamResponse(BaseModel):
# Generated token
token: Token
# Most likely tokens
- top_tokens: Optional[List[Token]]
+ top_tokens: Optional[List[Token]] = None
# Complete generated text
# Only available when the generation is finished
generated_text: Optional[str] = None
diff --git a/docs/openapi.json b/docs/openapi.json
index 23c4f198fde..4a1ab6dd8d4 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
- "version": "1.0.3"
+ "version": "1.1.0"
},
"paths": {
"/": {
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 6a8baaf6567..6fa50a6a541 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -17,10 +17,22 @@
title: Serving Private & Gated Models
- local: basic_tutorials/using_cli
title: Using TGI CLI
+ - local: basic_tutorials/launcher
+ title: All TGI CLI options
+ - local: basic_tutorials/non_core_models
+ title: Non-core Model Serving
title: Tutorials
- sections:
- local: conceptual/streaming
title: Streaming
+ - local: conceptual/quantization
+ title: Quantization
+ - local: conceptual/tensor_parallelism
+ title: Tensor Parallelism
+ - local: conceptual/paged_attention
+ title: PagedAttention
+ - local: conceptual/safetensors
+ title: Safetensors
- local: conceptual/flash_attention
title: Flash Attention
title: Conceptual Guides
diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md
index 08c76de2090..827f6f4fff0 100644
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HUGGING_FACE_HUB_TOKEN=$token \
-p 8080:80 \
- -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.1 \
+ -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
--model-id $model
```
diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/basic_tutorials/launcher.md
new file mode 100644
index 00000000000..bdb8cb731be
--- /dev/null
+++ b/docs/source/basic_tutorials/launcher.md
@@ -0,0 +1,247 @@
+# Text-generation-launcher arguments
+
+
+
+```
+Text Generation Launcher
+
+Usage: text-generation-launcher [OPTIONS]
+
+Options:
+ --model-id
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+
+ [env: MODEL_ID=]
+ [default: bigscience/bloom-560m]
+
+ --revision
+ The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+
+ [env: REVISION=]
+
+ --validation-workers
+ The number of tokenizer workers used for payload validation and truncation inside the router
+
+ [env: VALIDATION_WORKERS=]
+ [default: 2]
+
+ --sharded
+          Whether to shard the model across multiple GPUs. By default text-generation-inference will use all available GPUs to run the model. Setting it to `false` deactivates `num_shard`
+
+ [env: SHARDED=]
+ [possible values: true, false]
+
+ --num-shard
+          The number of shards to use if you don't want to use all GPUs on a given machine. You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2` and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to launch 2 copies with 2 shards each on a given machine with 4 GPUs for instance
+
+ [env: NUM_SHARD=]
+
+ --quantize
+ Whether you want the model to be quantized
+
+ [env: QUANTIZE=]
+
+ Possible values:
+          - awq: 4 bit quantization. Requires a specific AWQ quantized model: https://hf.co/models?search=awq. Should replace GPTQ models wherever possible because of the better latency
+ - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from https://github.com/NetEase-FuXi/EETQ.git
+          - gptq: 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq. text-generation-inference will use exllama (faster) kernels wherever possible, and use the triton kernel (wider support) when it's not. AWQ has faster kernels
+ - bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
+ - bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
+          - bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
+
+ --dtype
+ The dtype to be forced upon the model. This option cannot be used with `--quantize`
+
+ [env: DTYPE=]
+ [possible values: float16, bfloat16]
+
+ --trust-remote-code
+ Whether you want to execute hub modelling code. Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision
+
+ [env: TRUST_REMOTE_CODE=]
+
+ --max-concurrent-requests
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse client requests instead of having them wait for too long and is usually good to handle backpressure correctly
+
+ [env: MAX_CONCURRENT_REQUESTS=]
+ [default: 128]
+
+ --max-best-of
+          This is the maximum allowed value for clients to set `best_of`. Best of makes `n` generations at the same time, and returns the best in terms of overall log probability over the entire generated sequence
+
+ [env: MAX_BEST_OF=]
+ [default: 2]
+
+ --max-stop-sequences
+ This is the maximum allowed value for clients to set `stop_sequences`. Stop sequences are used to allow the model to stop on more than just the EOS token, and enable more complex "prompting" where users can preprompt the model in a specific way and define their "own" stop token aligned with their prompt
+
+ [env: MAX_STOP_SEQUENCES=]
+ [default: 4]
+
+ --max-top-n-tokens
+          This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like classification or ranking
+
+ [env: MAX_TOP_N_TOKENS=]
+ [default: 5]
+
+ --max-input-length
+          This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompts users can send, which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequences they can handle
+
+ [env: MAX_INPUT_LENGTH=]
+ [default: 1024]
+
+ --max-total-tokens
+          This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be
+
+ [env: MAX_TOTAL_TOKENS=]
+ [default: 2048]
+
+ --waiting-served-ratio
+          This represents the ratio of waiting queries vs running queries where you want to start considering pausing the running queries to include the waiting ones into the same batch. `waiting_served_ratio=1.2` means that when 12 queries are waiting and there are only 10 queries left in the current batch, we check if we can fit those 12 waiting queries into the batching strategy, and if yes, then batching happens delaying the 10 running queries by a `prefill` run.
+
+ This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.
+
+ [env: WAITING_SERVED_RATIO=]
+ [default: 1.2]
+
+ --max-batch-prefill-tokens
+          Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is interesting to limit the number of requests that can be sent
+
+ [env: MAX_BATCH_PREFILL_TOKENS=]
+ [default: 4096]
+
+ --max-batch-total-tokens
+ **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+
+          This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent to `batch_size` * `max_total_tokens`.
+
+ However in the non-padded (flash attention) version this can be much finer.
+
+ For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+
+ Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference cannot infer this number automatically.
+
+ [env: MAX_BATCH_TOTAL_TOKENS=]
+
+ --max-waiting-tokens
+ This setting defines how many tokens can be passed before forcing the waiting queries to be put on the batch (if the size of the batch allows for it). New queries require 1 `prefill` forward, which is different from `decode` and therefore you need to pause the running batch in order to run `prefill` to create the correct values for the waiting queries to be able to join the batch.
+
+ With a value too small, queries will always "steal" the compute to run `prefill` and running queries will be delayed by a lot.
+
+ With a value too big, waiting queries could wait for a very long time before being allowed a slot in the running batch. If your server is busy that means that requests that could run in ~2s on an empty server could end up running in ~20s because the query had to wait for 18s.
+
+ This number is expressed in number of tokens to make it a bit more "model" agnostic, but what should really matter is the overall latency for end users.
+
+ [env: MAX_WAITING_TOKENS=]
+ [default: 20]
+
+ --hostname
+ The IP address to listen on
+
+ [env: HOSTNAME=]
+ [default: 0.0.0.0]
+
+ -p, --port
+ The port to listen on
+
+ [env: PORT=]
+ [default: 3000]
+
+ --shard-uds-path
+ The name of the socket for gRPC communication between the webserver and the shards
+
+ [env: SHARD_UDS_PATH=]
+ [default: /tmp/text-generation-server]
+
+ --master-addr
+ The address the master shard will listen on. (setting used by torch distributed)
+
+ [env: MASTER_ADDR=]
+ [default: localhost]
+
+ --master-port
+ The address the master port will listen on. (setting used by torch distributed)
+
+ [env: MASTER_PORT=]
+ [default: 29500]
+
+ --huggingface-hub-cache
+ The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+
+ [env: HUGGINGFACE_HUB_CACHE=]
+
+ --weights-cache-override
+ The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+
+ [env: WEIGHTS_CACHE_OVERRIDE=]
+
+ --disable-custom-kernels
+ For some models (like bloom), text-generation-inference implemented custom cuda kernels to speed up inference. Those kernels were only tested on A100. Use this flag to disable them if you're running on different hardware and encounter issues
+
+ [env: DISABLE_CUSTOM_KERNELS=]
+
+ --cuda-memory-fraction
+ Limit the CUDA available memory. The allowed value equals the total visible memory multiplied by cuda-memory-fraction
+
+ [env: CUDA_MEMORY_FRACTION=]
+ [default: 1.0]
+
+ --rope-scaling
+          Rope scaling will only be used for RoPE models and allows rescaling the position rotary to accommodate larger prompts.
+
+ Goes together with `rope_factor`.
+
+ `--rope-factor 2.0` gives linear scaling with a factor of 2.0 `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0 `--rope-scaling linear` gives linear scaling with a factor of 1.0 (Nothing will be changed basically)
+
+ `--rope-scaling linear --rope-factor` fully describes the scaling you want
+
+ [env: ROPE_SCALING=]
+ [possible values: linear, dynamic]
+
+ --rope-factor
+          Rope scaling will only be used for RoPE models. See `rope_scaling`
+
+ [env: ROPE_FACTOR=]
+
+ --json-output
+ Outputs the logs in JSON format (useful for telemetry)
+
+ [env: JSON_OUTPUT=]
+
+ --otlp-endpoint
+ [env: OTLP_ENDPOINT=]
+
+ --cors-allow-origin
+ [env: CORS_ALLOW_ORIGIN=]
+
+ --watermark-gamma
+ [env: WATERMARK_GAMMA=]
+
+ --watermark-delta
+ [env: WATERMARK_DELTA=]
+
+ --ngrok
+ Enable ngrok tunneling
+
+ [env: NGROK=]
+
+ --ngrok-authtoken
+ ngrok authentication token
+
+ [env: NGROK_AUTHTOKEN=]
+
+ --ngrok-edge
+ ngrok edge
+
+ [env: NGROK_EDGE=]
+
+ -e, --env
+ Display a lot of information about your runtime environment
+
+ -h, --help
+ Print help (see a summary with '-h')
+
+ -V, --version
+ Print version
+
+```
\ No newline at end of file
diff --git a/docs/source/basic_tutorials/non_core_models.md b/docs/source/basic_tutorials/non_core_models.md
new file mode 100644
index 00000000000..6f2e6cfa2f8
--- /dev/null
+++ b/docs/source/basic_tutorials/non_core_models.md
@@ -0,0 +1,24 @@
+# Non-core Model Serving
+
+TGI supports various LLM architectures (see full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fall back to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
+
+You can serve these models using the same Docker command-line invocation as with fully supported models 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
+```
+
+If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
+```
+
+Finally, if the model is not on the Hugging Face Hub but stored locally, you can pass the path to the folder that contains your model like below 👇
+
+```bash
+# Make sure your model is in the $volume directory
+docker run --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id /data/<PATH-TO-FOLDER>
+```
+
+You can refer to [transformers docs on custom models](https://huggingface.co/docs/transformers/main/en/custom_models) for more information.
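+
+Once the server is running, you can query it from Python with the `text-generation` client that lives in this repository (`pip install text-generation`). This is a minimal sketch assuming the server is reachable at `127.0.0.1:8080`, as in the commands above:
+
+```python
+from text_generation import Client
+
+# The port matches the `-p 8080:80` mapping used in the docker commands above.
+client = Client("http://127.0.0.1:8080")
+
+# Non-core models still benefit from continuous batching for plain generation...
+response = client.generate("What is deep learning?", max_new_tokens=20)
+print(response.generated_text)
+
+# ...as well as token streaming.
+for token in client.generate_stream("What is deep learning?", max_new_tokens=20):
+    if not token.token.special:
+        print(token.token.text, end="")
+```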
diff --git a/docs/source/basic_tutorials/preparing_model.md b/docs/source/basic_tutorials/preparing_model.md
index 65a2a1973d5..0f5739ea854 100644
--- a/docs/source/basic_tutorials/preparing_model.md
+++ b/docs/source/basic_tutorials/preparing_model.md
@@ -4,7 +4,7 @@ Text Generation Inference improves the model in several aspects.
## Quantization
-TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes` or `gptq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq).
+TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set the `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of [these models](https://huggingface.co/models?search=gptq); when using AWQ quantization, you need to point to one of [these models](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to the [quantization guide](./conceptual/quantization.md).
## RoPE Scaling
diff --git a/docs/source/conceptual/paged_attention.md b/docs/source/conceptual/paged_attention.md
new file mode 100644
index 00000000000..3fb2dcd86e5
--- /dev/null
+++ b/docs/source/conceptual/paged_attention.md
@@ -0,0 +1,9 @@
+# PagedAttention
+
+LLMs struggle with memory limitations during generation. In the decoding part of generation, all the attention keys and values generated for previous tokens are stored in GPU memory for reuse. This is called _KV cache_, and it may take up a large amount of memory for large models and long sequences.
+
+PagedAttention attempts to optimize memory use by partitioning the KV cache into blocks that are accessed through a lookup table. Thus, the KV cache does not need to be stored in contiguous memory, and blocks are allocated as needed. The memory efficiency can increase GPU utilization on memory-bound workloads, so more inference batches can be supported.
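+
+To make the idea concrete, here is a small, illustrative Python sketch of a block table over a shared KV pool. This is not TGI's actual implementation (which lives in custom CUDA kernels); the block and pool sizes below are arbitrary:
+
+```python
+import torch
+
+BLOCK_SIZE = 16        # tokens per KV block (illustrative value)
+NUM_BLOCKS = 1024      # size of the shared block pool
+N_HEADS, HEAD_DIM = 8, 64
+
+# One global pool of KV blocks; a sequence's cache no longer needs contiguous memory.
+kv_pool = torch.zeros(NUM_BLOCKS, 2, N_HEADS, BLOCK_SIZE, HEAD_DIM)
+free_blocks = list(range(NUM_BLOCKS))
+
+def append_kv(block_table, seq_len, k, v):
+    """Store the key/value vectors of one new token, allocating a block only when needed."""
+    if seq_len % BLOCK_SIZE == 0:              # current block is full (or this is the first token)
+        block_table.append(free_blocks.pop())
+    block, offset = block_table[seq_len // BLOCK_SIZE], seq_len % BLOCK_SIZE
+    kv_pool[block, 0, :, offset] = k
+    kv_pool[block, 1, :, offset] = v
+
+# Each sequence only keeps a small lookup table of block indices; parallel sampling
+# can share prompt blocks by copying table entries instead of copying memory.
+block_table = []
+for t in range(40):
+    append_kv(block_table, t, torch.randn(N_HEADS, HEAD_DIM), torch.randn(N_HEADS, HEAD_DIM))
+```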
+
+The use of a lookup table to access the memory blocks can also help with KV sharing across multiple generations. This is helpful for techniques such as _parallel sampling_, where multiple outputs are generated simultaneously for the same prompt. In this case, the cached KV blocks can be shared among the generations.
+
+TGI's PagedAttention implementation leverages the custom CUDA kernels developed by the [vLLM Project](https://github.com/vllm-project/vllm). You can learn more about this technique on the [project's page](https://vllm.ai/).
diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md
new file mode 100644
index 00000000000..1a44e3c2b21
--- /dev/null
+++ b/docs/source/conceptual/quantization.md
@@ -0,0 +1,59 @@
+# Quantization
+
+TGI offers GPTQ and bits-and-bytes quantization to quantize large language models.
+
+## Quantization with GPTQ
+
+GPTQ is a post-training quantization method to make the model smaller. It quantizes each layer by finding a compressed version of its weights that yields the minimum mean squared error, as shown below 👇
+
+Given a layer \\(l\\) with weight matrix \\(W_{l}\\) and layer input \\(X_{l}\\), find quantized weight \\(\\hat{W}_{l}\\):
+
+$$\hat{W}_{l}^{*} = \underset{\hat{W}_{l}}{\mathrm{argmin}} \; \lVert W_{l}X_{l}-\hat{W}_{l}X_{l} \rVert^{2}_{2}$$
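+
+As a toy illustration of this objective (using naive round-to-nearest instead of the actual GPTQ solver), the reconstruction error being minimized can be computed as follows:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(16, 64))    # weight matrix of one layer
+X = rng.normal(size=(64, 128))   # calibration inputs seen by that layer
+
+def quantize_rtn(w, bits=4):
+    """Naive round-to-nearest quantization to 2**bits levels."""
+    scale = (w.max() - w.min()) / (2**bits - 1)
+    return np.round((w - w.min()) / scale) * scale + w.min()
+
+W_hat = quantize_rtn(W)
+# GPTQ searches for the W_hat that minimizes this layer-wise reconstruction error.
+error = np.linalg.norm(W @ X - W_hat @ X) ** 2
+print(f"||WX - W_hat X||^2 = {error:.2f}")
+```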
+
+
+TGI allows you to either run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using the quantization script. You can run a quantized model by simply passing `--quantize gptq` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
+```
+
+Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
+
+To quantize a given model using GPTQ with a calibration dataset, simply run
+
+```bash
+text-generation-server quantize tiiuae/falcon-40b /data/falcon-40b-gptq
+# Add --upload-to-model-id MYUSERNAME/falcon-40b to push the created model to the hub directly
+```
+
+This will create a new directory with the quantized files which you can use with,
+
+```bash
+text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-shard 2 --quantize gptq
+```
+
+You can learn more about the quantization options by running `text-generation-server quantize --help`.
+
+If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
+You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
+
+## Quantization with bitsandbytes
+
+bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
+
+8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
+In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes
+```
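+
+For reference, the same on-the-fly 8-bit quantization can be reproduced outside of TGI with `transformers`; this is a minimal sketch assuming `bitsandbytes` and `accelerate` are installed, using the launcher's default model as an example:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "bigscience/bloom-560m"  # any causal LM works; bitsandbytes needs no calibration
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Weights are quantized to 8-bit while loading, similar to what `--quantize bitsandbytes` does.
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+
+inputs = tokenizer("What is deep learning?", return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
+```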
+
+4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
+
+In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4
+```
+
+You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
diff --git a/docs/source/conceptual/safetensors.md b/docs/source/conceptual/safetensors.md
new file mode 100644
index 00000000000..fcc31bac83d
--- /dev/null
+++ b/docs/source/conceptual/safetensors.md
@@ -0,0 +1,7 @@
+# Safetensors
+
+Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer compared to other serialization formats like pickle (which is used under the hood in many deep learning libraries).
+
+TGI depends on the safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). For a given model repository, TGI first looks for safetensors weights during serving. If there are no safetensors weights, TGI converts the PyTorch weights to the safetensors format.
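+
+For illustration, this is roughly what such a conversion looks like with the `safetensors` library (a minimal sketch with dummy tensors standing in for a real `pytorch_model.bin` checkpoint):
+
+```python
+import torch
+from safetensors.torch import load_file, save_file
+
+# Stand-in for a PyTorch state dict loaded from a `pytorch_model.bin` checkpoint.
+state_dict = {"embed.weight": torch.randn(10, 4), "lm_head.weight": torch.randn(4, 10)}
+
+# Safetensors stores tensors in a flat, safe format; tensors must be contiguous.
+save_file({k: v.contiguous() for k, v in state_dict.items()}, "model.safetensors")
+
+weights = load_file("model.safetensors")
+print(list(weights.keys()))
+```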
+
+You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).
\ No newline at end of file
diff --git a/docs/source/conceptual/tensor_parallelism.md b/docs/source/conceptual/tensor_parallelism.md
new file mode 100644
index 00000000000..886a349af7c
--- /dev/null
+++ b/docs/source/conceptual/tensor_parallelism.md
@@ -0,0 +1,14 @@
+# Tensor Parallelism
+
+Tensor parallelism is a technique used to fit a large model across multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇
+
+![Image courtesy of Anton Lozkhov](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png)
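+
+The equivalence described above can be checked in a few lines of NumPy; this toy sketch uses two array shards to stand in for two GPUs:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=(1, 8))   # input activations
+W = rng.normal(size=(8, 6))   # first weight matrix of the layer
+
+# Split the weight column-wise across two "devices".
+W0, W1 = np.hsplit(W, 2)
+
+# Each device multiplies the full input with its shard; outputs are then concatenated.
+y_parallel = np.concatenate([x @ W0, x @ W1], axis=1)
+y_single = x @ W
+
+assert np.allclose(y_parallel, y_single)
+```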
+
+
+
+
+Tensor Parallelism only works for [models officially supported](../supported_models); it will not work when falling back to `transformers`. You can get more information about unsupported models [here](../basic_tutorials/non_core_models).
+
+
+
+You can learn more about tensor parallelism in [the `transformers` docs](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism).
diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md
index b91e77cbdff..0a874b57b7d 100644
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@@ -8,7 +8,7 @@ Let's say you want to deploy [Falcon-7B Instruct](https://huggingface.co/tiiuae/
model=tiiuae/falcon-7b-instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
```
@@ -85,7 +85,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
-docker run ghcr.io/huggingface/text-generation-inference:1.0.3 --help
+docker run ghcr.io/huggingface/text-generation-inference:1.1.0 --help
```
diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
index fe2a2a3a56f..8b4c33b12ee 100644
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -18,7 +18,8 @@ The following models are optimized and can be served with TGI, which uses custom
- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
-- [Codellama](https://huggingface.co/codellama)
+- [Code Llama](https://huggingface.co/codellama)
+- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:
@@ -29,6 +30,12 @@ AutoModelForCausalLM.from_pretrained(, device_map="auto")`
 AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")
```
+If you wish to serve a supported model that already exists in a local folder, just point to the local folder.
+
+```bash
+text-generation-launcher --model-id <PATH-TO-LOCAL-FOLDER>
+```
+
## Supported Hardware
@@ -38,4 +45,3 @@ TGI is also supported on the following AI hardware accelerators:
- *Habana first-gen Gaudi and Gaudi2:* check out this [example](https://github.com/huggingface/optimum-habana/tree/main/text-generation-inference) how to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
-
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
new file mode 100644
index 00000000000..dcd37cb992b
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
@@ -0,0 +1,104 @@
+{
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.703125,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4765625,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8583984,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7548828,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9306641,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4550781,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.5732422,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5761719,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5888672,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.026504517,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4287109,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15856934,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62646484,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
new file mode 100644
index 00000000000..d16d34f9be0
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
@@ -0,0 +1,99 @@
+{
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 338,
+ "logprob": -9.0859375,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -10.90625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -2.65625,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -4.8085938,
+ "text": "?"
+ }
+ ],
+ "seed": 0,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -0.19958496,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 4013,
+ "logprob": -2.203125,
+ "special": false,
+ "text": "This"
+ },
+ {
+ "id": 1139,
+ "logprob": -0.23693848,
+ "special": false,
+ "text": " question"
+ },
+ {
+ "id": 756,
+ "logprob": 0.0,
+ "special": false,
+ "text": " has"
+ },
+ {
+ "id": 1063,
+ "logprob": -0.076538086,
+ "special": false,
+ "text": " been"
+ },
+ {
+ "id": 4433,
+ "logprob": 0.0,
+ "special": false,
+ "text": " asked"
+ },
+ {
+ "id": 1784,
+ "logprob": -1.1367188,
+ "special": false,
+ "text": " many"
+ },
+ {
+ "id": 3064,
+ "logprob": 0.0,
+ "special": false,
+ "text": " times"
+ },
+ {
+ "id": 322,
+ "logprob": -1.7460938,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 306,
+ "logprob": 0.0,
+ "special": false,
+ "text": " I"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "What is Deep Learning?\nThis question has been asked many times and I"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
new file mode 100644
index 00000000000..e6fb3dc0017
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
@@ -0,0 +1,418 @@
+[
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.703125,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4765625,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8652344,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7548828,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9306641,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4550781,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.5732422,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5761719,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5888672,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.026504517,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4287109,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15856934,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62646484,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.703125,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4765625,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8583984,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7548828,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9306641,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4550781,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.5732422,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5761719,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5888672,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.026504517,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4287109,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15856934,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62646484,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.703125,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4765625,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8652344,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7548828,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9306641,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4550781,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.5732422,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5761719,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5888672,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.026504517,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4287109,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15856934,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62646484,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.703125,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4765625,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8652344,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7548828,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9306641,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4550781,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.5732422,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5761719,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5888672,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.026504517,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4287109,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15856934,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62646484,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
new file mode 100644
index 00000000000..f1d9129d004
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
@@ -0,0 +1,418 @@
+[
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.6914062,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4746094,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8623047,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7558594,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9228516,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4609375,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.57177734,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5722656,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5859375,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.02633667,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4335938,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15991211,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62060547,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.6914062,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4746094,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8623047,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7558594,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9228516,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4609375,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.57177734,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5722656,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5859375,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.02633667,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4335938,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15991211,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62060547,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.6914062,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4746094,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8623047,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7558594,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9228516,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4609375,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.57177734,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5722656,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5859375,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.02633667,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4335938,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15991211,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62060547,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.6914062,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4746094,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8623047,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7558594,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9228516,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4609375,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.57177734,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5722656,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5859375,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.02633667,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4335938,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.15991211,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17456055,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62060547,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+ }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
new file mode 100644
index 00000000000..0f91eb3608e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
@@ -0,0 +1,104 @@
+{
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 1724,
+ "logprob": -7.6914062,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -1.4746094,
+ "text": "is"
+ },
+ {
+ "id": 21784,
+ "logprob": -9.390625,
+ "text": "Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -1.8623047,
+ "text": "Learning"
+ },
+ {
+ "id": 29973,
+ "logprob": -0.7558594,
+ "text": "?"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 13,
+ "logprob": -1.9228516,
+ "special": false,
+ "text": "\n"
+ },
+ {
+ "id": 5618,
+ "logprob": -2.4609375,
+ "special": false,
+ "text": "What"
+ },
+ {
+ "id": 338,
+ "logprob": -0.57177734,
+ "special": false,
+ "text": " is"
+ },
+ {
+ "id": 278,
+ "logprob": -1.5722656,
+ "special": false,
+ "text": " the"
+ },
+ {
+ "id": 4328,
+ "logprob": -1.5927734,
+ "special": false,
+ "text": " difference"
+ },
+ {
+ "id": 1546,
+ "logprob": -0.026428223,
+ "special": false,
+ "text": " between"
+ },
+ {
+ "id": 21784,
+ "logprob": -1.4267578,
+ "special": false,
+ "text": " Deep"
+ },
+ {
+ "id": 29257,
+ "logprob": -0.16015625,
+ "special": false,
+ "text": " Learning"
+ },
+ {
+ "id": 322,
+ "logprob": -0.17382812,
+ "special": false,
+ "text": " and"
+ },
+ {
+ "id": 6189,
+ "logprob": -0.62060547,
+ "special": false,
+ "text": " Machine"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json
index 49bc996c195..a7f7d2f0bdb 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json
@@ -16,7 +16,7 @@
},
{
"id": 2009,
- "logprob": -11.5546875,
+ "logprob": -11.546875,
"text": "request"
}
],
@@ -24,65 +24,66 @@
"tokens": [
{
"id": 363,
- "logprob": -1.5380859,
+ "logprob": -1.5351562,
"special": false,
"text": " for"
},
{
"id": 847,
- "logprob": -2.5917969,
+ "logprob": -2.5722656,
"special": false,
"text": " /"
},
{
"id": 2754,
- "logprob": -2.2773438,
+ "logprob": -2.2714844,
"special": false,
"text": "api"
},
{
"id": 29914,
- "logprob": -0.034362793,
+ "logprob": -0.03414917,
"special": false,
"text": "/"
},
{
"id": 29894,
- "logprob": -0.96533203,
+ "logprob": -0.95996094,
"special": false,
"text": "v"
},
{
"id": 29896,
- "logprob": -0.36669922,
+ "logprob": -0.3635254,
"special": false,
"text": "1"
},
{
"id": 29914,
- "logprob": -0.013122559,
+ "logprob": -0.013031006,
"special": false,
"text": "/"
},
{
"id": 16418,
- "logprob": -3.1503906,
+ "logprob": -3.1523438,
"special": false,
"text": "projects"
},
{
"id": 29914,
- "logprob": -0.43652344,
+ "logprob": -0.43701172,
"special": false,
"text": "/"
},
{
"id": 29896,
- "logprob": -1.9404297,
+ "logprob": -1.9394531,
"special": false,
"text": "1"
}
- ]
+ ],
+ "top_tokens": null
},
- "generated_text": "for /api/v1/projects/1"
+ "generated_text": " for /api/v1/projects/1"
}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
index 5be2870da8f..9f145377725 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
@@ -16,7 +16,7 @@
},
{
"id": 2009,
- "logprob": -11.5546875,
+ "logprob": -11.546875,
"text": "request"
}
],
@@ -24,19 +24,19 @@
"tokens": [
{
"id": 5229,
- "logprob": -2.5683594,
+ "logprob": -2.5839844,
"special": false,
"text": " failed"
},
{
"id": 29901,
- "logprob": -0.45336914,
+ "logprob": -0.44970703,
"special": false,
"text": ":"
},
{
"id": 4829,
- "logprob": -1.8408203,
+ "logprob": -1.8339844,
"special": false,
"text": " Error"
},
@@ -52,7 +52,8 @@
"special": false,
"text": " test"
}
- ]
+ ],
+ "top_tokens": null
},
- "generated_text": "Test requestfailed: Error in test"
+ "generated_text": "Test request failed: Error in test"
}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
index 9bbb5322576..3543dad2353 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
@@ -17,7 +17,7 @@
},
{
"id": 2009,
- "logprob": -11.5546875,
+ "logprob": -11.546875,
"text": "request"
}
],
@@ -25,25 +25,25 @@
"tokens": [
{
"id": 363,
- "logprob": -1.5380859,
+ "logprob": -1.5351562,
"special": false,
"text": " for"
},
{
"id": 847,
- "logprob": -2.5859375,
+ "logprob": -2.5566406,
"special": false,
"text": " /"
},
{
"id": 2754,
- "logprob": -2.2695312,
+ "logprob": -2.2519531,
"special": false,
"text": "api"
},
{
"id": 29914,
- "logprob": -0.03439331,
+ "logprob": -0.03414917,
"special": false,
"text": "/"
},
@@ -55,13 +55,13 @@
},
{
"id": 29896,
- "logprob": -0.36694336,
+ "logprob": -0.3647461,
"special": false,
"text": "1"
},
{
"id": 29914,
- "logprob": -0.013114929,
+ "logprob": -0.012901306,
"special": false,
"text": "/"
},
@@ -73,19 +73,20 @@
},
{
"id": 29914,
- "logprob": -0.43847656,
+ "logprob": -0.4362793,
"special": false,
"text": "/"
},
{
"id": 29896,
- "logprob": -1.9433594,
+ "logprob": -1.9394531,
"special": false,
"text": "1"
}
- ]
+ ],
+ "top_tokens": null
},
- "generated_text": "for /api/v1/projects/1"
+ "generated_text": " for /api/v1/projects/1"
},
{
"details": {
@@ -105,7 +106,7 @@
},
{
"id": 2009,
- "logprob": -11.5546875,
+ "logprob": -11.546875,
"text": "request"
}
],
@@ -113,43 +114,43 @@
"tokens": [
{
"id": 363,
- "logprob": -1.5322266,
+ "logprob": -1.5332031,
"special": false,
"text": " for"
},
{
"id": 847,
- "logprob": -2.5585938,
+ "logprob": -2.5625,
"special": false,
"text": " /"
},
{
"id": 2754,
- "logprob": -2.265625,
+ "logprob": -2.2617188,
"special": false,
"text": "api"
},
{
"id": 29914,
- "logprob": -0.034088135,
+ "logprob": -0.033996582,
"special": false,
"text": "/"
},
{
"id": 29894,
- "logprob": -0.96240234,
+ "logprob": -0.9609375,
"special": false,
"text": "v"
},
{
"id": 29896,
- "logprob": -0.36816406,
+ "logprob": -0.36572266,
"special": false,
"text": "1"
},
{
"id": 29914,
- "logprob": -0.013191223,
+ "logprob": -0.0129776,
"special": false,
"text": "/"
},
@@ -161,19 +162,20 @@
},
{
"id": 29914,
- "logprob": -0.43774414,
+ "logprob": -0.4362793,
"special": false,
"text": "/"
},
{
"id": 29896,
- "logprob": -1.9443359,
+ "logprob": -1.9394531,
"special": false,
"text": "1"
}
- ]
+ ],
+ "top_tokens": null
},
- "generated_text": "for /api/v1/projects/1"
+ "generated_text": " for /api/v1/projects/1"
},
{
"details": {
@@ -193,7 +195,7 @@
},
{
"id": 2009,
- "logprob": -11.5546875,
+ "logprob": -11.546875,
"text": "request"
}
],
@@ -201,43 +203,43 @@
"tokens": [
{
"id": 363,
- "logprob": -1.5322266,
+ "logprob": -1.5332031,
"special": false,
"text": " for"
},
{
"id": 847,
- "logprob": -2.5585938,
+ "logprob": -2.5625,
"special": false,
"text": " /"
},
{
"id": 2754,
- "logprob": -2.265625,
+ "logprob": -2.2617188,
"special": false,
"text": "api"
},
{
"id": 29914,
- "logprob": -0.034088135,
+ "logprob": -0.033996582,
"special": false,
"text": "/"
},
{
"id": 29894,
- "logprob": -0.96240234,
+ "logprob": -0.9609375,
"special": false,
"text": "v"
},
{
"id": 29896,
- "logprob": -0.36816406,
+ "logprob": -0.36572266,
"special": false,
"text": "1"
},
{
"id": 29914,
- "logprob": -0.013191223,
+ "logprob": -0.0129776,
"special": false,
"text": "/"
},
@@ -249,19 +251,20 @@
},
{
"id": 29914,
- "logprob": -0.43774414,
+ "logprob": -0.4362793,
"special": false,
"text": "/"
},
{
"id": 29896,
- "logprob": -1.9443359,
+ "logprob": -1.9394531,
"special": false,
"text": "1"
}
- ]
+ ],
+ "top_tokens": null
},
- "generated_text": "for /api/v1/projects/1"
+ "generated_text": " for /api/v1/projects/1"
},
{
"details": {
@@ -281,7 +284,7 @@
},
{
"id": 2009,
- "logprob": -11.5546875,
+ "logprob": -11.546875,
"text": "request"
}
],
@@ -289,43 +292,43 @@
"tokens": [
{
"id": 363,
- "logprob": -1.5322266,
+ "logprob": -1.5332031,
"special": false,
"text": " for"
},
{
"id": 847,
- "logprob": -2.5585938,
+ "logprob": -2.5625,
"special": false,
"text": " /"
},
{
"id": 2754,
- "logprob": -2.265625,
+ "logprob": -2.2617188,
"special": false,
"text": "api"
},
{
"id": 29914,
- "logprob": -0.034088135,
+ "logprob": -0.033996582,
"special": false,
"text": "/"
},
{
"id": 29894,
- "logprob": -0.96240234,
+ "logprob": -0.9609375,
"special": false,
"text": "v"
},
{
"id": 29896,
- "logprob": -0.36816406,
+ "logprob": -0.36572266,
"special": false,
"text": "1"
},
{
"id": 29914,
- "logprob": -0.013191223,
+ "logprob": -0.0129776,
"special": false,
"text": "/"
},
@@ -337,18 +340,19 @@
},
{
"id": 29914,
- "logprob": -0.43774414,
+ "logprob": -0.4362793,
"special": false,
"text": "/"
},
{
"id": 29896,
- "logprob": -1.9443359,
+ "logprob": -1.9394531,
"special": false,
"text": "1"
}
- ]
+ ],
+ "top_tokens": null
},
- "generated_text": "for /api/v1/projects/1"
+ "generated_text": " for /api/v1/projects/1"
}
]
diff --git a/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json
new file mode 100644
index 00000000000..4e7de9a67f5
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json
@@ -0,0 +1,89 @@
+{
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 3735,
+ "logprob": -12.9140625,
+ "text": "Test"
+ },
+ {
+ "id": 2159,
+ "logprob": -10.7578125,
+ "text": "request"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 28747,
+ "logprob": -0.54785156,
+ "special": false,
+ "text": ":"
+ },
+ {
+ "id": 3169,
+ "logprob": -1.4091797,
+ "special": false,
+ "text": " Let"
+ },
+ {
+ "id": 307,
+ "logprob": -3.0273438,
+ "special": false,
+ "text": " n"
+ },
+ {
+ "id": 327,
+ "logprob": -0.94433594,
+ "special": false,
+ "text": " ="
+ },
+ {
+ "id": 28705,
+ "logprob": -0.81347656,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.2958984,
+ "special": false,
+ "text": "1"
+ },
+ {
+ "id": 28734,
+ "logprob": -2.0644531,
+ "special": false,
+ "text": "0"
+ },
+ {
+ "id": 387,
+ "logprob": -1.9580078,
+ "special": false,
+ "text": " -"
+ },
+ {
+ "id": 28705,
+ "logprob": -0.5073242,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.1816406,
+ "special": false,
+ "text": "1"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": ": Let n = 10 - 1"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json
new file mode 100644
index 00000000000..c0dc6471892
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json
@@ -0,0 +1,89 @@
+{
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 3735,
+ "logprob": -12.9140625,
+ "text": "Test"
+ },
+ {
+ "id": 2159,
+ "logprob": -10.7578125,
+ "text": "request"
+ }
+ ],
+ "seed": 0,
+ "tokens": [
+ {
+ "id": 28747,
+ "logprob": 0.0,
+ "special": false,
+ "text": ":"
+ },
+ {
+ "id": 3169,
+ "logprob": -0.1307373,
+ "special": false,
+ "text": " Let"
+ },
+ {
+ "id": 332,
+ "logprob": -2.3359375,
+ "special": false,
+ "text": " u"
+ },
+ {
+ "id": 347,
+ "logprob": 0.0,
+ "special": false,
+ "text": " be"
+ },
+ {
+ "id": 325,
+ "logprob": -1.0234375,
+ "special": false,
+ "text": " ("
+ },
+ {
+ "id": 28734,
+ "logprob": -2.0292969,
+ "special": false,
+ "text": "0"
+ },
+ {
+ "id": 648,
+ "logprob": -1.0439453,
+ "special": false,
+ "text": " +"
+ },
+ {
+ "id": 28705,
+ "logprob": -0.24499512,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28770,
+ "logprob": -0.5073242,
+ "special": false,
+ "text": "3"
+ },
+ {
+ "id": 387,
+ "logprob": -1.5507812,
+ "special": false,
+ "text": " -"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": "Test request: Let u be (0 + 3 -"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json
new file mode 100644
index 00000000000..9d133077da6
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json
@@ -0,0 +1,358 @@
+[
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 3735,
+ "logprob": -12.9140625,
+ "text": "Test"
+ },
+ {
+ "id": 2159,
+ "logprob": -10.7578125,
+ "text": "request"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 28747,
+ "logprob": -0.55078125,
+ "special": false,
+ "text": ":"
+ },
+ {
+ "id": 3169,
+ "logprob": -1.4140625,
+ "special": false,
+ "text": " Let"
+ },
+ {
+ "id": 307,
+ "logprob": -3.0273438,
+ "special": false,
+ "text": " n"
+ },
+ {
+ "id": 327,
+ "logprob": -0.94140625,
+ "special": false,
+ "text": " ="
+ },
+ {
+ "id": 28705,
+ "logprob": -0.8173828,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.2978516,
+ "special": false,
+ "text": "1"
+ },
+ {
+ "id": 28734,
+ "logprob": -2.0664062,
+ "special": false,
+ "text": "0"
+ },
+ {
+ "id": 387,
+ "logprob": -1.9560547,
+ "special": false,
+ "text": " -"
+ },
+ {
+ "id": 28705,
+ "logprob": -0.5078125,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.1787109,
+ "special": false,
+ "text": "1"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": ": Let n = 10 - 1"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 3735,
+ "logprob": -12.9140625,
+ "text": "Test"
+ },
+ {
+ "id": 2159,
+ "logprob": -10.7578125,
+ "text": "request"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 28747,
+ "logprob": -0.54785156,
+ "special": false,
+ "text": ":"
+ },
+ {
+ "id": 3169,
+ "logprob": -1.4111328,
+ "special": false,
+ "text": " Let"
+ },
+ {
+ "id": 307,
+ "logprob": -3.0292969,
+ "special": false,
+ "text": " n"
+ },
+ {
+ "id": 327,
+ "logprob": -0.94433594,
+ "special": false,
+ "text": " ="
+ },
+ {
+ "id": 28705,
+ "logprob": -0.8178711,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.2939453,
+ "special": false,
+ "text": "1"
+ },
+ {
+ "id": 28734,
+ "logprob": -2.0644531,
+ "special": false,
+ "text": "0"
+ },
+ {
+ "id": 387,
+ "logprob": -1.9550781,
+ "special": false,
+ "text": " -"
+ },
+ {
+ "id": 28705,
+ "logprob": -0.5078125,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.1796875,
+ "special": false,
+ "text": "1"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": ": Let n = 10 - 1"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 3735,
+ "logprob": -12.9140625,
+ "text": "Test"
+ },
+ {
+ "id": 2159,
+ "logprob": -10.7578125,
+ "text": "request"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 28747,
+ "logprob": -0.55078125,
+ "special": false,
+ "text": ":"
+ },
+ {
+ "id": 3169,
+ "logprob": -1.4140625,
+ "special": false,
+ "text": " Let"
+ },
+ {
+ "id": 307,
+ "logprob": -3.0273438,
+ "special": false,
+ "text": " n"
+ },
+ {
+ "id": 327,
+ "logprob": -0.94140625,
+ "special": false,
+ "text": " ="
+ },
+ {
+ "id": 28705,
+ "logprob": -0.8173828,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.2978516,
+ "special": false,
+ "text": "1"
+ },
+ {
+ "id": 28734,
+ "logprob": -2.0664062,
+ "special": false,
+ "text": "0"
+ },
+ {
+ "id": 387,
+ "logprob": -1.9560547,
+ "special": false,
+ "text": " -"
+ },
+ {
+ "id": 28705,
+ "logprob": -0.5078125,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.1787109,
+ "special": false,
+ "text": "1"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": ": Let n = 10 - 1"
+ },
+ {
+ "details": {
+ "best_of_sequences": null,
+ "finish_reason": "length",
+ "generated_tokens": 10,
+ "prefill": [
+ {
+ "id": 1,
+ "logprob": null,
+ "text": ""
+ },
+ {
+ "id": 3735,
+ "logprob": -12.9140625,
+ "text": "Test"
+ },
+ {
+ "id": 2159,
+ "logprob": -10.7578125,
+ "text": "request"
+ }
+ ],
+ "seed": null,
+ "tokens": [
+ {
+ "id": 28747,
+ "logprob": -0.55078125,
+ "special": false,
+ "text": ":"
+ },
+ {
+ "id": 3169,
+ "logprob": -1.4140625,
+ "special": false,
+ "text": " Let"
+ },
+ {
+ "id": 307,
+ "logprob": -3.0273438,
+ "special": false,
+ "text": " n"
+ },
+ {
+ "id": 327,
+ "logprob": -0.94140625,
+ "special": false,
+ "text": " ="
+ },
+ {
+ "id": 28705,
+ "logprob": -0.8173828,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.2978516,
+ "special": false,
+ "text": "1"
+ },
+ {
+ "id": 28734,
+ "logprob": -2.0664062,
+ "special": false,
+ "text": "0"
+ },
+ {
+ "id": 387,
+ "logprob": -1.9560547,
+ "special": false,
+ "text": " -"
+ },
+ {
+ "id": 28705,
+ "logprob": -0.5078125,
+ "special": false,
+ "text": " "
+ },
+ {
+ "id": 28740,
+ "logprob": -1.1787109,
+ "special": false,
+ "text": "1"
+ }
+ ],
+ "top_tokens": null
+ },
+ "generated_text": ": Let n = 10 - 1"
+ }
+]
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
index 0edd81b6bda..2c5d05f6036 100644
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
@@ -11,22 +11,22 @@
},
{
"id": 4911,
- "logprob": -5.7773438,
+ "logprob": -5.7851562,
"text": "User"
},
{
"id": 29901,
- "logprob": -0.0069999695,
+ "logprob": -0.006996155,
"text": ":"
},
{
"id": 32000,
- "logprob": -0.8125,
+ "logprob": -0.81347656,
"text": ""
},
{
"id": 32001,
- "logprob": -6.651878e-05,
+ "logprob": -6.687641e-05,
"text": ""
},
{
@@ -36,67 +36,67 @@
},
{
"id": 1815,
- "logprob": -4.2265625,
+ "logprob": -4.2148438,
"text": "Can"
},
{
"id": 366,
- "logprob": -0.013977051,
+ "logprob": -0.014137268,
"text": "you"
},
{
"id": 2649,
- "logprob": -4.4375,
+ "logprob": -4.4335938,
"text": "tell"
},
{
"id": 592,
- "logprob": -0.29077148,
+ "logprob": -0.2919922,
"text": "me"
},
{
"id": 263,
- "logprob": -4.2109375,
+ "logprob": -4.2070312,
"text": "a"
},
{
"id": 1407,
- "logprob": -9.4296875,
+ "logprob": -9.421875,
"text": "very"
},
{
"id": 3273,
- "logprob": -1.8671875,
+ "logprob": -1.8720703,
"text": "short"
},
{
"id": 5828,
- "logprob": -0.26586914,
+ "logprob": -0.26489258,
"text": "story"
},
{
"id": 2729,
- "logprob": -3.7460938,
+ "logprob": -3.7441406,
"text": "based"
},
{
"id": 373,
- "logprob": -0.0005350113,
+ "logprob": -0.0005393028,
"text": "on"
},
{
"id": 278,
- "logprob": -0.13867188,
+ "logprob": -0.140625,
"text": "the"
},
{
"id": 1967,
- "logprob": -0.06842041,
+ "logprob": -0.06756592,
"text": "image"
},
{
"id": 29973,
- "logprob": -0.15319824,
+ "logprob": -0.15454102,
"text": "?"
}
],
@@ -104,7 +104,7 @@
"tokens": [
{
"id": 32002,
- "logprob": -0.0019445419,
+ "logprob": -0.0019140244,
"special": true,
"text": ""
},
@@ -116,13 +116,13 @@
},
{
"id": 13,
- "logprob": -1.7881393e-05,
+ "logprob": -1.7642975e-05,
"special": false,
"text": "\n"
},
{
"id": 7900,
- "logprob": -3.0994415e-06,
+ "logprob": -2.9802322e-06,
"special": false,
"text": "Ass"
},
@@ -140,30 +140,30 @@
},
{
"id": 319,
- "logprob": -0.9057617,
+ "logprob": -0.91064453,
"special": false,
"text": " A"
},
{
"id": 696,
- "logprob": -1.2314453,
+ "logprob": -1.2412109,
"special": false,
"text": " ro"
},
{
"id": 15664,
- "logprob": -0.00024914742,
+ "logprob": -0.0002439022,
"special": false,
"text": "oster"
},
{
"id": 15028,
- "logprob": -1.1621094,
+ "logprob": -1.1630859,
"special": false,
"text": " stands"
}
],
"top_tokens": null
},
- "generated_text": "\nAssistant: A rooster stands"
+ "generated_text": " \nAssistant: A rooster stands"
}
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
index 81cc1b19841..f258e38da41 100644
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
@@ -12,22 +12,22 @@
},
{
"id": 4911,
- "logprob": -5.7773438,
+ "logprob": -5.7851562,
"text": "User"
},
{
"id": 29901,
- "logprob": -0.0069999695,
+ "logprob": -0.006996155,
"text": ":"
},
{
"id": 32000,
- "logprob": -0.8125,
+ "logprob": -0.81347656,
"text": ""
},
{
"id": 32001,
- "logprob": -6.651878e-05,
+ "logprob": -6.687641e-05,
"text": ""
},
{
@@ -37,67 +37,67 @@
},
{
"id": 1815,
- "logprob": -4.2265625,
+ "logprob": -4.2148438,
"text": "Can"
},
{
"id": 366,
- "logprob": -0.013977051,
+ "logprob": -0.014137268,
"text": "you"
},
{
"id": 2649,
- "logprob": -4.4375,
+ "logprob": -4.4335938,
"text": "tell"
},
{
"id": 592,
- "logprob": -0.29077148,
+ "logprob": -0.2919922,
"text": "me"
},
{
"id": 263,
- "logprob": -4.2109375,
+ "logprob": -4.2070312,
"text": "a"
},
{
"id": 1407,
- "logprob": -9.4296875,
+ "logprob": -9.421875,
"text": "very"
},
{
"id": 3273,
- "logprob": -1.8671875,
+ "logprob": -1.8720703,
"text": "short"
},
{
"id": 5828,
- "logprob": -0.26586914,
+ "logprob": -0.26489258,
"text": "story"
},
{
"id": 2729,
- "logprob": -3.7460938,
+ "logprob": -3.7441406,
"text": "based"
},
{
"id": 373,
- "logprob": -0.0005350113,
+ "logprob": -0.0005393028,
"text": "on"
},
{
"id": 278,
- "logprob": -0.13867188,
+ "logprob": -0.140625,
"text": "the"
},
{
"id": 1967,
- "logprob": -0.06842041,
+ "logprob": -0.06756592,
"text": "image"
},
{
"id": 29973,
- "logprob": -0.15319824,
+ "logprob": -0.15454102,
"text": "?"
}
],
@@ -105,13 +105,13 @@
"tokens": [
{
"id": 32002,
- "logprob": -0.0019445419,
+ "logprob": -0.0019140244,
"special": true,
"text": ""
},
{
"id": 29871,
- "logprob": -8.416176e-05,
+ "logprob": -8.392334e-05,
"special": false,
"text": " "
},
@@ -123,7 +123,7 @@
},
{
"id": 7900,
- "logprob": -3.0994415e-06,
+ "logprob": -2.9802322e-06,
"special": false,
"text": "Ass"
},
@@ -135,38 +135,38 @@
},
{
"id": 29901,
- "logprob": -3.2186508e-06,
+ "logprob": -3.0994415e-06,
"special": false,
"text": ":"
},
{
"id": 319,
- "logprob": -0.89941406,
+ "logprob": -0.9057617,
"special": false,
"text": " A"
},
{
"id": 696,
- "logprob": -1.234375,
+ "logprob": -1.2294922,
"special": false,
"text": " ro"
},
{
"id": 15664,
- "logprob": -0.0002465248,
+ "logprob": -0.00024533272,
"special": false,
"text": "oster"
},
{
"id": 15028,
- "logprob": -1.1660156,
+ "logprob": -1.1640625,
"special": false,
"text": " stands"
}
],
"top_tokens": null
},
- "generated_text": "\nAssistant: A rooster stands"
+ "generated_text": " \nAssistant: A rooster stands"
},
{
"details": {
@@ -181,22 +181,22 @@
},
{
"id": 4911,
- "logprob": -5.7890625,
+ "logprob": -5.7773438,
"text": "User"
},
{
"id": 29901,
- "logprob": -0.0070152283,
+ "logprob": -0.0070114136,
"text": ":"
},
{
"id": 32000,
- "logprob": -0.8125,
+ "logprob": -0.8208008,
"text": ""
},
{
"id": 32001,
- "logprob": -6.651878e-05,
+ "logprob": -6.699562e-05,
"text": ""
},
{
@@ -211,17 +211,17 @@
},
{
"id": 366,
- "logprob": -0.014190674,
+ "logprob": -0.014175415,
"text": "you"
},
{
"id": 2649,
- "logprob": -4.4140625,
+ "logprob": -4.4296875,
"text": "tell"
},
{
"id": 592,
- "logprob": -0.2919922,
+ "logprob": -0.29516602,
"text": "me"
},
{
@@ -231,7 +231,7 @@
},
{
"id": 1407,
- "logprob": -9.4375,
+ "logprob": -9.4296875,
"text": "very"
},
{
@@ -241,7 +241,7 @@
},
{
"id": 5828,
- "logprob": -0.26904297,
+ "logprob": -0.26879883,
"text": "story"
},
{
@@ -251,22 +251,22 @@
},
{
"id": 373,
- "logprob": -0.0005402565,
+ "logprob": -0.0005354881,
"text": "on"
},
{
"id": 278,
- "logprob": -0.13867188,
+ "logprob": -0.13671875,
"text": "the"
},
{
"id": 1967,
- "logprob": -0.068359375,
+ "logprob": -0.06719971,
"text": "image"
},
{
"id": 29973,
- "logprob": -0.15539551,
+ "logprob": -0.15551758,
"text": "?"
}
],
@@ -274,7 +274,7 @@
"tokens": [
{
"id": 32002,
- "logprob": -0.0019168854,
+ "logprob": -0.0019130707,
"special": true,
"text": ""
},
@@ -286,7 +286,7 @@
},
{
"id": 13,
- "logprob": -1.7642975e-05,
+ "logprob": -1.7881393e-05,
"special": false,
"text": "\n"
},
@@ -310,32 +310,32 @@
},
{
"id": 319,
- "logprob": -0.90722656,
+ "logprob": -0.9013672,
"special": false,
"text": " A"
},
{
"id": 696,
- "logprob": -1.2373047,
+ "logprob": -1.2324219,
"special": false,
"text": " ro"
},
{
"id": 15664,
- "logprob": -0.00024938583,
+ "logprob": -0.0002477169,
"special": false,
"text": "oster"
},
{
"id": 15028,
- "logprob": -1.1708984,
+ "logprob": -1.1660156,
"special": false,
"text": " stands"
}
],
"top_tokens": null
},
- "generated_text": "\nAssistant: A rooster stands"
+ "generated_text": " \nAssistant: A rooster stands"
},
{
"details": {
@@ -350,22 +350,22 @@
},
{
"id": 4911,
- "logprob": -5.7890625,
+ "logprob": -5.7773438,
"text": "User"
},
{
"id": 29901,
- "logprob": -0.0070152283,
+ "logprob": -0.0070114136,
"text": ":"
},
{
"id": 32000,
- "logprob": -0.8125,
+ "logprob": -0.8208008,
"text": ""
},
{
"id": 32001,
- "logprob": -6.663799e-05,
+ "logprob": -6.699562e-05,
"text": ""
},
{
@@ -380,17 +380,17 @@
},
{
"id": 366,
- "logprob": -0.014190674,
+ "logprob": -0.014175415,
"text": "you"
},
{
"id": 2649,
- "logprob": -4.4140625,
+ "logprob": -4.4296875,
"text": "tell"
},
{
"id": 592,
- "logprob": -0.2919922,
+ "logprob": -0.29516602,
"text": "me"
},
{
@@ -400,7 +400,7 @@
},
{
"id": 1407,
- "logprob": -9.4375,
+ "logprob": -9.4296875,
"text": "very"
},
{
@@ -410,7 +410,7 @@
},
{
"id": 5828,
- "logprob": -0.26904297,
+ "logprob": -0.26879883,
"text": "story"
},
{
@@ -420,22 +420,22 @@
},
{
"id": 373,
- "logprob": -0.0005402565,
+ "logprob": -0.0005354881,
"text": "on"
},
{
"id": 278,
- "logprob": -0.13867188,
+ "logprob": -0.13671875,
"text": "the"
},
{
"id": 1967,
- "logprob": -0.068359375,
+ "logprob": -0.06719971,
"text": "image"
},
{
"id": 29973,
- "logprob": -0.15539551,
+ "logprob": -0.15551758,
"text": "?"
}
],
@@ -443,19 +443,19 @@
"tokens": [
{
"id": 32002,
- "logprob": -0.0019168854,
+ "logprob": -0.001912117,
"special": true,
"text": ""
},
{
"id": 29871,
- "logprob": -8.404255e-05,
+ "logprob": -8.392334e-05,
"special": false,
"text": " "
},
{
"id": 13,
- "logprob": -1.7642975e-05,
+ "logprob": -1.7762184e-05,
"special": false,
"text": "\n"
},
@@ -479,32 +479,32 @@
},
{
"id": 319,
- "logprob": -0.90722656,
+ "logprob": -0.9013672,
"special": false,
"text": " A"
},
{
"id": 696,
- "logprob": -1.2373047,
+ "logprob": -1.2324219,
"special": false,
"text": " ro"
},
{
"id": 15664,
- "logprob": -0.00024938583,
+ "logprob": -0.0002477169,
"special": false,
"text": "oster"
},
{
"id": 15028,
- "logprob": -1.1708984,
+ "logprob": -1.1660156,
"special": false,
"text": " stands"
}
],
"top_tokens": null
},
- "generated_text": "\nAssistant: A rooster stands"
+ "generated_text": " \nAssistant: A rooster stands"
},
{
"details": {
@@ -519,22 +519,22 @@
},
{
"id": 4911,
- "logprob": -5.7890625,
+ "logprob": -5.7773438,
"text": "User"
},
{
"id": 29901,
- "logprob": -0.0070152283,
+ "logprob": -0.0070114136,
"text": ":"
},
{
"id": 32000,
- "logprob": -0.8125,
+ "logprob": -0.8208008,
"text": ""
},
{
"id": 32001,
- "logprob": -6.663799e-05,
+ "logprob": -6.699562e-05,
"text": ""
},
{
@@ -549,17 +549,17 @@
},
{
"id": 366,
- "logprob": -0.014190674,
+ "logprob": -0.014175415,
"text": "you"
},
{
"id": 2649,
- "logprob": -4.4140625,
+ "logprob": -4.4296875,
"text": "tell"
},
{
"id": 592,
- "logprob": -0.2919922,
+ "logprob": -0.29516602,
"text": "me"
},
{
@@ -569,7 +569,7 @@
},
{
"id": 1407,
- "logprob": -9.4375,
+ "logprob": -9.4296875,
"text": "very"
},
{
@@ -579,7 +579,7 @@
},
{
"id": 5828,
- "logprob": -0.26904297,
+ "logprob": -0.26879883,
"text": "story"
},
{
@@ -589,22 +589,22 @@
},
{
"id": 373,
- "logprob": -0.0005402565,
+ "logprob": -0.0005354881,
"text": "on"
},
{
"id": 278,
- "logprob": -0.13867188,
+ "logprob": -0.13671875,
"text": "the"
},
{
"id": 1967,
- "logprob": -0.068359375,
+ "logprob": -0.06719971,
"text": "image"
},
{
"id": 29973,
- "logprob": -0.15539551,
+ "logprob": -0.15551758,
"text": "?"
}
],
@@ -612,19 +612,19 @@
"tokens": [
{
"id": 32002,
- "logprob": -0.0019159317,
+ "logprob": -0.001912117,
"special": true,
"text": ""
},
{
"id": 29871,
- "logprob": -8.404255e-05,
+ "logprob": -8.392334e-05,
"special": false,
"text": " "
},
{
"id": 13,
- "logprob": -1.7642975e-05,
+ "logprob": -1.7762184e-05,
"special": false,
"text": "\n"
},
@@ -648,31 +648,31 @@
},
{
"id": 319,
- "logprob": -0.90722656,
+ "logprob": -0.9013672,
"special": false,
"text": " A"
},
{
"id": 696,
- "logprob": -1.2373047,
+ "logprob": -1.2324219,
"special": false,
"text": " ro"
},
{
"id": 15664,
- "logprob": -0.00024938583,
+ "logprob": -0.0002477169,
"special": false,
"text": "oster"
},
{
"id": 15028,
- "logprob": -1.1708984,
+ "logprob": -1.1660156,
"special": false,
"text": " stands"
}
],
"top_tokens": null
},
- "generated_text": "\nAssistant: A rooster stands"
+ "generated_text": " \nAssistant: A rooster stands"
}
]
diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py
new file mode 100644
index 00000000000..62a95f48791
--- /dev/null
+++ b/integration-tests/models/test_flash_awq.py
@@ -0,0 +1,73 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_llama_awq_handle(launcher):
+ with launcher(
+ "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
+ num_shard=1,
+ quantize="awq",
+ ) as handle:
+ yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_awq(flash_llama_awq_handle):
+ await flash_llama_awq_handle.health(300)
+ return flash_llama_awq_handle.client
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
+ response = await flash_llama_awq.generate(
+ "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
+ )
+
+ assert response.details.generated_tokens == 10
+ assert (
+ response.generated_text
+ == "\nWhat is the difference between Deep Learning and Machine"
+ )
+ assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
+ response = await flash_llama_awq.generate(
+ "What is Deep Learning?",
+ max_new_tokens=10,
+ repetition_penalty=1.2,
+ return_full_text=True,
+ temperature=0.5,
+ top_p=0.9,
+ top_k=10,
+ truncate=5,
+ typical_p=0.9,
+ watermark=True,
+ decoder_input_details=True,
+ seed=0,
+ )
+
+ assert response.details.generated_tokens == 10
+ assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot):
+ responses = await generate_load(
+ flash_llama_awq, "What is Deep Learning?", max_new_tokens=10, n=4
+ )
+
+ assert len(responses) == 4
+ assert all(
+ [
+ r.generated_text
+ == "\nWhat is the difference between Deep Learning and Machine"
+ for r in responses
+ ]
+ )
+
+ assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_awq_sharded.py b/integration-tests/models/test_flash_awq_sharded.py
new file mode 100644
index 00000000000..1c687fc9915
--- /dev/null
+++ b/integration-tests/models/test_flash_awq_sharded.py
@@ -0,0 +1,53 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_llama_awq_handle_sharded(launcher):
+ with launcher(
+ "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
+ num_shard=2,
+ quantize="awq",
+ ) as handle:
+ yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
+ await flash_llama_awq_handle_sharded.health(300)
+ return flash_llama_awq_handle_sharded.client
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot):
+ response = await flash_llama_awq_sharded.generate(
+ "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
+ )
+
+ assert response.details.generated_tokens == 10
+ assert (
+ response.generated_text
+ == "\nWhat is the difference between Deep Learning and Machine"
+ )
+ assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_awq_load_sharded(
+ flash_llama_awq_sharded, generate_load, response_snapshot
+):
+ responses = await generate_load(
+ flash_llama_awq_sharded, "What is Deep Learning?", max_new_tokens=10, n=4
+ )
+
+ assert len(responses) == 4
+ assert all(
+ [
+ r.generated_text
+ == "\nWhat is the difference between Deep Learning and Machine"
+ for r in responses
+ ]
+ )
+
+ assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_mistral.py b/integration-tests/models/test_flash_mistral.py
new file mode 100644
index 00000000000..63cb09b5a21
--- /dev/null
+++ b/integration-tests/models/test_flash_mistral.py
@@ -0,0 +1,60 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_mistral_handle(launcher):
+ with launcher("mistralai/Mistral-7B-Instruct-v0.1") as handle:
+ yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_mistral(flash_mistral_handle):
+ await flash_mistral_handle.health(300)
+ return flash_mistral_handle.client
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_mistral(flash_mistral, response_snapshot):
+ response = await flash_mistral.generate(
+ "Test request", max_new_tokens=10, decoder_input_details=True
+ )
+
+ assert response.details.generated_tokens == 10
+ assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_mistral_all_params(flash_mistral, response_snapshot):
+ response = await flash_mistral.generate(
+ "Test request",
+ max_new_tokens=10,
+ repetition_penalty=1.2,
+ return_full_text=True,
+ stop_sequences=["test"],
+ temperature=0.5,
+ top_p=0.9,
+ top_k=10,
+ truncate=5,
+ typical_p=0.9,
+ watermark=True,
+ decoder_input_details=True,
+ seed=0,
+ )
+
+ assert response.details.generated_tokens == 10
+ assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_mistral_load(flash_mistral, generate_load, response_snapshot):
+ responses = await generate_load(
+ flash_mistral, "Test request", max_new_tokens=10, n=4
+ )
+
+ assert len(responses) == 4
+ assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+ assert responses == response_snapshot
diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py
index 5659dd5c7d2..5f4571b57ed 100644
--- a/integration-tests/models/test_idefics.py
+++ b/integration-tests/models/test_idefics.py
@@ -3,9 +3,7 @@
@pytest.fixture(scope="module")
def idefics_handle(launcher):
- with launcher(
- "HuggingFaceM4/idefics-9b-instruct", num_shard=2
- ) as handle:
+ with launcher("HuggingFaceM4/idefics-9b-instruct", num_shard=2) as handle:
yield handle
diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
index bb881d8ef69..aff6a3774d3 100644
--- a/integration-tests/pyproject.toml
+++ b/integration-tests/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation-integration-tests"
-version = "1.0.3"
+version = "1.1.0"
description = "Text Generation Inference integration tests"
authors = ["Nicolas Patry "]
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
index 3e7f86d4e6f..77fb5ee04c0 100644
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@@ -7,17 +7,17 @@ authors.workspace = true
homepage.workspace = true
[dependencies]
-clap = { version = "4.1.4", features = ["derive", "env"] }
-ctrlc = { version = "3.2.5", features = ["termination"] }
-nix = "0.26.2"
-serde = { version = "1.0.152", features = ["derive"] }
-serde_json = "1.0.93"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+ctrlc = { version = "3.4.1", features = ["termination"] }
+nix = "0.27.1"
+serde = { version = "1.0.188", features = ["derive"] }
+serde_json = "1.0.107"
tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
[dev-dependencies]
float_eq = "1.0.1"
-reqwest = { version = "0.11.14", features = ["blocking", "json"] }
+reqwest = { version = "0.11.20", features = ["blocking", "json"] }
[build-dependencies]
-vergen = { version = "8.0.0", features = ["build", "cargo", "git", "gitcl", "rustc", "si"] }
+vergen = { version = "8.2.5", features = ["build", "cargo", "git", "gitcl", "rustc", "si"] }
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index cbb6f25d59c..b4fc86b7bee 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -21,10 +21,32 @@ mod env_runtime;
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Quantization {
+ /// 4 bit quantization. Requires a specific AWQ quantized model:
+ /// https://hf.co/models?search=awq.
+ /// Should replace GPTQ models wherever possible because of the better latency
+ Awq,
+ /// 8 bit quantization, doesn't require a specific model.
+ /// Should be a drop-in replacement for bitsandbytes with much better performance.
+ /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
+ Eetq,
+ /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
+ /// text-generation-inference will use exllama (faster) kernels wherever possible, and fall back
+ /// to the triton kernel (wider support) when it cannot.
+ /// AWQ has faster kernels.
+ Gptq,
+ /// Bitsandbytes 8bit. Can be applied to any model and will cut the memory requirement in half,
+ /// but the model will run much slower than in native f16.
+ #[deprecated(
+ since = "1.1.0",
+ note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases"
+ )]
Bitsandbytes,
+ /// Bitsandbytes 4bit. Can be applied to any model and will cut the memory requirement by 4x,
+ /// but the model will run much slower than in native f16.
BitsandbytesNF4,
+ /// Bitsandbytes 4bit. nf4 should be preferred in most cases, but this one may give better
+ /// perplexity performance for your model
BitsandbytesFP4,
- Gptq,
}
impl std::fmt::Display for Quantization {
@@ -43,6 +65,12 @@ impl std::fmt::Display for Quantization {
Quantization::Gptq => {
write!(f, "gptq")
}
+ Quantization::Awq => {
+ write!(f, "awq")
+ }
+ Quantization::Eetq => {
+ write!(f, "eetq")
+ }
}
}
}
@@ -123,9 +151,7 @@ struct Args {
#[clap(long, env)]
num_shard: Option,
- /// Whether you want the model to be quantized. This will use `bitsandbytes` for
- /// quantization on the fly, or `gptq`. 4bit quantization is available through
- /// `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options.
+ /// Whether you want the model to be quantized.
#[clap(long, env, value_enum)]
 quantize: Option<Quantization>,
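
For illustration of the memory claims in the quantization doc comments above, here is a rough, weights-only sketch (not part of this patch); the 7B parameter count is an arbitrary example, and activations plus the KV cache are ignored:

// Rough arithmetic behind the quantization doc comments: f16 stores 2 bytes per
// weight, 8-bit quantization roughly 1 byte, 4-bit quantization roughly 0.5 bytes.
fn weight_gib(n_params: f64, bits_per_weight: f64) -> f64 {
    n_params * bits_per_weight / 8.0 / 1024f64.powi(3)
}

fn main() {
    let n = 7e9; // illustrative 7B-parameter model
    println!("f16 : {:>5.1} GiB", weight_gib(n, 16.0)); // ~13.0 GiB
    println!("int8: {:>5.1} GiB", weight_gib(n, 8.0));  // ~6.5 GiB (memory cut in half)
    println!("int4: {:>5.1} GiB", weight_gib(n, 4.0));  // ~3.3 GiB (memory cut by 4x)
}
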
diff --git a/proto/generate.proto b/proto/generate.proto
index 3f607dc5769..c873e6615ef 100644
--- a/proto/generate.proto
+++ b/proto/generate.proto
@@ -31,6 +31,7 @@ message InfoResponse {
bool requires_padding = 1;
string dtype = 2;
string device_type = 3;
+ optional uint32 window_size = 4;
}
/// Empty request
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 10396826702..87b5a8d39f5 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -15,36 +15,38 @@ name = "text-generation-router"
path = "src/main.rs"
[dependencies]
-async-stream = "0.3.3"
-axum = { version = "0.6.4", features = ["json"] }
-axum-tracing-opentelemetry = "0.10.0"
+async-stream = "0.3.5"
+axum = { version = "0.6.20", features = ["json"] }
+axum-tracing-opentelemetry = "0.14.1"
text-generation-client = { path = "client" }
-clap = { version = "4.1.4", features = ["derive", "env"] }
-flume = "0.10.14"
-futures = "0.3.26"
-metrics = "0.21.0"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+flume = "0.11.0"
+futures = "0.3.28"
+metrics = "0.21.1"
metrics-exporter-prometheus = { version = "0.12.1", features = [] }
nohash-hasher = "0.2.0"
-opentelemetry = { version = "0.19.0", features = ["rt-tokio"] }
-opentelemetry-otlp = "0.12.0"
+opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
+opentelemetry-otlp = "0.13.0"
rand = "0.8.5"
-reqwest = { version = "0.11.14", features = [] }
-serde = "1.0.152"
-serde_json = "1.0.93"
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tower-http = { version = "0.4.0", features = ["cors"] }
+reqwest = { version = "0.11.20", features = [] }
+serde = "1.0.188"
+serde_json = "1.0.107"
+thiserror = "1.0.48"
+tokenizers = { version = "0.14.0", features = ["http"] }
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tower-http = { version = "0.4.4", features = ["cors"] }
tracing = "0.1.37"
-tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
-utoipa = { version = "3.0.1", features = ["axum_extras"] }
-utoipa-swagger-ui = { version = "3.0.2", features = ["axum"] }
-ngrok = { version = "0.12.3", features = ["axum"], optional = true }
+tracing-opentelemetry = "0.21.0"
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+utoipa = { version = "3.5.0", features = ["axum_extras"] }
+utoipa-swagger-ui = { version = "3.1.5", features = ["axum"] }
+ngrok = { version = "0.13.1", features = ["axum"], optional = true }
+hf-hub = "0.3.1"
+init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
[build-dependencies]
-vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
+vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }
[features]
default = ["ngrok"]
-ngrok = ["dep:ngrok"]
\ No newline at end of file
+ngrok = ["dep:ngrok"]
diff --git a/router/client/Cargo.toml b/router/client/Cargo.toml
index 43f444e620b..d01317846a0 100644
--- a/router/client/Cargo.toml
+++ b/router/client/Cargo.toml
@@ -8,13 +8,13 @@ homepage.workspace = true
[dependencies]
futures = "^0.3"
grpc-metadata = { path = "../grpc-metadata" }
-prost = "^0.11"
+prost = "^0.12"
thiserror = "^1.0"
-tokio = { version = "^1.25", features = ["sync"] }
-tonic = "^0.9"
+tokio = { version = "^1.32", features = ["sync"] }
+tonic = "^0.10"
tower = "^0.4"
tracing = "^0.1"
[build-dependencies]
-tonic-build = "0.9.2"
-prost-build = "0.11.6"
+tonic-build = "0.10.1"
+prost-build = "0.12.1"
diff --git a/router/grpc-metadata/Cargo.toml b/router/grpc-metadata/Cargo.toml
index 9e01f527747..da163ec5cf5 100644
--- a/router/grpc-metadata/Cargo.toml
+++ b/router/grpc-metadata/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"
[dependencies]
-opentelemetry = "^0.19"
-tonic = "^0.9"
+opentelemetry = "^0.20"
+tonic = "^0.10"
tracing = "^0.1"
-tracing-opentelemetry = "^0.19"
+tracing-opentelemetry = "^0.21"
diff --git a/router/src/infer.rs b/router/src/infer.rs
index 67b5bde24a5..787ccfcf10a 100644
--- a/router/src/infer.rs
+++ b/router/src/infer.rs
@@ -50,10 +50,11 @@ impl Infer {
max_waiting_tokens: usize,
max_concurrent_requests: usize,
requires_padding: bool,
+ window_size: Option<u32>,
generation_health: Arc,
) -> Self {
// Infer shared state
- let queue = Queue::new(requires_padding, 16);
+ let queue = Queue::new(requires_padding, 16, window_size);
let shared = Arc::new(Shared {
batching_task: Notify::new(),
});
diff --git a/router/src/main.rs b/router/src/main.rs
index 4903c06601f..f30286749c2 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -324,7 +324,7 @@ fn init_logging(otlp_endpoint: Option, json_output: bool) {
if let Ok(tracer) = tracer {
layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
- axum_tracing_opentelemetry::init_propagator().unwrap();
+ init_tracing_opentelemetry::init_propagator().unwrap();
};
}
diff --git a/router/src/queue.rs b/router/src/queue.rs
index e97a168e227..1ab9eb11ed4 100644
--- a/router/src/queue.rs
+++ b/router/src/queue.rs
@@ -2,6 +2,7 @@ use crate::infer::InferError;
use crate::infer::InferStreamResponse;
use crate::validation::ValidGenerateRequest;
use nohash_hasher::{BuildNoHashHasher, IntMap};
+use std::cmp::min;
use std::collections::VecDeque;
use text_generation_client::{Batch, Request};
use tokio::sync::oneshot;
@@ -33,12 +34,17 @@ pub(crate) struct Queue {
}
impl Queue {
- pub(crate) fn new(requires_padding: bool, block_size: u32) -> Self {
+ pub(crate) fn new(requires_padding: bool, block_size: u32, window_size: Option<u32>) -> Self {
// Create channel
let (queue_sender, queue_receiver) = flume::unbounded();
// Launch background queue task
- tokio::spawn(queue_task(requires_padding, block_size, queue_receiver));
+ tokio::spawn(queue_task(
+ requires_padding,
+ block_size,
+ window_size,
+ queue_receiver,
+ ));
Self { queue_sender }
}
@@ -84,9 +90,10 @@ impl Queue {
async fn queue_task(
requires_padding: bool,
block_size: u32,
+ window_size: Option<u32>,
receiver: flume::Receiver,
) {
- let mut state = State::new(requires_padding, block_size);
+ let mut state = State::new(requires_padding, block_size, window_size);
while let Ok(cmd) = receiver.recv_async().await {
match cmd {
@@ -126,16 +133,20 @@ struct State {
/// Paged Attention block size
block_size: u32,
+
+ /// Sliding window
+ window_size: Option<u32>,
}
impl State {
- fn new(requires_padding: bool, block_size: u32) -> Self {
+ fn new(requires_padding: bool, block_size: u32, window_size: Option<u32>) -> Self {
Self {
entries: VecDeque::with_capacity(128),
next_id: 0,
next_batch_id: 0,
requires_padding,
block_size,
+ window_size,
}
}
@@ -204,11 +215,17 @@ impl State {
if self.requires_padding {
decode_tokens += entry.request.stopping_parameters.max_new_tokens;
} else {
+ let max_new_tokens = match self.window_size {
+ None => entry.request.stopping_parameters.max_new_tokens,
+ Some(window_size) => min(
+ window_size.saturating_sub(entry.request.input_length),
+ entry.request.stopping_parameters.max_new_tokens,
+ ),
+ };
+
// pad to block size
decode_tokens +=
- ((entry.request.stopping_parameters.max_new_tokens + self.block_size - 1)
- / self.block_size)
- * self.block_size;
+ ((max_new_tokens + self.block_size - 1) / self.block_size) * self.block_size;
}
if prefill_tokens > prefill_token_budget
@@ -342,7 +359,7 @@ mod tests {
#[test]
fn test_append() {
- let mut state = State::new(false, 1);
+ let mut state = State::new(false, 1, None);
let (entry, _guard) = default_entry();
assert_eq!(state.next_id, 0);
@@ -358,7 +375,7 @@ mod tests {
#[test]
fn test_next_batch_empty() {
- let mut state = State::new(false, 1);
+ let mut state = State::new(false, 1, None);
assert!(state.next_batch(None, 1, 1).is_none());
assert!(state.next_batch(Some(1), 1, 1).is_none());
@@ -366,7 +383,7 @@ mod tests {
#[test]
fn test_next_batch_min_size() {
- let mut state = State::new(false, 1);
+ let mut state = State::new(false, 1, None);
let (entry1, _guard1) = default_entry();
let (entry2, _guard2) = default_entry();
state.append(entry1);
@@ -398,7 +415,7 @@ mod tests {
#[test]
fn test_next_batch_token_budget() {
- let mut state = State::new(false, 1);
+ let mut state = State::new(false, 1, None);
let (entry1, _guard1) = default_entry();
let (entry2, _guard2) = default_entry();
state.append(entry1);
@@ -431,14 +448,14 @@ mod tests {
#[tokio::test]
async fn test_queue_append() {
- let queue = Queue::new(false, 1);
+ let queue = Queue::new(false, 1, None);
let (entry, _guard) = default_entry();
queue.append(entry);
}
#[tokio::test]
async fn test_queue_next_batch_empty() {
- let queue = Queue::new(false, 1);
+ let queue = Queue::new(false, 1, None);
assert!(queue.next_batch(None, 1, 1).await.is_none());
assert!(queue.next_batch(Some(1), 1, 1).await.is_none());
@@ -446,7 +463,7 @@ mod tests {
#[tokio::test]
async fn test_queue_next_batch_min_size() {
- let queue = Queue::new(false, 1);
+ let queue = Queue::new(false, 1, None);
let (entry1, _guard1) = default_entry();
let (entry2, _guard2) = default_entry();
queue.append(entry1);
@@ -479,7 +496,7 @@ mod tests {
#[tokio::test]
async fn test_queue_next_batch_token_budget() {
- let queue = Queue::new(false, 1);
+ let queue = Queue::new(false, 1, None);
let (entry1, _guard1) = default_entry();
let (entry2, _guard2) = default_entry();
queue.append(entry1);
@@ -504,7 +521,7 @@ mod tests {
#[tokio::test]
async fn test_queue_next_batch_dropped_receiver() {
- let queue = Queue::new(false, 1);
+ let queue = Queue::new(false, 1, None);
let (entry, _) = default_entry();
queue.append(entry);
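
The sliding-window budgeting added to router/src/queue.rs above is easiest to check with concrete numbers. Below is a minimal, self-contained sketch of the same arithmetic; the function name and the figures are illustrative, and the 16-token block size matches the Queue::new(requires_padding, 16, window_size) call in router/src/infer.rs:

use std::cmp::min;

// With a sliding window the KV cache never holds more than `window_size` tokens,
// so the extra cache slots a request can still claim are capped at
// `window_size - input_length` before rounding up to whole paged-attention blocks.
fn decode_token_budget(
    input_length: u32,
    max_new_tokens: u32,
    block_size: u32,
    window_size: Option<u32>,
) -> u32 {
    let max_new_tokens = match window_size {
        None => max_new_tokens,
        Some(window_size) => min(window_size.saturating_sub(input_length), max_new_tokens),
    };
    // pad to block size, exactly as in queue.rs
    ((max_new_tokens + block_size - 1) / block_size) * block_size
}

fn main() {
    // No sliding window: 300 new tokens round up to 19 blocks of 16 = 304 slots.
    assert_eq!(decode_token_budget(1000, 300, 16, None), 304);
    // Window of 1024 with a 1000-token prompt: only 24 new tokens still fit in
    // the window, so the budget shrinks to 2 blocks of 16 = 32 slots.
    assert_eq!(decode_token_budget(1000, 300, 16, Some(1024)), 32);
}
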
diff --git a/router/src/server.rs b/router/src/server.rs
index 91164098d73..f254afd8aa1 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -13,7 +13,7 @@ use axum::response::sse::{Event, KeepAlive, Sse};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use axum::{http, Json, Router};
-use axum_tracing_opentelemetry::opentelemetry_tracing_layer;
+use axum_tracing_opentelemetry::middleware::OtelAxumLayer;
use futures::stream::StreamExt;
use futures::Stream;
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
@@ -396,7 +396,7 @@ async fn generate_stream(
// StreamResponse
let stream_token = StreamResponse {
token,
- top_tokens: top_tokens,
+ top_tokens,
generated_text: None,
details: None,
};
@@ -458,7 +458,7 @@ async fn generate_stream(
let stream_token = StreamResponse {
token,
- top_tokens: top_tokens,
+ top_tokens,
generated_text: Some(output_text),
details
};
@@ -595,6 +595,7 @@ pub async fn run(
max_waiting_tokens,
max_concurrent_requests,
shard_info.requires_padding,
+ shard_info.window_size,
generation_health,
);
@@ -695,7 +696,7 @@ pub async fn run(
.layer(Extension(compat_return_full_text))
.layer(Extension(infer))
.layer(Extension(prom_handle.clone()))
- .layer(opentelemetry_tracing_layer())
+ .layer(OtelAxumLayer::default())
.layer(cors_layer);
if ngrok {
@@ -792,7 +793,7 @@ async fn shutdown_signal() {
impl From for FinishReason {
fn from(finish_reason: i32) -> Self {
- let finish_reason = text_generation_client::FinishReason::from_i32(finish_reason).unwrap();
+ let finish_reason = text_generation_client::FinishReason::try_from(finish_reason).unwrap();
match finish_reason {
text_generation_client::FinishReason::Length => FinishReason::Length,
text_generation_client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
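
The from_i32 -> try_from switch above goes with the prost "^0.11" -> "^0.12" bump in router/client/Cargo.toml, where generated enums expose a TryFrom<i32> impl instead of the old from_i32 helper. A hand-written stand-in (not the generated client type) showing the conversion pattern:

#[derive(Clone, Copy, Debug, PartialEq)]
enum FinishReason {
    Length,
    EosToken,
}

// prost-generated enums provide an equivalent impl; this one exists only so the
// sketch is self-contained and runnable.
impl TryFrom<i32> for FinishReason {
    type Error = i32;

    fn try_from(value: i32) -> Result<Self, Self::Error> {
        match value {
            0 => Ok(FinishReason::Length),
            1 => Ok(FinishReason::EosToken),
            other => Err(other),
        }
    }
}

fn main() {
    assert_eq!(FinishReason::try_from(1), Ok(FinishReason::EosToken));
    assert!(FinishReason::try_from(42).is_err()); // unknown wire value surfaces as an error
}
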
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 6c67f0ff827..36cbfb9ba67 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -276,7 +276,7 @@ impl Validation {
truncate: truncate.unwrap_or(self.max_input_length) as u32,
parameters,
stopping_parameters,
- top_n_tokens: top_n_tokens,
+ top_n_tokens,
})
}
diff --git a/server/.gitignore b/server/.gitignore
index 2e1db12437f..dcb8fe6743b 100644
--- a/server/.gitignore
+++ b/server/.gitignore
@@ -159,3 +159,5 @@ safetensors
flash-attention/
flash-attention-v2/
vllm/
+llm-awq/
+eetq/
diff --git a/server/Makefile b/server/Makefile
index a4ce6d8b7a2..52543e3d215 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -1,6 +1,8 @@
include Makefile-flash-att
include Makefile-flash-att-v2
include Makefile-vllm
+include Makefile-awq
+include Makefile-eetq
unit-tests:
pytest -s -vv -m "not private" tests
diff --git a/server/Makefile-awq b/server/Makefile-awq
new file mode 100644
index 00000000000..80e78c08fd3
--- /dev/null
+++ b/server/Makefile-awq
@@ -0,0 +1,13 @@
+awq_commit := f084f40bd996f3cf3a0633c1ad7d9d476c318aaa
+
+awq:
+ rm -rf llm-awq
+ git clone https://github.com/mit-han-lab/llm-awq
+
+build-awq: awq
+ cd llm-awq/ && git fetch && git checkout $(awq_commit)
+ cd llm-awq/awq/kernels && python setup.py build
+
+install-awq: build-awq
+ pip uninstall awq_inference_engine -y || true
+ cd llm-awq/awq/kernels && python setup.py install
diff --git a/server/Makefile-eetq b/server/Makefile-eetq
new file mode 100644
index 00000000000..5e8e9830e8a
--- /dev/null
+++ b/server/Makefile-eetq
@@ -0,0 +1,13 @@
+eetq_commit := 323827dd471458a84e9c840f614e4592b157a4b1
+
+eetq:
+ # Clone eetq
+ pip install packaging
+ git clone https://github.com/NetEase-FuXi/EETQ.git eetq
+
+build-eetq: eetq
+ cd eetq && git fetch && git checkout $(eetq_commit)
+ cd eetq && python setup.py build
+
+install-eetq: build-eetq
+ cd eetq && python setup.py install
diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2
index a7d633563d8..cdea843119a 100644
--- a/server/Makefile-flash-att-v2
+++ b/server/Makefile-flash-att-v2
@@ -1,4 +1,4 @@
-flash_att_v2_commit := 4f285b354796fb17df8636485b9a04df3ebbb7dc
+flash_att_v2_commit := 601b4dc48dbe9d87c468daa2b4c0c8388b83753c
flash-attention-v2:
# Clone flash attention
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 96bfc10880b..2e965da0177 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,4 +1,4 @@
-vllm_commit := e86af624d059969b0fb07b075b1d338bf10c3365
+vllm_commit := 25dbff97d5a8f2ba331847237b458b2692e9ae78
vllm:
# Clone vllm
diff --git a/server/poetry.lock b/server/poetry.lock
index fd4d427ddac..7c18ec76d5a 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -323,19 +323,19 @@ files = [
[[package]]
name = "datasets"
-version = "2.14.4"
+version = "2.14.5"
description = "HuggingFace community-driven open-source library of datasets"
optional = true
python-versions = ">=3.8.0"
files = [
- {file = "datasets-2.14.4-py3-none-any.whl", hash = "sha256:29336bd316a7d827ccd4da2236596279b20ca2ac78f64c04c9483da7cbc2459b"},
- {file = "datasets-2.14.4.tar.gz", hash = "sha256:ef29c2b5841de488cd343cfc26ab979bff77efa4d2285af51f1ad7db5c46a83b"},
+ {file = "datasets-2.14.5-py3-none-any.whl", hash = "sha256:dd4155091034cba04d5a28711f2ed3944275ed15c5d0c5a2d0b6b9ea34a2bdfe"},
+ {file = "datasets-2.14.5.tar.gz", hash = "sha256:b738a86540ab8e1a7806c8a3790b67be0056318d0c5d5a58a1b0dbdd76c0f568"},
]
[package.dependencies]
aiohttp = "*"
dill = ">=0.3.0,<0.3.8"
-fsspec = {version = ">=2021.11.1", extras = ["http"]}
+fsspec = {version = ">=2023.1.0,<2023.9.0", extras = ["http"]}
huggingface-hub = ">=0.14.0,<1.0.0"
multiprocess = "*"
numpy = ">=1.17"
@@ -421,21 +421,19 @@ test = ["pytest (>=6)"]
[[package]]
name = "filelock"
-version = "3.12.3"
+version = "3.12.4"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
- {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"},
- {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"},
+ {file = "filelock-3.12.4-py3-none-any.whl", hash = "sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4"},
+ {file = "filelock-3.12.4.tar.gz", hash = "sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd"},
]
-[package.dependencies]
-typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""}
-
[package.extras]
docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"]
+typing = ["typing-extensions (>=4.7.1)"]
[[package]]
name = "frozenlist"
@@ -582,148 +580,148 @@ testing = ["protobuf (>=4.21.9)"]
[[package]]
name = "grpcio"
-version = "1.57.0"
+version = "1.58.0"
description = "HTTP/2-based RPC framework"
optional = false
python-versions = ">=3.7"
files = [
- {file = "grpcio-1.57.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:092fa155b945015754bdf988be47793c377b52b88d546e45c6a9f9579ac7f7b6"},
- {file = "grpcio-1.57.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f7349786da979a94690cc5c2b804cab4e8774a3cf59be40d037c4342c906649"},
- {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:82640e57fb86ea1d71ea9ab54f7e942502cf98a429a200b2e743d8672171734f"},
- {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40b72effd4c789de94ce1be2b5f88d7b9b5f7379fe9645f198854112a6567d9a"},
- {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f708a6a17868ad8bf586598bee69abded4996b18adf26fd2d91191383b79019"},
- {file = "grpcio-1.57.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:60fe15288a0a65d5c1cb5b4a62b1850d07336e3ba728257a810317be14f0c527"},
- {file = "grpcio-1.57.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6907b1cf8bb29b058081d2aad677b15757a44ef2d4d8d9130271d2ad5e33efca"},
- {file = "grpcio-1.57.0-cp310-cp310-win32.whl", hash = "sha256:57b183e8b252825c4dd29114d6c13559be95387aafc10a7be645462a0fc98bbb"},
- {file = "grpcio-1.57.0-cp310-cp310-win_amd64.whl", hash = "sha256:7b400807fa749a9eb286e2cd893e501b110b4d356a218426cb9c825a0474ca56"},
- {file = "grpcio-1.57.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:c6ebecfb7a31385393203eb04ed8b6a08f5002f53df3d59e5e795edb80999652"},
- {file = "grpcio-1.57.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:00258cbe3f5188629828363ae8ff78477ce976a6f63fb2bb5e90088396faa82e"},
- {file = "grpcio-1.57.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:23e7d8849a0e58b806253fd206ac105b328171e01b8f18c7d5922274958cc87e"},
- {file = "grpcio-1.57.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5371bcd861e679d63b8274f73ac281751d34bd54eccdbfcd6aa00e692a82cd7b"},
- {file = "grpcio-1.57.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aed90d93b731929e742967e236f842a4a2174dc5db077c8f9ad2c5996f89f63e"},
- {file = "grpcio-1.57.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fe752639919aad9ffb0dee0d87f29a6467d1ef764f13c4644d212a9a853a078d"},
- {file = "grpcio-1.57.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fada6b07ec4f0befe05218181f4b85176f11d531911b64c715d1875c4736d73a"},
- {file = "grpcio-1.57.0-cp311-cp311-win32.whl", hash = "sha256:bb396952cfa7ad2f01061fbc7dc1ad91dd9d69243bcb8110cf4e36924785a0fe"},
- {file = "grpcio-1.57.0-cp311-cp311-win_amd64.whl", hash = "sha256:e503cb45ed12b924b5b988ba9576dc9949b2f5283b8e33b21dcb6be74a7c58d0"},
- {file = "grpcio-1.57.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:fd173b4cf02b20f60860dc2ffe30115c18972d7d6d2d69df97ac38dee03be5bf"},
- {file = "grpcio-1.57.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:d7f8df114d6b4cf5a916b98389aeaf1e3132035420a88beea4e3d977e5f267a5"},
- {file = "grpcio-1.57.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:76c44efa4ede1f42a9d5b2fed1fe9377e73a109bef8675fb0728eb80b0b8e8f2"},
- {file = "grpcio-1.57.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4faea2cfdf762a664ab90589b66f416274887641ae17817de510b8178356bf73"},
- {file = "grpcio-1.57.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c60b83c43faeb6d0a9831f0351d7787a0753f5087cc6fa218d78fdf38e5acef0"},
- {file = "grpcio-1.57.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b363bbb5253e5f9c23d8a0a034dfdf1b7c9e7f12e602fc788c435171e96daccc"},
- {file = "grpcio-1.57.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:f1fb0fd4a1e9b11ac21c30c169d169ef434c6e9344ee0ab27cfa6f605f6387b2"},
- {file = "grpcio-1.57.0-cp37-cp37m-win_amd64.whl", hash = "sha256:34950353539e7d93f61c6796a007c705d663f3be41166358e3d88c45760c7d98"},
- {file = "grpcio-1.57.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:871f9999e0211f9551f368612460442a5436d9444606184652117d6a688c9f51"},
- {file = "grpcio-1.57.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:a8a8e560e8dbbdf29288872e91efd22af71e88b0e5736b0daf7773c1fecd99f0"},
- {file = "grpcio-1.57.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:2313b124e475aa9017a9844bdc5eafb2d5abdda9d456af16fc4535408c7d6da6"},
- {file = "grpcio-1.57.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4098b6b638d9e0ca839a81656a2fd4bc26c9486ea707e8b1437d6f9d61c3941"},
- {file = "grpcio-1.57.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e5b58e32ae14658085c16986d11e99abd002ddbf51c8daae8a0671fffb3467f"},
- {file = "grpcio-1.57.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0f80bf37f09e1caba6a8063e56e2b87fa335add314cf2b78ebf7cb45aa7e3d06"},
- {file = "grpcio-1.57.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5b7a4ce8f862fe32b2a10b57752cf3169f5fe2915acfe7e6a1e155db3da99e79"},
- {file = "grpcio-1.57.0-cp38-cp38-win32.whl", hash = "sha256:9338bacf172e942e62e5889b6364e56657fbf8ac68062e8b25c48843e7b202bb"},
- {file = "grpcio-1.57.0-cp38-cp38-win_amd64.whl", hash = "sha256:e1cb52fa2d67d7f7fab310b600f22ce1ff04d562d46e9e0ac3e3403c2bb4cc16"},
- {file = "grpcio-1.57.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:fee387d2fab144e8a34e0e9c5ca0f45c9376b99de45628265cfa9886b1dbe62b"},
- {file = "grpcio-1.57.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:b53333627283e7241fcc217323f225c37783b5f0472316edcaa4479a213abfa6"},
- {file = "grpcio-1.57.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f19ac6ac0a256cf77d3cc926ef0b4e64a9725cc612f97228cd5dc4bd9dbab03b"},
- {file = "grpcio-1.57.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fdf04e402f12e1de8074458549337febb3b45f21076cc02ef4ff786aff687e"},
- {file = "grpcio-1.57.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5613a2fecc82f95d6c51d15b9a72705553aa0d7c932fad7aed7afb51dc982ee5"},
- {file = "grpcio-1.57.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b670c2faa92124b7397b42303e4d8eb64a4cd0b7a77e35a9e865a55d61c57ef9"},
- {file = "grpcio-1.57.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a635589201b18510ff988161b7b573f50c6a48fae9cb567657920ca82022b37"},
- {file = "grpcio-1.57.0-cp39-cp39-win32.whl", hash = "sha256:d78d8b86fcdfa1e4c21f8896614b6cc7ee01a2a758ec0c4382d662f2a62cf766"},
- {file = "grpcio-1.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:20ec6fc4ad47d1b6e12deec5045ec3cd5402d9a1597f738263e98f490fe07056"},
- {file = "grpcio-1.57.0.tar.gz", hash = "sha256:4b089f7ad1eb00a104078bab8015b0ed0ebcb3b589e527ab009c53893fd4e613"},
+ {file = "grpcio-1.58.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:3e6bebf1dfdbeb22afd95650e4f019219fef3ab86d3fca8ebade52e4bc39389a"},
+ {file = "grpcio-1.58.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:cde11577d5b6fd73a00e6bfa3cf5f428f3f33c2d2878982369b5372bbc4acc60"},
+ {file = "grpcio-1.58.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:a2d67ff99e70e86b2be46c1017ae40b4840d09467d5455b2708de6d4c127e143"},
+ {file = "grpcio-1.58.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ed979b273a81de36fc9c6716d9fb09dd3443efa18dcc8652501df11da9583e9"},
+ {file = "grpcio-1.58.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:458899d2ebd55d5ca2350fd3826dfd8fcb11fe0f79828ae75e2b1e6051d50a29"},
+ {file = "grpcio-1.58.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bc7ffef430b80345729ff0a6825e9d96ac87efe39216e87ac58c6c4ef400de93"},
+ {file = "grpcio-1.58.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5b23d75e5173faa3d1296a7bedffb25afd2fddb607ef292dfc651490c7b53c3d"},
+ {file = "grpcio-1.58.0-cp310-cp310-win32.whl", hash = "sha256:fad9295fe02455d4f158ad72c90ef8b4bcaadfdb5efb5795f7ab0786ad67dd58"},
+ {file = "grpcio-1.58.0-cp310-cp310-win_amd64.whl", hash = "sha256:bc325fed4d074367bebd465a20763586e5e1ed5b943e9d8bc7c162b1f44fd602"},
+ {file = "grpcio-1.58.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:652978551af02373a5a313e07bfef368f406b5929cf2d50fa7e4027f913dbdb4"},
+ {file = "grpcio-1.58.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:9f13a171281ebb4d7b1ba9f06574bce2455dcd3f2f6d1fbe0fd0d84615c74045"},
+ {file = "grpcio-1.58.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:8774219e21b05f750eef8adc416e9431cf31b98f6ce9def288e4cea1548cbd22"},
+ {file = "grpcio-1.58.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09206106848462763f7f273ca93d2d2d4d26cab475089e0de830bb76be04e9e8"},
+ {file = "grpcio-1.58.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62831d5e251dd7561d9d9e83a0b8655084b2a1f8ea91e4bd6b3cedfefd32c9d2"},
+ {file = "grpcio-1.58.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:212f38c6a156862098f6bdc9a79bf850760a751d259d8f8f249fc6d645105855"},
+ {file = "grpcio-1.58.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4b12754af201bb993e6e2efd7812085ddaaef21d0a6f0ff128b97de1ef55aa4a"},
+ {file = "grpcio-1.58.0-cp311-cp311-win32.whl", hash = "sha256:3886b4d56bd4afeac518dbc05933926198aa967a7d1d237a318e6fbc47141577"},
+ {file = "grpcio-1.58.0-cp311-cp311-win_amd64.whl", hash = "sha256:002f228d197fea12797a14e152447044e14fb4fdb2eb5d6cfa496f29ddbf79ef"},
+ {file = "grpcio-1.58.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:b5e8db0aff0a4819946215f156bd722b6f6c8320eb8419567ffc74850c9fd205"},
+ {file = "grpcio-1.58.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:201e550b7e2ede113b63e718e7ece93cef5b0fbf3c45e8fe4541a5a4305acd15"},
+ {file = "grpcio-1.58.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:d79b660681eb9bc66cc7cbf78d1b1b9e335ee56f6ea1755d34a31108b80bd3c8"},
+ {file = "grpcio-1.58.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ef8d4a76d2c7d8065aba829f8d0bc0055495c998dce1964ca5b302d02514fb3"},
+ {file = "grpcio-1.58.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6cba491c638c76d3dc6c191d9c75041ca5b8f5c6de4b8327ecdcab527f130bb4"},
+ {file = "grpcio-1.58.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6801ff6652ecd2aae08ef994a3e49ff53de29e69e9cd0fd604a79ae4e545a95c"},
+ {file = "grpcio-1.58.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:24edec346e69e672daf12b2c88e95c6f737f3792d08866101d8c5f34370c54fd"},
+ {file = "grpcio-1.58.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7e473a7abad9af48e3ab5f3b5d237d18208024d28ead65a459bd720401bd2f8f"},
+ {file = "grpcio-1.58.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:4891bbb4bba58acd1d620759b3be11245bfe715eb67a4864c8937b855b7ed7fa"},
+ {file = "grpcio-1.58.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:e9f995a8a421405958ff30599b4d0eec244f28edc760de82f0412c71c61763d2"},
+ {file = "grpcio-1.58.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:2f85f87e2f087d9f632c085b37440a3169fda9cdde80cb84057c2fc292f8cbdf"},
+ {file = "grpcio-1.58.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb6b92036ff312d5b4182fa72e8735d17aceca74d0d908a7f08e375456f03e07"},
+ {file = "grpcio-1.58.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d81c2b2b24c32139dd2536972f1060678c6b9fbd106842a9fcdecf07b233eccd"},
+ {file = "grpcio-1.58.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:fbcecb6aedd5c1891db1d70efbfbdc126c986645b5dd616a045c07d6bd2dfa86"},
+ {file = "grpcio-1.58.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:92ae871a902cf19833328bd6498ec007b265aabf2fda845ab5bd10abcaf4c8c6"},
+ {file = "grpcio-1.58.0-cp38-cp38-win32.whl", hash = "sha256:dc72e04620d49d3007771c0e0348deb23ca341c0245d610605dddb4ac65a37cb"},
+ {file = "grpcio-1.58.0-cp38-cp38-win_amd64.whl", hash = "sha256:1c1c5238c6072470c7f1614bf7c774ffde6b346a100521de9ce791d1e4453afe"},
+ {file = "grpcio-1.58.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:fe643af248442221db027da43ed43e53b73e11f40c9043738de9a2b4b6ca7697"},
+ {file = "grpcio-1.58.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:128eb1f8e70676d05b1b0c8e6600320fc222b3f8c985a92224248b1367122188"},
+ {file = "grpcio-1.58.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:039003a5e0ae7d41c86c768ef8b3ee2c558aa0a23cf04bf3c23567f37befa092"},
+ {file = "grpcio-1.58.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f061722cad3f9aabb3fbb27f3484ec9d4667b7328d1a7800c3c691a98f16bb0"},
+ {file = "grpcio-1.58.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0af11938acf8cd4cf815c46156bcde36fa5850518120920d52620cc3ec1830"},
+ {file = "grpcio-1.58.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d4cef77ad2fed42b1ba9143465856d7e737279854e444925d5ba45fc1f3ba727"},
+ {file = "grpcio-1.58.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:24765a627eb4d9288ace32d5104161c3654128fe27f2808ecd6e9b0cfa7fc8b9"},
+ {file = "grpcio-1.58.0-cp39-cp39-win32.whl", hash = "sha256:f0241f7eb0d2303a545136c59bc565a35c4fc3b924ccbd69cb482f4828d6f31c"},
+ {file = "grpcio-1.58.0-cp39-cp39-win_amd64.whl", hash = "sha256:dcfba7befe3a55dab6fe1eb7fc9359dc0c7f7272b30a70ae0af5d5b063842f28"},
+ {file = "grpcio-1.58.0.tar.gz", hash = "sha256:532410c51ccd851b706d1fbc00a87be0f5312bd6f8e5dbf89d4e99c7f79d7499"},
]
[package.extras]
-protobuf = ["grpcio-tools (>=1.57.0)"]
+protobuf = ["grpcio-tools (>=1.58.0)"]
[[package]]
name = "grpcio-reflection"
-version = "1.57.0"
+version = "1.58.0"
description = "Standard Protobuf Reflection Service for gRPC"
optional = false
python-versions = ">=3.6"
files = [
- {file = "grpcio-reflection-1.57.0.tar.gz", hash = "sha256:8f63a18729cba995a172f8325235f5094cb066febec75f9a3b1b2e28328aa166"},
- {file = "grpcio_reflection-1.57.0-py3-none-any.whl", hash = "sha256:d7deb8587f9d0095fb5d367c2aa5ce1380e3f23b0f8bca6c00bc404c5429cb6a"},
+ {file = "grpcio-reflection-1.58.0.tar.gz", hash = "sha256:e6048a758d17b6ca1705258e7ee5d926d2960a95ae08ba0929dd233e505acd3d"},
+ {file = "grpcio_reflection-1.58.0-py3-none-any.whl", hash = "sha256:fa18885d8a09cef02c9a6b1d17dfed0279f1f401b06bd1f75958b78ebf1b5c0c"},
]
[package.dependencies]
-grpcio = ">=1.57.0"
+grpcio = ">=1.58.0"
protobuf = ">=4.21.6"
[[package]]
name = "grpcio-status"
-version = "1.57.0"
+version = "1.58.0"
description = "Status proto mapping for gRPC"
optional = false
python-versions = ">=3.6"
files = [
- {file = "grpcio-status-1.57.0.tar.gz", hash = "sha256:b098da99df1eebe58337f8f78e50df990273ccacc1226fddeb47c590e3df9e02"},
- {file = "grpcio_status-1.57.0-py3-none-any.whl", hash = "sha256:15d6af055914ebbc4ed17e55ebfb8e6bb17a45a57fea32e6af19978fb7844690"},
+ {file = "grpcio-status-1.58.0.tar.gz", hash = "sha256:0b42e70c0405a66a82d9e9867fa255fe59e618964a6099b20568c31dd9099766"},
+ {file = "grpcio_status-1.58.0-py3-none-any.whl", hash = "sha256:36d46072b71a00147709ebce49344ac59b4b8960942acf0f813a8a7d6c1c28e0"},
]
[package.dependencies]
googleapis-common-protos = ">=1.5.5"
-grpcio = ">=1.57.0"
+grpcio = ">=1.58.0"
protobuf = ">=4.21.6"
[[package]]
name = "grpcio-tools"
-version = "1.57.0"
+version = "1.58.0"
description = "Protobuf code generator for gRPC"
optional = false
python-versions = ">=3.7"
files = [
- {file = "grpcio-tools-1.57.0.tar.gz", hash = "sha256:2f16130d869ce27ecd623194547b649dd657333ec7e8644cc571c645781a9b85"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:4fb8a8468031f858381a576078924af364a08833d8f8f3237018252c4573a802"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:35bf0dad8a3562043345236c26d0053a856fb06c04d7da652f2ded914e508ae7"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:ec9aab2fb6783c7fc54bc28f58eb75f1ca77594e6b0fd5e5e7a8114a95169fe0"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0cf5fc0a1c23f8ea34b408b72fb0e90eec0f404ad4dba98e8f6da3c9ce34e2ed"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26e69d08a515554e0cfe1ec4d31568836f4b17f0ff82294f957f629388629eb9"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c39a3656576b6fdaaf28abe0467f7a7231df4230c1bee132322dbc3209419e7f"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f64f8ab22d27d4a5693310748d35a696061c3b5c7b8c4fb4ab3b4bc1068b6b56"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-win32.whl", hash = "sha256:d2a134756f4db34759a5cc7f7e43f7eb87540b68d1cca62925593c6fb93924f7"},
- {file = "grpcio_tools-1.57.0-cp310-cp310-win_amd64.whl", hash = "sha256:9a3d60fb8d46ede26c1907c146561b3a9caa20a7aff961bc661ef8226f85a2e9"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:aac98ecad8f7bd4301855669d42a5d97ef7bb34bec2b1e74c7a0641d47e313cf"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:cdd020cb68b51462983b7c2dfbc3eb6ede032b8bf438d4554df0c3f08ce35c76"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:f54081b08419a39221cd646363b5708857c696b3ad4784f1dcf310891e33a5f7"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ed85a0291fff45b67f2557fe7f117d3bc7af8b54b8619d27bf374b5c8b7e3ca2"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e868cd6feb3ef07d4b35be104fe1fd0657db05259ff8f8ec5e08f4f89ca1191d"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:dfb6f6120587b8e228a3cae5ee4985b5bdc18501bad05c49df61965dfc9d70a9"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a7ad7f328e28fc97c356d0f10fb10d8b5151bb65aa7cf14bf8084513f0b7306"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-win32.whl", hash = "sha256:9867f2817b1a0c93c523f89ac6c9d8625548af4620a7ce438bf5a76e23327284"},
- {file = "grpcio_tools-1.57.0-cp311-cp311-win_amd64.whl", hash = "sha256:1f9e917a9f18087f6c14b4d4508fb94fca5c2f96852363a89232fb9b2124ac1f"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:9f2aefa8a37bd2c4db1a3f1aca11377e2766214520fb70e67071f4ff8d8b0fa5"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:850cbda0ec5d24c39e7215ede410276040692ca45d105fbbeada407fa03f0ac0"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:6fa52972c9647876ea35f6dc2b51002a74ed900ec7894586cbb2fe76f64f99de"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0eea89d7542719594e50e2283f51a072978b953e8b3e9fd7c59a2c762d4c1"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3da5240211252fc70a6451fe00c143e2ab2f7bfc2445695ad2ed056b8e48d96"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a0256f8786ac9e4db618a1aa492bb3472569a0946fd3ee862ffe23196323da55"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c026bdf5c1366ce88b7bbe2d8207374d675afd3fd911f60752103de3da4a41d2"},
- {file = "grpcio_tools-1.57.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9053c2f655589545be08b9d6a673e92970173a4bf11a4b9f18cd6e9af626b587"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:81ec4dbb696e095057b2528d11a8da04be6bbe2b967fa07d4ea9ba6354338cbf"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:495e2946406963e0b9f063f76d5af0f2a19517dac2b367b5b044432ac9194296"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:7b46fc6aa8eb7edd18cafcd21fd98703cb6c09e46b507de335fca7f0161dfccb"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb81ff861692111fa81bd85f64584e624cb4013bd66fbce8a209b8893f5ce398"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a42dc220eb5305f470855c9284f4c8e85ae59d6d742cd07946b0cbe5e9ca186"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:90d10d9038ba46a595a223a34f136c9230e3d6d7abc2433dbf0e1c31939d3a8b"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5bc3e6d338aefb052e19cedabe00452be46d0c10a4ed29ee77abb00402e438fe"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-win32.whl", hash = "sha256:34b36217b17b5bea674a414229913e1fd80ede328be51e1b531fcc62abd393b0"},
- {file = "grpcio_tools-1.57.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbde4004a0688400036342ff73e3706e8940483e2871547b1354d59e93a38277"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:784574709b9690dc28696617ea69352e2132352fdfc9bc89afa8e39f99ae538e"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:85ac4e62eb44428cde025fd9ab7554002315fc7880f791c553fc5a0015cc9931"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:dc771d4db5701f280957bbcee91745e0686d00ed1c6aa7e05ba30a58b02d70a1"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3ac06703c412f8167a9062eaf6099409967e33bf98fa5b02be4b4689b6bdf39"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02d78c034109f46032c7217260066d49d41e6bcaf588fa28fa40fe2f83445347"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2db25f15ed44327f2e02d0c4fe741ac966f9500e407047d8a7c7fccf2df65616"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2b417c97936d94874a3ce7ed8deab910f2233e3612134507cfee4af8735c38a6"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-win32.whl", hash = "sha256:f717cce5093e6b6049d9ea6d12fdf3658efdb1a80772f7737db1f8510b876df6"},
- {file = "grpcio_tools-1.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:1c0e8a1a32973a5d59fbcc19232f925e5c48116e9411f788033a31c5ca5130b4"},
+ {file = "grpcio-tools-1.58.0.tar.gz", hash = "sha256:6f4d80ceb591e31ca4dceec747dbe56132e1392a0a9bb1c8fe001d1b5cac898a"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:60c874908f3b40f32f1bb0221f7b3ab65ecb53a4d0a9f0a394f031f1b292c177"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:1852e798f31e5437ca7b37abc910e028b34732fb19364862cedb87b1dab66fad"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:149fb48f53cb691a6328f68bed8e4036c730f7106b7f98e92c2c0403f0b9e93c"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba3d383e5ca93826038b70f326fce8e8d12dd9b2f64d363a3d612f7475f12dd2"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6997511e9d2979f7a2389479682dbb06823f21a904e8fb0a5c6baaf1b4b4a863"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8de0b701da479643f71fad71fe66885cddd89441ae16e2c724939b47742dc72e"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:43cc23908b63fcaefe690b10f68a2d8652c994b5b36ab77d2271d9608c895320"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-win32.whl", hash = "sha256:2c2221123d010dc6231799e63a37f2f4786bf614ef65b23009c387cd20d8b193"},
+ {file = "grpcio_tools-1.58.0-cp310-cp310-win_amd64.whl", hash = "sha256:df2788736bdf58abe7b0e4d6b1ff806f7686c98c5ad900da312252e3322d91c4"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:b6ea5578712cdb29b0ff60bfc6405bf0e8d681b9c71d106dd1cda54fe7fe4e55"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:c29880f491581c83181c0a84a4d11402af2b13166a5266f64e246adf1da7aa66"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:32d51e933c3565414dd0835f930bb28a1cdeba435d9d2c87fa3cf8b1d284db3c"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ad9d77f25514584b1ddc981d70c9e50dfcfc388aa5ba943eee67520c5267ed9"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4882382631e6352819059278a5c878ce0b067008dd490911d16d5616e8a36d85"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d84091a189d848d94645b7c48b61734c12ec03b0d46e5fc0049343a26989ac5c"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:85ac28a9621e9b92a3fc416288c4ce45542db0b4c31b3e23031dd8e0a0ec5590"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-win32.whl", hash = "sha256:7371d8ea80234b29affec145e25569523f549520ed7e53b2aa92bed412cdecfd"},
+ {file = "grpcio_tools-1.58.0-cp311-cp311-win_amd64.whl", hash = "sha256:6997df6e7c5cf4d3ddc764240c1ff6a04b45d70ec28913b38fbc6396ef743e12"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:ac65b8d6e3acaf88b815edf9af88ff844b6600ff3d2591c05ba4f655b45d5fb4"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:88e8191d0dd789bebf42533808728f5ce75d2c51e2a72bdf20abe5b5e3fbec42"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:a3dbece2a121761499a659b799979d4b738586d1065439053de553773eee11ca"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1086fe240c4c879b9721952b47d46996deb283c2d9355a8dc24a804811aacf70"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7ae3dca059d5b358dd03fb63277428fa7d771605d4074a019138dd38d70719a"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3f8904ac7fc3da2e874f00b3a986e8b7e004f499344a8e7eb213c26dfb025041"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:aadbd8393ae332e49731adb31e741f2e689989150569b7acc939f5ea43124e2d"},
+ {file = "grpcio_tools-1.58.0-cp37-cp37m-win_amd64.whl", hash = "sha256:1cb6e24194786687d4f23c64de1f0ce553af51de22746911bc37340f85f9783e"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:6ec43909095c630df3e479e77469bdad367067431f4af602f6ccb978a3b78afd"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:4be49ed320b0ebcbc21d19ef555fbf229c1c452105522b728e1171ee2052078e"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:28eefebddec3d3adf19baca78f8b82a2287d358e1b1575ae018cdca8eacc6269"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ef8c696e9d78676cc3f583a92bbbf2c84e94e350f7ad22f150a52559f4599d1"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9aeb5949e46558d21c51fd3ec3eeecc59c94dbca76c67c0a80d3da6b7437930c"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6f7144aad9396d35fb1b80429600a970b559c2ad4d07020eeb180fe83cea2bee"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4ee26e9253a721fff355737649678535f76cf5d642aa3ac0cd937832559b90af"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-win32.whl", hash = "sha256:343f572312039059a8797d6e29a7fc62196e73131ab01755660a9d48202267c1"},
+ {file = "grpcio_tools-1.58.0-cp38-cp38-win_amd64.whl", hash = "sha256:cd7acfbb43b7338a78cf4a67528d05530d574d92b7c829d185b78dfc451d158f"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:46628247fbce86d18232eead24bd22ed0826c79f3fe2fc2fbdbde45971361049"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:51587842a54e025a3d0d37afcf4ef2b7ac1def9a5d17448665cb424b53d6c287"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:a062ae3072a2a39a3c057f4d68b57b021f1dd2956cd09aab39709f6af494e1de"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eec3c93a08df11c80ef1c29a616bcbb0d83dbc6ea41b48306fcacc720416dfa7"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b63f823ac991ff77104da614d2a2485a59d37d57830eb2e387a6e2a3edc7fa2b"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:579c11a9f198847ed48dbc4f211c67fe96a73320b87c81f01b044b72e24a7d77"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6ca2fc1dd8049d417a5034d944c9df05cee76f855b3e431627ab4292e7c01c47"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-win32.whl", hash = "sha256:453023120114c35d3d9d6717ea0820e5d5c140f51f9d0b621de4397ff854471b"},
+ {file = "grpcio_tools-1.58.0-cp39-cp39-win_amd64.whl", hash = "sha256:b6c896f1df99c35cf062d4803c15663ff00a33ff09add28baa6e475cf6b5e258"},
]
[package.dependencies]
-grpcio = ">=1.57.0"
+grpcio = ">=1.58.0"
protobuf = ">=4.21.6,<5.0dev"
setuptools = "*"
@@ -871,6 +869,16 @@ files = [
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"},
+ {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"},
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
@@ -1051,36 +1059,43 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
[[package]]
name = "numpy"
-version = "1.25.2"
+version = "1.26.0"
description = "Fundamental package for array computing in Python"
optional = false
-python-versions = ">=3.9"
+python-versions = "<3.13,>=3.9"
files = [
- {file = "numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:db3ccc4e37a6873045580d413fe79b68e47a681af8db2e046f1dacfa11f86eb3"},
- {file = "numpy-1.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:90319e4f002795ccfc9050110bbbaa16c944b1c37c0baeea43c5fb881693ae1f"},
- {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfe4a913e29b418d096e696ddd422d8a5d13ffba4ea91f9f60440a3b759b0187"},
- {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f08f2e037bba04e707eebf4bc934f1972a315c883a9e0ebfa8a7756eabf9e357"},
- {file = "numpy-1.25.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bec1e7213c7cb00d67093247f8c4db156fd03075f49876957dca4711306d39c9"},
- {file = "numpy-1.25.2-cp310-cp310-win32.whl", hash = "sha256:7dc869c0c75988e1c693d0e2d5b26034644399dd929bc049db55395b1379e044"},
- {file = "numpy-1.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:834b386f2b8210dca38c71a6e0f4fd6922f7d3fcff935dbe3a570945acb1b545"},
- {file = "numpy-1.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c5462d19336db4560041517dbb7759c21d181a67cb01b36ca109b2ae37d32418"},
- {file = "numpy-1.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5652ea24d33585ea39eb6a6a15dac87a1206a692719ff45d53c5282e66d4a8f"},
- {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d60fbae8e0019865fc4784745814cff1c421df5afee233db6d88ab4f14655a2"},
- {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e7f0f7f6d0eee8364b9a6304c2845b9c491ac706048c7e8cf47b83123b8dbf"},
- {file = "numpy-1.25.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb33d5a1cf360304754913a350edda36d5b8c5331a8237268c48f91253c3a364"},
- {file = "numpy-1.25.2-cp311-cp311-win32.whl", hash = "sha256:5883c06bb92f2e6c8181df7b39971a5fb436288db58b5a1c3967702d4278691d"},
- {file = "numpy-1.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:5c97325a0ba6f9d041feb9390924614b60b99209a71a69c876f71052521d42a4"},
- {file = "numpy-1.25.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b79e513d7aac42ae918db3ad1341a015488530d0bb2a6abcbdd10a3a829ccfd3"},
- {file = "numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb942bfb6f84df5ce05dbf4b46673ffed0d3da59f13635ea9b926af3deb76926"},
- {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0746410e73384e70d286f93abf2520035250aad8c5714240b0492a7302fdca"},
- {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7806500e4f5bdd04095e849265e55de20d8cc4b661b038957354327f6d9b295"},
- {file = "numpy-1.25.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8b77775f4b7df768967a7c8b3567e309f617dd5e99aeb886fa14dc1a0791141f"},
- {file = "numpy-1.25.2-cp39-cp39-win32.whl", hash = "sha256:2792d23d62ec51e50ce4d4b7d73de8f67a2fd3ea710dcbc8563a51a03fb07b01"},
- {file = "numpy-1.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:76b4115d42a7dfc5d485d358728cdd8719be33cc5ec6ec08632a5d6fca2ed380"},
- {file = "numpy-1.25.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a1329e26f46230bf77b02cc19e900db9b52f398d6722ca853349a782d4cff55"},
- {file = "numpy-1.25.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c3abc71e8b6edba80a01a52e66d83c5d14433cbcd26a40c329ec7ed09f37901"},
- {file = "numpy-1.25.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1b9735c27cea5d995496f46a8b1cd7b408b3f34b6d50459d9ac8fe3a20cc17bf"},
- {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"},
+ {file = "numpy-1.26.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8db2f125746e44dce707dd44d4f4efeea8d7e2b43aace3f8d1f235cfa2733dd"},
+ {file = "numpy-1.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0621f7daf973d34d18b4e4bafb210bbaf1ef5e0100b5fa750bd9cde84c7ac292"},
+ {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51be5f8c349fdd1a5568e72713a21f518e7d6707bcf8503b528b88d33b57dc68"},
+ {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:767254ad364991ccfc4d81b8152912e53e103ec192d1bb4ea6b1f5a7117040be"},
+ {file = "numpy-1.26.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:436c8e9a4bdeeee84e3e59614d38c3dbd3235838a877af8c211cfcac8a80b8d3"},
+ {file = "numpy-1.26.0-cp310-cp310-win32.whl", hash = "sha256:c2e698cb0c6dda9372ea98a0344245ee65bdc1c9dd939cceed6bb91256837896"},
+ {file = "numpy-1.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:09aaee96c2cbdea95de76ecb8a586cb687d281c881f5f17bfc0fb7f5890f6b91"},
+ {file = "numpy-1.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:637c58b468a69869258b8ae26f4a4c6ff8abffd4a8334c830ffb63e0feefe99a"},
+ {file = "numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:306545e234503a24fe9ae95ebf84d25cba1fdc27db971aa2d9f1ab6bba19a9dd"},
+ {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6adc33561bd1d46f81131d5352348350fc23df4d742bb246cdfca606ea1208"},
+ {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e062aa24638bb5018b7841977c360d2f5917268d125c833a686b7cbabbec496c"},
+ {file = "numpy-1.26.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:546b7dd7e22f3c6861463bebb000646fa730e55df5ee4a0224408b5694cc6148"},
+ {file = "numpy-1.26.0-cp311-cp311-win32.whl", hash = "sha256:c0b45c8b65b79337dee5134d038346d30e109e9e2e9d43464a2970e5c0e93229"},
+ {file = "numpy-1.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:eae430ecf5794cb7ae7fa3808740b015aa80747e5266153128ef055975a72b99"},
+ {file = "numpy-1.26.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:166b36197e9debc4e384e9c652ba60c0bacc216d0fc89e78f973a9760b503388"},
+ {file = "numpy-1.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f042f66d0b4ae6d48e70e28d487376204d3cbf43b84c03bac57e28dac6151581"},
+ {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5e18e5b14a7560d8acf1c596688f4dfd19b4f2945b245a71e5af4ddb7422feb"},
+ {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f6bad22a791226d0a5c7c27a80a20e11cfe09ad5ef9084d4d3fc4a299cca505"},
+ {file = "numpy-1.26.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4acc65dd65da28060e206c8f27a573455ed724e6179941edb19f97e58161bb69"},
+ {file = "numpy-1.26.0-cp312-cp312-win32.whl", hash = "sha256:bb0d9a1aaf5f1cb7967320e80690a1d7ff69f1d47ebc5a9bea013e3a21faec95"},
+ {file = "numpy-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:ee84ca3c58fe48b8ddafdeb1db87388dce2c3c3f701bf447b05e4cfcc3679112"},
+ {file = "numpy-1.26.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4a873a8180479bc829313e8d9798d5234dfacfc2e8a7ac188418189bb8eafbd2"},
+ {file = "numpy-1.26.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:914b28d3215e0c721dc75db3ad6d62f51f630cb0c277e6b3bcb39519bed10bd8"},
+ {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c78a22e95182fb2e7874712433eaa610478a3caf86f28c621708d35fa4fd6e7f"},
+ {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f737708b366c36b76e953c46ba5827d8c27b7a8c9d0f471810728e5a2fe57c"},
+ {file = "numpy-1.26.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b44e6a09afc12952a7d2a58ca0a2429ee0d49a4f89d83a0a11052da696440e49"},
+ {file = "numpy-1.26.0-cp39-cp39-win32.whl", hash = "sha256:5671338034b820c8d58c81ad1dafc0ed5a00771a82fccc71d6438df00302094b"},
+ {file = "numpy-1.26.0-cp39-cp39-win_amd64.whl", hash = "sha256:020cdbee66ed46b671429c7265cf00d8ac91c046901c55684954c3958525dab2"},
+ {file = "numpy-1.26.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0792824ce2f7ea0c82ed2e4fecc29bb86bee0567a080dacaf2e0a01fe7654369"},
+ {file = "numpy-1.26.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d484292eaeb3e84a51432a94f53578689ffdea3f90e10c8b203a99be5af57d8"},
+ {file = "numpy-1.26.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:186ba67fad3c60dbe8a3abff3b67a91351100f2661c8e2a80364ae6279720299"},
+ {file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
]
[[package]]
@@ -1250,70 +1265,71 @@ files = [
[[package]]
name = "pandas"
-version = "2.0.3"
+version = "2.1.1"
description = "Powerful data structures for data analysis, time series, and statistics"
optional = true
-python-versions = ">=3.8"
+python-versions = ">=3.9"
files = [
- {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"},
- {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"},
- {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"},
- {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"},
- {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"},
- {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"},
- {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"},
- {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"},
- {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"},
- {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"},
- {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"},
- {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"},
- {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"},
- {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"},
- {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"},
- {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"},
- {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"},
- {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"},
- {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"},
- {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"},
- {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"},
- {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"},
- {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"},
- {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"},
- {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"},
+ {file = "pandas-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58d997dbee0d4b64f3cb881a24f918b5f25dd64ddf31f467bb9b67ae4c63a1e4"},
+ {file = "pandas-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02304e11582c5d090e5a52aec726f31fe3f42895d6bfc1f28738f9b64b6f0614"},
+ {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffa8f0966de2c22de408d0e322db2faed6f6e74265aa0856f3824813cf124363"},
+ {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1f84c144dee086fe4f04a472b5cd51e680f061adf75c1ae4fc3a9275560f8f4"},
+ {file = "pandas-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:75ce97667d06d69396d72be074f0556698c7f662029322027c226fd7a26965cb"},
+ {file = "pandas-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:4c3f32fd7c4dccd035f71734df39231ac1a6ff95e8bdab8d891167197b7018d2"},
+ {file = "pandas-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e2959720b70e106bb1d8b6eadd8ecd7c8e99ccdbe03ee03260877184bb2877d"},
+ {file = "pandas-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25e8474a8eb258e391e30c288eecec565bfed3e026f312b0cbd709a63906b6f8"},
+ {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8bd1685556f3374520466998929bade3076aeae77c3e67ada5ed2b90b4de7f0"},
+ {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc3657869c7902810f32bd072f0740487f9e030c1a3ab03e0af093db35a9d14e"},
+ {file = "pandas-2.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:05674536bd477af36aa2effd4ec8f71b92234ce0cc174de34fd21e2ee99adbc2"},
+ {file = "pandas-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b407381258a667df49d58a1b637be33e514b07f9285feb27769cedb3ab3d0b3a"},
+ {file = "pandas-2.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c747793c4e9dcece7bb20156179529898abf505fe32cb40c4052107a3c620b49"},
+ {file = "pandas-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3bcad1e6fb34b727b016775bea407311f7721db87e5b409e6542f4546a4951ea"},
+ {file = "pandas-2.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5ec7740f9ccb90aec64edd71434711f58ee0ea7f5ed4ac48be11cfa9abf7317"},
+ {file = "pandas-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29deb61de5a8a93bdd033df328441a79fcf8dd3c12d5ed0b41a395eef9cd76f0"},
+ {file = "pandas-2.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4f99bebf19b7e03cf80a4e770a3e65eee9dd4e2679039f542d7c1ace7b7b1daa"},
+ {file = "pandas-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:84e7e910096416adec68075dc87b986ff202920fb8704e6d9c8c9897fe7332d6"},
+ {file = "pandas-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366da7b0e540d1b908886d4feb3d951f2f1e572e655c1160f5fde28ad4abb750"},
+ {file = "pandas-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e50e72b667415a816ac27dfcfe686dc5a0b02202e06196b943d54c4f9c7693e"},
+ {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1ab6a25da197f03ebe6d8fa17273126120874386b4ac11c1d687df288542dd"},
+ {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0dbfea0dd3901ad4ce2306575c54348d98499c95be01b8d885a2737fe4d7a98"},
+ {file = "pandas-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0489b0e6aa3d907e909aef92975edae89b1ee1654db5eafb9be633b0124abe97"},
+ {file = "pandas-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:4cdb0fab0400c2cb46dafcf1a0fe084c8bb2480a1fa8d81e19d15e12e6d4ded2"},
+ {file = "pandas-2.1.1.tar.gz", hash = "sha256:fecb198dc389429be557cde50a2d46da8434a17fe37d7d41ff102e3987fd947b"},
]
[package.dependencies]
numpy = [
- {version = ">=1.20.3", markers = "python_version < \"3.10\""},
- {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
- {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
+ {version = ">=1.22.4", markers = "python_version < \"3.11\""},
+ {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
tzdata = ">=2022.1"
[package.extras]
-all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"]
-aws = ["s3fs (>=2021.08.0)"]
-clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"]
-compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"]
-computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"]
-excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"]
+all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"]
+aws = ["s3fs (>=2022.05.0)"]
+clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"]
+compression = ["zstandard (>=0.17.0)"]
+computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"]
+consortium-standard = ["dataframe-api-compat (>=0.1.7)"]
+excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"]
feather = ["pyarrow (>=7.0.0)"]
-fss = ["fsspec (>=2021.07.0)"]
-gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"]
-hdf5 = ["tables (>=3.6.1)"]
-html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"]
-mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"]
-output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"]
+fss = ["fsspec (>=2022.05.0)"]
+gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"]
+hdf5 = ["tables (>=3.7.0)"]
+html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"]
+mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"]
+output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"]
parquet = ["pyarrow (>=7.0.0)"]
-performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"]
+performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"]
plot = ["matplotlib (>=3.6.1)"]
-postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"]
-spss = ["pyreadstat (>=1.1.2)"]
-sql-other = ["SQLAlchemy (>=1.4.16)"]
-test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
-xml = ["lxml (>=4.6.3)"]
+postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"]
+spss = ["pyreadstat (>=1.1.5)"]
+sql-other = ["SQLAlchemy (>=1.4.36)"]
+test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
+xml = ["lxml (>=4.8.0)"]
[[package]]
name = "peft"
@@ -1344,67 +1360,65 @@ test = ["black (>=22.0,<23.0)", "datasets", "diffusers", "hf-doc-builder", "para
[[package]]
name = "pillow"
-version = "10.0.0"
+version = "10.0.1"
description = "Python Imaging Library (Fork)"
optional = false
python-versions = ">=3.8"
files = [
- {file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"},
- {file = "Pillow-10.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d5db32e2a6ccbb3d34d87c87b432959e0db29755727afb37290e10f6e8e62614"},
- {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edf4392b77bdc81f36e92d3a07a5cd072f90253197f4a52a55a8cec48a12483b"},
- {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:520f2a520dc040512699f20fa1c363eed506e94248d71f85412b625026f6142c"},
- {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:8c11160913e3dd06c8ffdb5f233a4f254cb449f4dfc0f8f4549eda9e542c93d1"},
- {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a74ba0c356aaa3bb8e3eb79606a87669e7ec6444be352870623025d75a14a2bf"},
- {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5d0dae4cfd56969d23d94dc8e89fb6a217be461c69090768227beb8ed28c0a3"},
- {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22c10cc517668d44b211717fd9775799ccec4124b9a7f7b3635fc5386e584992"},
- {file = "Pillow-10.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dffe31a7f47b603318c609f378ebcd57f1554a3a6a8effbc59c3c69f804296de"},
- {file = "Pillow-10.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:9fb218c8a12e51d7ead2a7c9e101a04982237d4855716af2e9499306728fb485"},
- {file = "Pillow-10.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d35e3c8d9b1268cbf5d3670285feb3528f6680420eafe35cccc686b73c1e330f"},
- {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ed64f9ca2f0a95411e88a4efbd7a29e5ce2cea36072c53dd9d26d9c76f753b3"},
- {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6eb5502f45a60a3f411c63187db83a3d3107887ad0d036c13ce836f8a36f1d"},
- {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c1fbe7621c167ecaa38ad29643d77a9ce7311583761abf7836e1510c580bf3dd"},
- {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cd25d2a9d2b36fcb318882481367956d2cf91329f6892fe5d385c346c0649629"},
- {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3b08d4cc24f471b2c8ca24ec060abf4bebc6b144cb89cba638c720546b1cf538"},
- {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d737a602fbd82afd892ca746392401b634e278cb65d55c4b7a8f48e9ef8d008d"},
- {file = "Pillow-10.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3a82c40d706d9aa9734289740ce26460a11aeec2d9c79b7af87bb35f0073c12f"},
- {file = "Pillow-10.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:bc2ec7c7b5d66b8ec9ce9f720dbb5fa4bace0f545acd34870eff4a369b44bf37"},
- {file = "Pillow-10.0.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:d80cf684b541685fccdd84c485b31ce73fc5c9b5d7523bf1394ce134a60c6883"},
- {file = "Pillow-10.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76de421f9c326da8f43d690110f0e79fe3ad1e54be811545d7d91898b4c8493e"},
- {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81ff539a12457809666fef6624684c008e00ff6bf455b4b89fd00a140eecd640"},
- {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce543ed15570eedbb85df19b0a1a7314a9c8141a36ce089c0a894adbfccb4568"},
- {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:685ac03cc4ed5ebc15ad5c23bc555d68a87777586d970c2c3e216619a5476223"},
- {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d72e2ecc68a942e8cf9739619b7f408cc7b272b279b56b2c83c6123fcfa5cdff"},
- {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d50b6aec14bc737742ca96e85d6d0a5f9bfbded018264b3b70ff9d8c33485551"},
- {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:00e65f5e822decd501e374b0650146063fbb30a7264b4d2744bdd7b913e0cab5"},
- {file = "Pillow-10.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:f31f9fdbfecb042d046f9d91270a0ba28368a723302786c0009ee9b9f1f60199"},
- {file = "Pillow-10.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:1ce91b6ec08d866b14413d3f0bbdea7e24dfdc8e59f562bb77bc3fe60b6144ca"},
- {file = "Pillow-10.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:349930d6e9c685c089284b013478d6f76e3a534e36ddfa912cde493f235372f3"},
- {file = "Pillow-10.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3a684105f7c32488f7153905a4e3015a3b6c7182e106fe3c37fbb5ef3e6994c3"},
- {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4f69b3700201b80bb82c3a97d5e9254084f6dd5fb5b16fc1a7b974260f89f43"},
- {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f07ea8d2f827d7d2a49ecf1639ec02d75ffd1b88dcc5b3a61bbb37a8759ad8d"},
- {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:040586f7d37b34547153fa383f7f9aed68b738992380ac911447bb78f2abe530"},
- {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f88a0b92277de8e3ca715a0d79d68dc82807457dae3ab8699c758f07c20b3c51"},
- {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c7cf14a27b0d6adfaebb3ae4153f1e516df54e47e42dcc073d7b3d76111a8d86"},
- {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3400aae60685b06bb96f99a21e1ada7bc7a413d5f49bce739828ecd9391bb8f7"},
- {file = "Pillow-10.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbc02381779d412145331789b40cc7b11fdf449e5d94f6bc0b080db0a56ea3f0"},
- {file = "Pillow-10.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9211e7ad69d7c9401cfc0e23d49b69ca65ddd898976d660a2fa5904e3d7a9baa"},
- {file = "Pillow-10.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:faaf07ea35355b01a35cb442dd950d8f1bb5b040a7787791a535de13db15ed90"},
- {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9f72a021fbb792ce98306ffb0c348b3c9cb967dce0f12a49aa4c3d3fdefa967"},
- {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f7c16705f44e0504a3a2a14197c1f0b32a95731d251777dcb060aa83022cb2d"},
- {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:76edb0a1fa2b4745fb0c99fb9fb98f8b180a1bbceb8be49b087e0b21867e77d3"},
- {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:368ab3dfb5f49e312231b6f27b8820c823652b7cd29cfbd34090565a015e99ba"},
- {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:608bfdee0d57cf297d32bcbb3c728dc1da0907519d1784962c5f0c68bb93e5a3"},
- {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5c6e3df6bdd396749bafd45314871b3d0af81ff935b2d188385e970052091017"},
- {file = "Pillow-10.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:7be600823e4c8631b74e4a0d38384c73f680e6105a7d3c6824fcf226c178c7e6"},
- {file = "Pillow-10.0.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:92be919bbc9f7d09f7ae343c38f5bb21c973d2576c1d45600fce4b74bafa7ac0"},
- {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8182b523b2289f7c415f589118228d30ac8c355baa2f3194ced084dac2dbba"},
- {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:38250a349b6b390ee6047a62c086d3817ac69022c127f8a5dc058c31ccef17f3"},
- {file = "Pillow-10.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:88af2003543cc40c80f6fca01411892ec52b11021b3dc22ec3bc9d5afd1c5334"},
- {file = "Pillow-10.0.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c189af0545965fa8d3b9613cfdb0cd37f9d71349e0f7750e1fd704648d475ed2"},
- {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce7b031a6fc11365970e6a5686d7ba8c63e4c1cf1ea143811acbb524295eabed"},
- {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:db24668940f82321e746773a4bc617bfac06ec831e5c88b643f91f122a785684"},
- {file = "Pillow-10.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:efe8c0681042536e0d06c11f48cebe759707c9e9abf880ee213541c5b46c5bf3"},
- {file = "Pillow-10.0.0.tar.gz", hash = "sha256:9c82b5b3e043c7af0d95792d0d20ccf68f61a1fec6b3530e718b688422727396"},
+ {file = "Pillow-10.0.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:8f06be50669087250f319b706decf69ca71fdecd829091a37cc89398ca4dc17a"},
+ {file = "Pillow-10.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:50bd5f1ebafe9362ad622072a1d2f5850ecfa44303531ff14353a4059113b12d"},
+ {file = "Pillow-10.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6a90167bcca1216606223a05e2cf991bb25b14695c518bc65639463d7db722d"},
+ {file = "Pillow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f11c9102c56ffb9ca87134bd025a43d2aba3f1155f508eff88f694b33a9c6d19"},
+ {file = "Pillow-10.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:186f7e04248103482ea6354af6d5bcedb62941ee08f7f788a1c7707bc720c66f"},
+ {file = "Pillow-10.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:0462b1496505a3462d0f35dc1c4d7b54069747d65d00ef48e736acda2c8cbdff"},
+ {file = "Pillow-10.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d889b53ae2f030f756e61a7bff13684dcd77e9af8b10c6048fb2c559d6ed6eaf"},
+ {file = "Pillow-10.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:552912dbca585b74d75279a7570dd29fa43b6d93594abb494ebb31ac19ace6bd"},
+ {file = "Pillow-10.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:787bb0169d2385a798888e1122c980c6eff26bf941a8ea79747d35d8f9210ca0"},
+ {file = "Pillow-10.0.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:fd2a5403a75b54661182b75ec6132437a181209b901446ee5724b589af8edef1"},
+ {file = "Pillow-10.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2d7e91b4379f7a76b31c2dda84ab9e20c6220488e50f7822e59dac36b0cd92b1"},
+ {file = "Pillow-10.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19e9adb3f22d4c416e7cd79b01375b17159d6990003633ff1d8377e21b7f1b21"},
+ {file = "Pillow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93139acd8109edcdeffd85e3af8ae7d88b258b3a1e13a038f542b79b6d255c54"},
+ {file = "Pillow-10.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:92a23b0431941a33242b1f0ce6c88a952e09feeea9af4e8be48236a68ffe2205"},
+ {file = "Pillow-10.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cbe68deb8580462ca0d9eb56a81912f59eb4542e1ef8f987405e35a0179f4ea2"},
+ {file = "Pillow-10.0.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:522ff4ac3aaf839242c6f4e5b406634bfea002469656ae8358644fc6c4856a3b"},
+ {file = "Pillow-10.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:84efb46e8d881bb06b35d1d541aa87f574b58e87f781cbba8d200daa835b42e1"},
+ {file = "Pillow-10.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:898f1d306298ff40dc1b9ca24824f0488f6f039bc0e25cfb549d3195ffa17088"},
+ {file = "Pillow-10.0.1-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:bcf1207e2f2385a576832af02702de104be71301c2696d0012b1b93fe34aaa5b"},
+ {file = "Pillow-10.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5d6c9049c6274c1bb565021367431ad04481ebb54872edecfcd6088d27edd6ed"},
+ {file = "Pillow-10.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28444cb6ad49726127d6b340217f0627abc8732f1194fd5352dec5e6a0105635"},
+ {file = "Pillow-10.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de596695a75496deb3b499c8c4f8e60376e0516e1a774e7bc046f0f48cd620ad"},
+ {file = "Pillow-10.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:2872f2d7846cf39b3dbff64bc1104cc48c76145854256451d33c5faa55c04d1a"},
+ {file = "Pillow-10.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4ce90f8a24e1c15465048959f1e94309dfef93af272633e8f37361b824532e91"},
+ {file = "Pillow-10.0.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ee7810cf7c83fa227ba9125de6084e5e8b08c59038a7b2c9045ef4dde61663b4"},
+ {file = "Pillow-10.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:b1be1c872b9b5fcc229adeadbeb51422a9633abd847c0ff87dc4ef9bb184ae08"},
+ {file = "Pillow-10.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:98533fd7fa764e5f85eebe56c8e4094db912ccbe6fbf3a58778d543cadd0db08"},
+ {file = "Pillow-10.0.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:764d2c0daf9c4d40ad12fbc0abd5da3af7f8aa11daf87e4fa1b834000f4b6b0a"},
+ {file = "Pillow-10.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fcb59711009b0168d6ee0bd8fb5eb259c4ab1717b2f538bbf36bacf207ef7a68"},
+ {file = "Pillow-10.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:697a06bdcedd473b35e50a7e7506b1d8ceb832dc238a336bd6f4f5aa91a4b500"},
+ {file = "Pillow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f665d1e6474af9f9da5e86c2a3a2d2d6204e04d5af9c06b9d42afa6ebde3f21"},
+ {file = "Pillow-10.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2fa6dd2661838c66f1a5473f3b49ab610c98a128fc08afbe81b91a1f0bf8c51d"},
+ {file = "Pillow-10.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:3a04359f308ebee571a3127fdb1bd01f88ba6f6fb6d087f8dd2e0d9bff43f2a7"},
+ {file = "Pillow-10.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:723bd25051454cea9990203405fa6b74e043ea76d4968166dfd2569b0210886a"},
+ {file = "Pillow-10.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:71671503e3015da1b50bd18951e2f9daf5b6ffe36d16f1eb2c45711a301521a7"},
+ {file = "Pillow-10.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:44e7e4587392953e5e251190a964675f61e4dae88d1e6edbe9f36d6243547ff3"},
+ {file = "Pillow-10.0.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:3855447d98cced8670aaa63683808df905e956f00348732448b5a6df67ee5849"},
+ {file = "Pillow-10.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ed2d9c0704f2dc4fa980b99d565c0c9a543fe5101c25b3d60488b8ba80f0cce1"},
+ {file = "Pillow-10.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5bb289bb835f9fe1a1e9300d011eef4d69661bb9b34d5e196e5e82c4cb09b37"},
+ {file = "Pillow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a0d3e54ab1df9df51b914b2233cf779a5a10dfd1ce339d0421748232cea9876"},
+ {file = "Pillow-10.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:2cc6b86ece42a11f16f55fe8903595eff2b25e0358dec635d0a701ac9586588f"},
+ {file = "Pillow-10.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ca26ba5767888c84bf5a0c1a32f069e8204ce8c21d00a49c90dabeba00ce0145"},
+ {file = "Pillow-10.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f0b4b06da13275bc02adfeb82643c4a6385bd08d26f03068c2796f60d125f6f2"},
+ {file = "Pillow-10.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bc2e3069569ea9dbe88d6b8ea38f439a6aad8f6e7a6283a38edf61ddefb3a9bf"},
+ {file = "Pillow-10.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8b451d6ead6e3500b6ce5c7916a43d8d8d25ad74b9102a629baccc0808c54971"},
+ {file = "Pillow-10.0.1-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:32bec7423cdf25c9038fef614a853c9d25c07590e1a870ed471f47fb80b244db"},
+ {file = "Pillow-10.0.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cf63d2c6928b51d35dfdbda6f2c1fddbe51a6bc4a9d4ee6ea0e11670dd981e"},
+ {file = "Pillow-10.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f6d3d4c905e26354e8f9d82548475c46d8e0889538cb0657aa9c6f0872a37aa4"},
+ {file = "Pillow-10.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:847e8d1017c741c735d3cd1883fa7b03ded4f825a6e5fcb9378fd813edee995f"},
+ {file = "Pillow-10.0.1-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:7f771e7219ff04b79e231d099c0a28ed83aa82af91fd5fa9fdb28f5b8d5addaf"},
+ {file = "Pillow-10.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:459307cacdd4138edee3875bbe22a2492519e060660eaf378ba3b405d1c66317"},
+ {file = "Pillow-10.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b059ac2c4c7a97daafa7dc850b43b2d3667def858a4f112d1aa082e5c3d6cf7d"},
+ {file = "Pillow-10.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d6caf3cd38449ec3cd8a68b375e0c6fe4b6fd04edb6c9766b55ef84a6e8ddf2d"},
+ {file = "Pillow-10.0.1.tar.gz", hash = "sha256:d72967b06be9300fed5cfbc8b5bafceec48bf7cdc7dab66b1d2549035287191d"},
]
[package.extras]
@@ -1428,24 +1442,24 @@ testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "protobuf"
-version = "4.24.2"
+version = "4.24.3"
description = ""
optional = false
python-versions = ">=3.7"
files = [
- {file = "protobuf-4.24.2-cp310-abi3-win32.whl", hash = "sha256:58e12d2c1aa428ece2281cef09bbaa6938b083bcda606db3da4e02e991a0d924"},
- {file = "protobuf-4.24.2-cp310-abi3-win_amd64.whl", hash = "sha256:77700b55ba41144fc64828e02afb41901b42497b8217b558e4a001f18a85f2e3"},
- {file = "protobuf-4.24.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:237b9a50bd3b7307d0d834c1b0eb1a6cd47d3f4c2da840802cd03ea288ae8880"},
- {file = "protobuf-4.24.2-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:25ae91d21e3ce8d874211110c2f7edd6384816fb44e06b2867afe35139e1fd1c"},
- {file = "protobuf-4.24.2-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:c00c3c7eb9ad3833806e21e86dca448f46035242a680f81c3fe068ff65e79c74"},
- {file = "protobuf-4.24.2-cp37-cp37m-win32.whl", hash = "sha256:4e69965e7e54de4db989289a9b971a099e626f6167a9351e9d112221fc691bc1"},
- {file = "protobuf-4.24.2-cp37-cp37m-win_amd64.whl", hash = "sha256:c5cdd486af081bf752225b26809d2d0a85e575b80a84cde5172a05bbb1990099"},
- {file = "protobuf-4.24.2-cp38-cp38-win32.whl", hash = "sha256:6bd26c1fa9038b26c5c044ee77e0ecb18463e957fefbaeb81a3feb419313a54e"},
- {file = "protobuf-4.24.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb7aa97c252279da65584af0456f802bd4b2de429eb945bbc9b3d61a42a8cd16"},
- {file = "protobuf-4.24.2-cp39-cp39-win32.whl", hash = "sha256:2b23bd6e06445699b12f525f3e92a916f2dcf45ffba441026357dea7fa46f42b"},
- {file = "protobuf-4.24.2-cp39-cp39-win_amd64.whl", hash = "sha256:839952e759fc40b5d46be319a265cf94920174d88de31657d5622b5d8d6be5cd"},
- {file = "protobuf-4.24.2-py3-none-any.whl", hash = "sha256:3b7b170d3491ceed33f723bbf2d5a260f8a4e23843799a3906f16ef736ef251e"},
- {file = "protobuf-4.24.2.tar.gz", hash = "sha256:7fda70797ddec31ddfa3576cbdcc3ddbb6b3078b737a1a87ab9136af0570cd6e"},
+ {file = "protobuf-4.24.3-cp310-abi3-win32.whl", hash = "sha256:20651f11b6adc70c0f29efbe8f4a94a74caf61b6200472a9aea6e19898f9fcf4"},
+ {file = "protobuf-4.24.3-cp310-abi3-win_amd64.whl", hash = "sha256:3d42e9e4796a811478c783ef63dc85b5a104b44aaaca85d4864d5b886e4b05e3"},
+ {file = "protobuf-4.24.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:6e514e8af0045be2b56e56ae1bb14f43ce7ffa0f68b1c793670ccbe2c4fc7d2b"},
+ {file = "protobuf-4.24.3-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:ba53c2f04798a326774f0e53b9c759eaef4f6a568ea7072ec6629851c8435959"},
+ {file = "protobuf-4.24.3-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:f6ccbcf027761a2978c1406070c3788f6de4a4b2cc20800cc03d52df716ad675"},
+ {file = "protobuf-4.24.3-cp37-cp37m-win32.whl", hash = "sha256:1b182c7181a2891e8f7f3a1b5242e4ec54d1f42582485a896e4de81aa17540c2"},
+ {file = "protobuf-4.24.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b0271a701e6782880d65a308ba42bc43874dabd1a0a0f41f72d2dac3b57f8e76"},
+ {file = "protobuf-4.24.3-cp38-cp38-win32.whl", hash = "sha256:e29d79c913f17a60cf17c626f1041e5288e9885c8579832580209de8b75f2a52"},
+ {file = "protobuf-4.24.3-cp38-cp38-win_amd64.whl", hash = "sha256:067f750169bc644da2e1ef18c785e85071b7c296f14ac53e0900e605da588719"},
+ {file = "protobuf-4.24.3-cp39-cp39-win32.whl", hash = "sha256:2da777d34b4f4f7613cdf85c70eb9a90b1fbef9d36ae4a0ccfe014b0b07906f1"},
+ {file = "protobuf-4.24.3-cp39-cp39-win_amd64.whl", hash = "sha256:f631bb982c5478e0c1c70eab383af74a84be66945ebf5dd6b06fc90079668d0b"},
+ {file = "protobuf-4.24.3-py3-none-any.whl", hash = "sha256:f6f8dc65625dadaad0c8545319c2e2f0424fede988368893ca3844261342c11a"},
+ {file = "protobuf-4.24.3.tar.gz", hash = "sha256:12e9ad2ec079b833176d2921be2cb24281fa591f0b119b208b788adc48c2561d"},
]
[[package]]
@@ -1517,13 +1531,13 @@ numpy = ">=1.16.6"
[[package]]
name = "pytest"
-version = "7.4.0"
+version = "7.4.2"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.7"
files = [
- {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"},
- {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"},
+ {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"},
+ {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"},
]
[package.dependencies]
@@ -1553,13 +1567,13 @@ six = ">=1.5"
[[package]]
name = "pytz"
-version = "2023.3"
+version = "2023.3.post1"
description = "World timezone definitions, modern and historical"
optional = true
python-versions = "*"
files = [
- {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"},
- {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
+ {file = "pytz-2023.3.post1-py2.py3-none-any.whl", hash = "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7"},
+ {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"},
]
[[package]]
@@ -1916,19 +1930,19 @@ files = [
[[package]]
name = "setuptools"
-version = "68.1.2"
+version = "68.2.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
files = [
- {file = "setuptools-68.1.2-py3-none-any.whl", hash = "sha256:3d8083eed2d13afc9426f227b24fd1659489ec107c0e86cec2ffdde5c92e790b"},
- {file = "setuptools-68.1.2.tar.gz", hash = "sha256:3d4dfa6d95f1b101d695a6160a7626e15583af71a5f52176efa5d39a054d475d"},
+ {file = "setuptools-68.2.2-py3-none-any.whl", hash = "sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a"},
+ {file = "setuptools-68.2.2.tar.gz", hash = "sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87"},
]
[package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5,<=7.1.2)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
-testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"
@@ -2092,13 +2106,13 @@ telegram = ["requests"]
[[package]]
name = "transformers"
-version = "4.32.1"
+version = "4.33.2"
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
optional = false
python-versions = ">=3.8.0"
files = [
- {file = "transformers-4.32.1-py3-none-any.whl", hash = "sha256:b930d3dbd907a3f300cf49e54d63a56f8a0ab16b01a2c2a61ecff37c6de1da08"},
- {file = "transformers-4.32.1.tar.gz", hash = "sha256:1edc8ae1de357d97c3d36b04412aa63d55e6fc0c4b39b419a7d380ed947d2252"},
+ {file = "transformers-4.33.2-py3-none-any.whl", hash = "sha256:5a9a757bea5b5a1b94796805bcb5978b552208a3ac193f46edda66be6f4a5488"},
+ {file = "transformers-4.33.2.tar.gz", hash = "sha256:47dd36f302afec86d9cdcacab61bbd0296e6bb02e64d2ed7855daaab14ee290e"},
]
[package.dependencies]
@@ -2115,16 +2129,16 @@ tqdm = ">=4.27"
[package.extras]
accelerate = ["accelerate (>=0.20.3)"]
-agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.9,!=1.12.0)"]
-all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision"]
+agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.10,!=1.12.0)"]
+all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"]
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
codecarbon = ["codecarbon (==1.2.0)"]
deepspeed = ["accelerate (>=0.20.3)", "deepspeed (>=0.9.3)"]
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
-dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "urllib3 (<2.0.0)"]
-dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
-docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision"]
+dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "urllib3 (<2.0.0)"]
+dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"]
docs-specific = ["hf-doc-builder"]
fairscale = ["fairscale (>0.3)"]
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"]
@@ -2147,15 +2161,15 @@ sigopt = ["sigopt"]
sklearn = ["scikit-learn"]
speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
testing = ["GitPython (<3.1.19)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "timeout-decorator"]
-tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx"]
-tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx"]
+tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"]
+tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"]
tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
timm = ["timm"]
tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.14)"]
-torch = ["accelerate (>=0.20.3)", "torch (>=1.9,!=1.12.0)"]
+torch = ["accelerate (>=0.20.3)", "torch (>=1.10,!=1.12.0)"]
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
torch-vision = ["Pillow (<10.0.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "tqdm (>=4.27)"]
+torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "tqdm (>=4.27)"]
video = ["av (==9.2.0)", "decord (==0.6.0)"]
vision = ["Pillow (<10.0.0)"]
@@ -2181,13 +2195,13 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.
[[package]]
name = "typing-extensions"
-version = "4.7.1"
-description = "Backported and Experimental Type Hints for Python 3.7+"
+version = "4.8.0"
+description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"},
- {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"},
+ {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
+ {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
]
[[package]]
@@ -2203,13 +2217,13 @@ files = [
[[package]]
name = "urllib3"
-version = "2.0.4"
+version = "2.0.5"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.7"
files = [
- {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"},
- {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"},
+ {file = "urllib3-2.0.5-py3-none-any.whl", hash = "sha256:ef16afa8ba34a1f989db38e1dbbe0c302e4289a47856990d0682e374563ce35e"},
+ {file = "urllib3-2.0.5.tar.gz", hash = "sha256:13abf37382ea2ce6fb744d4dad67838eec857c9f4f57009891805e0b5e123594"},
]
[package.extras]
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 1babf7496f4..c06c298a063 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation-server"
-version = "1.0.3"
+version = "1.1.0"
description = "Text Generation Inference Python gRPC Server"
authors = ["Olivier Dehaene "]
@@ -54,5 +54,7 @@ priority = "explicit"
markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
[build-system]
-requires = ["poetry-core>=1.0.0"]
+requires = [
+ "poetry-core>=1.0.0",
+]
build-backend = "poetry.core.masonry.api"
diff --git a/server/requirements.txt b/server/requirements.txt
index 1b038ccac22..7c81c5f964a 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -9,19 +9,19 @@ certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
-datasets==2.14.4 ; python_version >= "3.9" and python_version < "3.13"
+datasets==2.14.5 ; python_version >= "3.9" and python_version < "3.13"
deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-filelock==3.12.3 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.12.4 ; python_version >= "3.9" and python_version < "3.13"
frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "3.13"
fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
@@ -32,7 +32,7 @@ mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
networkx==3.1 ; python_version >= "3.9" and python_version < "3.13"
-numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -43,32 +43,32 @@ opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13
opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
-pandas==2.0.3 ; python_version >= "3.9" and python_version < "3.13"
+pandas==2.1.1 ; python_version >= "3.9" and python_version < "3.13"
peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
-pillow==10.0.0 ; python_version >= "3.9" and python_version < "3.13"
-protobuf==4.24.2 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.0.1 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.24.3 ; python_version >= "3.9" and python_version < "3.13"
psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
pyarrow==13.0.0 ; python_version >= "3.9" and python_version < "3.13"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "3.13"
-pytz==2023.3 ; python_version >= "3.9" and python_version < "3.13"
+pytz==2023.3.post1 ; python_version >= "3.9" and python_version < "3.13"
pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
scipy==1.11.2 ; python_version >= "3.9" and python_version < "3.13"
sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==68.1.2 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==68.2.2 ; python_version >= "3.9" and python_version < "3.13"
six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
texttable==1.6.7 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.32.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.33.2 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
tzdata==2023.3 ; python_version >= "3.9" and python_version < "3.13"
-urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.0.5 ; python_version >= "3.9" and python_version < "3.13"
win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
xxhash==3.3.0 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/server/tests/utils/test_tokens.py b/server/tests/utils/test_tokens.py
index 4187ff257c2..0585f1fb341 100644
--- a/server/tests/utils/test_tokens.py
+++ b/server/tests/utils/test_tokens.py
@@ -45,12 +45,15 @@ def test_stopping_criteria_max():
assert criteria(1, "") == (False, None)
assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH)
+
def test_batch_top_tokens():
top_n_tokens = [0, 2, 3, 4, 5]
top_n_tokens_tensor = torch.tensor(top_n_tokens)
- inp_logprobs = torch.tensor([[-1., -3., -4., -2., -3.]] * 5)
+ inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5)
- topn_tok_ids, topn_tok_logprobs = batch_top_tokens(top_n_tokens, top_n_tokens_tensor, inp_logprobs)
+ topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
+ top_n_tokens, top_n_tokens_tensor, inp_logprobs
+ )
assert topn_tok_ids[0] == []
assert topn_tok_ids[1] == [0, 3]
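
The reworked test feeds log-probabilities straight into `batch_top_tokens` and expects per-request top-n selections (an empty list when n is 0). A simplified sketch of that behaviour — not the real implementation in `text_generation_server.utils.tokens`, which batches the topk call — could look like this:

```python
import torch


def batch_top_tokens_sketch(top_n_tokens, logprobs):
    # Per-request top-n selection over a batch of log-probability rows;
    # n == 0 means the request did not ask for top tokens.
    ids, lps = [], []
    for n, row in zip(top_n_tokens, logprobs):
        if n == 0:
            ids.append([])
            lps.append([])
            continue
        values, indices = torch.topk(row, k=n)
        ids.append(indices.tolist())
        lps.append(values.tolist())
    return ids, lps


inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5)
tok_ids, _ = batch_top_tokens_sketch([0, 2, 3, 4, 5], inp_logprobs)
assert tok_ids[0] == []
assert tok_ids[1] == [0, 3]
```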
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index e3fda07f533..301acb6be02 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -17,6 +17,8 @@ class Quantization(str, Enum):
bitsandbytes_nf4 = "bitsandbytes-nf4"
bitsandbytes_fp4 = "bitsandbytes-fp4"
gptq = "gptq"
+ awq = "awq"
+ eetq = "eetq"
class Dtype(str, Enum):
@@ -123,8 +125,12 @@ def download_weights(
if not is_local_model:
try:
- adapter_config_filename = hf_hub_download(model_id, revision=revision, filename="adapter_config.json")
- utils.download_and_unload_peft(model_id, revision, trust_remote_code=trust_remote_code)
+ adapter_config_filename = hf_hub_download(
+ model_id, revision=revision, filename="adapter_config.json"
+ )
+ utils.download_and_unload_peft(
+ model_id, revision, trust_remote_code=trust_remote_code
+ )
is_local_model = True
utils.weight_files(model_id, revision, extension)
return
@@ -177,8 +183,12 @@ def download_weights(
import transformers
import json
-
- config_filename = hf_hub_download(model_id, revision=revision, filename="config.json")
+ if is_local_model:
+ config_filename = os.path.join(model_id, "config.json")
+ else:
+ config_filename = hf_hub_download(
+ model_id, revision=revision, filename="config.json"
+ )
with open(config_filename, "r") as f:
config = json.load(f)
architecture = config["architectures"][0]
@@ -187,7 +197,6 @@ def download_weights(
# Name for this variable depends on transformers version.
discard_names = getattr(class_, "_tied_weights_keys", [])
- discard_names.extend(getattr(class_, "_keys_to_ignore_on_load_missing", []))
except Exception as e:
discard_names = []
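
The `download_weights` change above resolves `config.json` from disk for local checkpoints instead of always calling the Hub, and it stops extending `discard_names` with `_keys_to_ignore_on_load_missing`. A hedged sketch of the resolution branch (the helper name is made up; `hf_hub_download` is the real `huggingface_hub` function used in the diff):

```python
import json
import os

from huggingface_hub import hf_hub_download


def resolve_config(model_id: str, revision=None, is_local_model=False) -> dict:
    # Local checkpoints already ship config.json on disk; hub models
    # fetch (or reuse the cached copy of) the file.
    if is_local_model:
        config_filename = os.path.join(model_id, "config.json")
    else:
        config_filename = hf_hub_download(
            model_id, revision=revision, filename="config.json"
        )
    with open(config_filename, "r") as f:
        return json.load(f)
```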
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 932ab32e758..5b1b5715c5e 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -67,6 +67,16 @@
__all__.append(FlashLlama)
__all__.append(IDEFICSSharded)
+MISTRAL = True
+try:
+ from text_generation_server.models.flash_mistral import FlashMistral
+except ImportError as e:
+ logger.warning(f"Could not import Mistral model: {e}")
+ MISTRAL = False
+
+if MISTRAL:
+ __all__.append(FlashMistral)
+
def get_model(
model_id: str,
@@ -153,7 +163,11 @@ def get_model(
)
elif model_type == "mpt":
return MPTSharded(
- model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
+ model_id,
+ revision,
+ quantize=quantize,
+ dtype=dtype,
+ trust_remote_code=trust_remote_code,
)
elif model_type == "gpt_neox":
@@ -182,7 +196,7 @@ def get_model(
trust_remote_code=trust_remote_code,
)
- elif model_type == "llama":
+ elif model_type == "llama" or model_type == "baichuan":
if FLASH_ATTENTION:
return FlashLlama(
model_id,
@@ -233,7 +247,18 @@ def get_model(
trust_remote_code=trust_remote_code,
)
- elif model_type == "opt":
+ if model_type == "mistral":
+ if MISTRAL:
+ return FlashMistral(
+ model_id,
+ revision,
+ quantize=quantize,
+ dtype=dtype,
+ trust_remote_code=trust_remote_code,
+ )
+ raise NotImplementedError("Mistral model requires flash attention v2")
+
+ if model_type == "opt":
return OPTSharded(
model_id,
revision,
@@ -242,7 +267,7 @@ def get_model(
trust_remote_code=trust_remote_code,
)
- elif model_type == "t5":
+ if model_type == "t5":
return T5Sharded(
model_id,
revision,
@@ -250,15 +275,15 @@ def get_model(
dtype=dtype,
trust_remote_code=trust_remote_code,
)
- elif model_type == "idefics":
+ if model_type == "idefics":
if FLASH_ATTENTION:
- return IDEFICSSharded(
- model_id,
- revision,
- quantize=quantize,
- dtype=dtype,
- trust_remote_code=trust_remote_code,
- )
+ return IDEFICSSharded(
+ model_id,
+ revision,
+ quantize=quantize,
+ dtype=dtype,
+ trust_remote_code=trust_remote_code,
+ )
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
@@ -268,10 +293,12 @@ def get_model(
raise ValueError(
"gptq quantization is not supported for AutoModel, you can try to quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
)
+ if quantize == "awq":
+ raise ValueError("awq quantization is not supported for AutoModel")
elif (quantize == "bitsandbytes-fp4") or (quantize == "bitsandbytes-nf4"):
- raise ValueError(
- "4bit quantization is not supported for AutoModel"
- )
+ raise ValueError("4bit quantization is not supported for AutoModel")
+ elif quantize == "eetq":
+ raise ValueError("Eetq quantization is not supported for AutoModel")
if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
return CausalLM(
model_id,
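
The `__init__.py` changes keep Mistral optional: the import is guarded, `get_model` dispatches to `FlashMistral` only when the import succeeded, and the new `awq`/`eetq` quantizers are rejected on the `AutoModel` fallback. A minimal sketch of the same guard pattern, with placeholder names (`optional_backend` and `OptionalModel` are hypothetical):

```python
import logging

logger = logging.getLogger(__name__)

HAS_BACKEND = True
try:
    # Hypothetical optional dependency; mirrors the FlashMistral import guard.
    from optional_backend import OptionalModel
except ImportError as e:
    logger.warning(f"Could not import optional model: {e}")
    HAS_BACKEND = False


def get_optional_model(model_id: str):
    # Refuse to silently fall back when the requested backend is unavailable.
    if HAS_BACKEND:
        return OptionalModel(model_id)
    raise NotImplementedError("this model requires flash attention v2")
```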
diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py
index 79fb60c6580..0151b017025 100644
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -51,7 +51,7 @@ def __init__(
dtype = torch.float16 if dtype is None else dtype
else:
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
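
The Bloom constructor (like the other models touched in this diff) now honours an explicitly requested dtype on CPU instead of forcing float32. A small sketch of the fallback logic, assuming the caller may pass `dtype=None`:

```python
import torch


def resolve_device_and_dtype(dtype=None):
    # float16 / float32 are only defaults; an explicit dtype wins on both paths.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        dtype = torch.float16 if dtype is None else dtype
    else:
        device = torch.device("cpu")
        dtype = torch.float32 if dtype is None else dtype
    return device, dtype
```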
diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py
new file mode 100644
index 00000000000..2e6ae086163
--- /dev/null
+++ b/server/text_generation_server/models/cache_manager.py
@@ -0,0 +1,135 @@
+import math
+import torch
+
+from typing import Optional, List, Tuple
+
+BLOCK_SIZE: int = 16
+# Will be set in warmup
+CACHE_MANAGER: Optional["CacheManager"] = None
+
+
+class CacheManager:
+ def __init__(
+ self,
+ num_blocks: int,
+ num_layers: int,
+ num_heads: int,
+ head_size: int,
+ repeat_slots: bool,
+ dtype: torch.dtype,
+ device: torch.device,
+ ):
+ self.block_size = BLOCK_SIZE
+ self.num_blocks = num_blocks
+ self.repeat_slots = repeat_slots
+
+ element_size = torch.tensor([], dtype=dtype).element_size()
+ x = self.block_size // element_size
+
+ self.kv_cache = [
+ (
+ torch.empty(
+ (num_blocks, num_heads, head_size // x, self.block_size, x),
+ dtype=dtype,
+ device=device,
+ ),
+ torch.empty(
+ (num_blocks, num_heads, head_size, self.block_size),
+ dtype=dtype,
+ device=device,
+ ),
+ )
+ for _ in range(num_layers)
+ ]
+ self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
+ self.slots = torch.arange(
+ 0, num_blocks * self.block_size, dtype=torch.int32
+ ).view(num_blocks, self.block_size)
+
+ def allocate(
+ self,
+ needed_blocks_slots: List[Tuple[int, int]],
+ blocks: int,
+ max_blocks: int,
+ device: torch.device,
+ ):
+ # Get free blocks indices by finding values in mask that are not set to 0
+ free_block_indices = self.free_block_mask.nonzero()
+ assert (
+ len(free_block_indices) >= blocks
+ ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+
+ # Slice by the number of required blocks
+ block_indices = free_block_indices[:blocks]
+ block_indices = block_indices.flatten()
+
+ # Padded block tables
+ block_tables_tensor = torch.zeros(
+ (len(needed_blocks_slots), max_blocks), dtype=torch.int32
+ )
+
+ # Allocate paged attention blocks
+ cumulative_blocks = 0
+ slots = []
+ block_tables = []
+ for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
+ # Get allocated blocks for this sequence
+ allocated_blocks = block_indices[
+ cumulative_blocks : cumulative_blocks + needed_blocks
+ ]
+ # Get slots for the allocated blocks
+ all_slots = self.slots[allocated_blocks].flatten()
+
+ # Repeat slots in the case of context sliding window
+ if needed_slots > len(all_slots) and self.repeat_slots:
+ repeats = math.ceil(needed_slots / len(all_slots))
+ all_slots = all_slots.repeat(repeats)
+
+ allocated_slots = all_slots[:needed_slots]
+
+ slots.append(allocated_slots)
+ block_tables.append(allocated_blocks.tolist())
+ block_tables_tensor[i, :needed_blocks] = allocated_blocks
+ cumulative_blocks += needed_blocks
+
+ block_tables = block_tables
+ block_tables_tensor = block_tables_tensor.to(device)
+ slots = torch.concat(slots).to(device)
+
+ # Allocate the required number of blocks by setting the mask to 0
+ self.free_block_mask[block_indices] = 0
+
+ return block_tables, block_tables_tensor, slots
+
+ def free(self, block_indices: Optional[List[int]]):
+ if block_indices is not None and block_indices:
+ # Reset mask
+ self.free_block_mask[block_indices] = 1
+
+
+def set_cache_manager(
+ num_blocks: int,
+ num_layers: int,
+ num_heads: int,
+ head_size: int,
+ repeat_slots: bool,
+ dtype: torch.dtype,
+ device: torch.device,
+) -> CacheManager:
+ global CACHE_MANAGER
+ if CACHE_MANAGER is not None:
+ del CACHE_MANAGER
+ torch.cuda.empty_cache()
+
+ CACHE_MANAGER = CacheManager(
+ num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device
+ )
+ return CACHE_MANAGER
+
+
+def get_cache_manager() -> CacheManager:
+ global CACHE_MANAGER
+ if CACHE_MANAGER is None:
+ raise RuntimeError("cache manager was not initialized")
+
+ return CACHE_MANAGER
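
A hedged usage sketch for the new cache manager: warmup installs a global instance, a batch asks for enough 16-token blocks to hold its sequences, and blocks go back to the free pool when requests finish. The sizes below are arbitrary and small so the example runs on CPU:

```python
import torch

from text_generation_server.models.cache_manager import (
    set_cache_manager,
    get_cache_manager,
)

manager = set_cache_manager(
    num_blocks=64,
    num_layers=2,
    num_heads=4,
    head_size=64,
    repeat_slots=False,
    dtype=torch.float16,
    device=torch.device("cpu"),
)

# One sequence of 40 tokens needs ceil(40 / 16) = 3 blocks.
block_tables, block_tables_tensor, slots = get_cache_manager().allocate(
    needed_blocks_slots=[(3, 40)],
    blocks=3,
    max_blocks=3,
    device=torch.device("cpu"),
)
assert slots.shape[0] == 40
assert block_tables_tensor.shape == (1, 3)

# Return the blocks to the free pool once the request completes.
manager.free(block_tables[0])
```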
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 4e3382631a2..fccfb0f8b52 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -492,7 +492,7 @@ def __init__(
raise ValueError("quantization is not available on CPU")
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
@@ -579,7 +579,7 @@ def generate_token(
batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
batch.top_n_tokens,
batch.top_n_tokens_tensor,
- torch.softmax(logits[:, -1], -1),
+ torch.log_softmax(logits[:, -1], -1),
)
# Zipped iterator
@@ -641,8 +641,14 @@ def generate_token(
if i % self.world_size == self.rank:
if stop:
# Decode generated tokens
- output_text = self.decode(
- all_input_ids[-stopping_criteria.current_tokens :, 0]
+ output_text, _, _ = self.decode_token(
+ all_input_ids[:, 0],
+ prefix_offset=len(all_input_ids)
+ - stopping_criteria.current_tokens
+ - 1,
+ read_offset=len(all_input_ids)
+ - stopping_criteria.current_tokens,
+ skip_special_tokens=True,
)
# Get seed
if isinstance(next_token_chooser.choice, Sampling):
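
The top-token fix above matters because `batch_top_tokens` consumes log-probabilities: with plain `softmax` the reported values would have landed in [0, 1] instead of being proper logprobs. A quick check of the relationship:

```python
import torch

logits = torch.randn(2, 5, 32)                   # (batch, seq_len, vocab_size)
probs = torch.softmax(logits[:, -1], -1)         # what was passed before
logprobs = torch.log_softmax(logits[:, -1], -1)  # what is passed now

assert torch.allclose(logprobs, probs.log(), atol=1e-6)
assert (logprobs <= 0).all()
```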
diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
index 047a1872292..5423d75aae2 100644
--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -40,7 +40,10 @@
)
CUSTOM_KERNELS_ENABLED = False
-if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if (
+ torch.cuda.is_available()
+ and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
+):
try:
from custom_kernels import fused_bloom_attention_cuda
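
Guarding the fused-kernel import on `torch.cuda.is_available()` keeps CPU-only installs from touching the CUDA extension at import time. Presumably the rest of the block, not shown in this hunk, flips the flag on success, roughly:

```python
import os

import torch

CUSTOM_KERNELS_ENABLED = False
if (
    torch.cuda.is_available()
    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
):
    try:
        from custom_kernels import fused_bloom_attention_cuda  # CUDA extension

        CUSTOM_KERNELS_ENABLED = True
    except ImportError:
        pass
```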
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index f0e1236d0f4..7c743a880e7 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -149,6 +149,27 @@ def forward(self, hidden_states, residual=None):
return normed_hidden_states, res
+def load_attention(config, prefix, weights):
+ if config.num_attention_heads != config.num_key_value_heads:
+ return _load_gqa(config, prefix, weights)
+ else:
+ if config.model_type == "baichuan":
+ return TensorParallelColumnLinear.load_qkv(
+ config,
+ prefix=f"{prefix}.W_pack",
+ weights=weights,
+ bias=False,
+ )
+ else:
+ return TensorParallelColumnLinear.load_multi(
+ config,
+ prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+ dim=0,
+ weights=weights,
+ bias=False,
+ )
+
+
def _load_gqa(config, prefix: str, weights):
assert config.hidden_size % config.num_attention_heads == 0
assert config.num_attention_heads % weights.process_group.size() == 0
@@ -159,7 +180,7 @@ def _load_gqa(config, prefix: str, weights):
dim=0,
)
- if config.quantize != "gptq":
+ if config.quantize not in ["gptq", "awq"]:
weight = weight.to(dtype=weights.dtype).to(device=weights.device)
head_size = config.hidden_size // config.num_attention_heads
@@ -191,7 +212,10 @@ def __init__(
# config=config, prefix=f"{prefix}.rotary_emb", weights=weights
# )
self.rotary_emb = PositionRotaryEmbedding.static(
- config=config, dim=self.head_size, base=config.rope_theta, device=weights.device
+ config=config,
+ dim=self.head_size,
+ base=config.rope_theta,
+ device=weights.device,
)
self.softmax_scale = self.head_size**-0.5
@@ -205,16 +229,9 @@ def __init__(
self.num_key_value_heads = (
config.num_key_value_heads // weights.process_group.size()
)
- if config.num_attention_heads != config.num_key_value_heads:
- self.query_key_value = _load_gqa(config, prefix, weights)
- else:
- self.query_key_value = TensorParallelColumnLinear.load_multi(
- config,
- prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
- dim=0,
- weights=weights,
- bias=False,
- )
+
+ self.query_key_value = load_attention(config, prefix, weights)
+
self.o_proj = TensorParallelRowLinear.load(
config,
prefix=f"{prefix}.o_proj",
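
`load_attention` now also accepts Baichuan checkpoints, which store the fused query/key/value projection as a single `W_pack` tensor rather than Llama's separate `q_proj`/`k_proj`/`v_proj`. A toy illustration of why the two layouts line up once the separate projections are concatenated (shapes are made up):

```python
import torch

hidden_size, head_size, num_heads = 64, 16, 4

# Llama-style: three separate projections concatenated along dim 0.
q = torch.randn(num_heads * head_size, hidden_size)
k = torch.randn(num_heads * head_size, hidden_size)
v = torch.randn(num_heads * head_size, hidden_size)
llama_qkv = torch.cat([q, k, v], dim=0)

# Baichuan-style: one fused W_pack tensor with the same overall layout.
w_pack = torch.randn(3 * num_heads * head_size, hidden_size)

assert llama_qkv.shape == w_pack.shape
```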
diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
new file mode 100644
index 00000000000..77b7f230a08
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -0,0 +1,532 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+# Flash attention imports
+import dropout_layer_norm
+
+# vllm imports
+import vllm_cache_ops
+import vllm_attention_ops
+
+from text_generation_server.utils.flash_attn import attention, HAS_FLASH_ATTN_V2
+from text_generation_server.utils.layers import (
+ TensorParallelRowLinear,
+ TensorParallelColumnLinear,
+ TensorParallelEmbedding,
+ PositionRotaryEmbedding,
+ TensorParallelHead,
+ get_linear,
+)
+
+if not HAS_FLASH_ATTN_V2:
+ raise ImportError("Mistral model requires flash attn v2")
+
+
+class MistralConfig(PretrainedConfig):
+ model_type = "mistral"
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=14336,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=8,
+ hidden_act="silu",
+ max_position_embeddings=4096 * 32,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ sliding_window=4096,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.sliding_window = sliding_window
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+
+class MistralRMSNorm(nn.Module):
+ def __init__(self, prefix, weights, eps=1e-6):
+ """
+ MistralRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+
+ weight = weights.get_tensor(f"{prefix}.weight")
+ self.weight = nn.Parameter(weight)
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states, residual=None):
+ if hidden_states.shape[-1] > 8192:
+ if residual is not None:
+ hidden_states += residual
+ residual = hidden_states
+
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(
+ variance + self.variance_epsilon
+ )
+
+ # convert into half-precision if necessary
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
+ hidden_states = hidden_states.to(self.weight.dtype)
+
+ return self.weight * hidden_states, residual
+ else:
+ # faster post attention rms norm
+ normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
+ hidden_states,
+ residual,
+ self.weight,
+ None,
+ None,
+ None,
+ None,
+ None,
+ 0.0,
+ self.variance_epsilon,
+ 1.0,
+ 0,
+ None,
+ False,
+ True, # Activate RMSNorm
+ )
+ if res is None:
+ res = hidden_states
+
+ return normed_hidden_states, res
+
+
+def load_attention(config, prefix, weights):
+ if config.num_attention_heads != config.num_key_value_heads:
+ return _load_gqa(config, prefix, weights)
+ else:
+ return TensorParallelColumnLinear.load_multi(
+ config,
+ prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+ dim=0,
+ weights=weights,
+ bias=False,
+ )
+
+
+def _load_gqa(config, prefix: str, weights):
+ assert config.hidden_size % config.num_attention_heads == 0
+ assert config.num_attention_heads % weights.process_group.size() == 0
+
+ weight = weights.get_multi_weights_col(
+ prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+ quantize=config.quantize,
+ dim=0,
+ )
+
+ if config.quantize not in ["gptq", "awq"]:
+ weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+
+ head_size = config.hidden_size // config.num_attention_heads
+ num_heads = config.num_attention_heads // weights.process_group.size()
+ num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
+ assert list(weight.shape) == [
+ (num_heads + 2 * num_key_value_heads) * head_size,
+ config.hidden_size,
+ ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
+
+ return TensorParallelColumnLinear(
+ get_linear(weight, bias=None, quantize=config.quantize)
+ )
+
+
+class MistralAttention(torch.nn.Module):
+ def __init__(
+ self,
+ prefix: str,
+ config,
+ weights,
+ ):
+ super().__init__()
+ self.max_past = (
+ config.sliding_window if config.sliding_window is not None else 0
+ )
+ self.num_heads = config.num_attention_heads
+ self.hidden_size = config.hidden_size
+ self.head_size = self.hidden_size // self.num_heads
+
+ self.rotary_emb = PositionRotaryEmbedding.static(
+ config=config,
+ dim=self.head_size,
+ base=config.rope_theta,
+ device=weights.device,
+ )
+
+ self.softmax_scale = self.head_size**-0.5
+
+ if self.num_heads % weights.process_group.size() != 0:
+ raise ValueError(
+ f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+ f"and `num_shards`: {weights.process_group.size()}"
+ )
+ self.num_heads = self.num_heads // weights.process_group.size()
+ self.num_key_value_heads = (
+ config.num_key_value_heads // weights.process_group.size()
+ )
+
+ self.query_key_value = load_attention(config, prefix, weights)
+
+ self.o_proj = TensorParallelRowLinear.load(
+ config,
+ prefix=f"{prefix}.o_proj",
+ weights=weights,
+ bias=False,
+ )
+ self.num_groups = self.num_heads // self.num_key_value_heads
+ self.kv_head_mapping = torch.arange(
+ 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+ ).repeat_interleave(self.num_groups)
+
+ def forward(
+ self,
+ hidden_states,
+ cos,
+ sin,
+ cu_seqlen_prefill,
+ kv_cache,
+ block_tables,
+ slots,
+ input_lengths,
+ max_s,
+ prefill_cache_indices,
+ ):
+ qkv = self.query_key_value(hidden_states)
+ query, kv = qkv.split(
+ [
+ self.head_size * self.num_heads,
+ 2 * self.head_size * self.num_key_value_heads,
+ ],
+ dim=1,
+ )
+ query = query.view(-1, self.num_heads, self.head_size)
+ kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
+
+ self.rotary_emb(query, cos, sin)
+ self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
+
+ if prefill_cache_indices is not None:
+ kv_to_cache = kv[prefill_cache_indices]
+ else:
+ kv_to_cache = kv
+
+ vllm_cache_ops.reshape_and_cache(
+ kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
+ )
+
+ # output tensor
+ attn_output = torch.empty_like(query)
+
+ # Prefill
+ if cu_seqlen_prefill is not None:
+ # flash attention
+ attention(
+ query,
+ torch.select(kv, dim=1, index=0),
+ torch.select(kv, dim=1, index=1),
+ attn_output,
+ cu_seqlen_prefill,
+ max_s,
+ self.softmax_scale,
+ window_size_left=self.max_past,
+ )
+ # Decode
+ else:
+ # kv_cache[1] => [num_blocks, num_heads, head_size, block_size]
+ block_size = kv_cache[1].shape[3]
+ vllm_attention_ops.single_query_cached_kv_attention(
+ attn_output,
+ query,
+ kv_cache[0],
+ kv_cache[1],
+ self.kv_head_mapping,
+ self.softmax_scale,
+ block_tables,
+ input_lengths,
+ block_size,
+ max_s,
+ )
+
+ return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+
+
+class MistralMLP(nn.Module):
+ def __init__(self, prefix, config, weights):
+ super().__init__()
+ act = config.hidden_act
+ self.act = (
+ ACT2FN[act]
+ if "gelu" not in act
+ else lambda x: torch.nn.functional.gelu(
+ x,
+ approximate="tanh"
+ if act in ["gelu_fast", "gelu_pytorch_tanh"]
+ else "none",
+ )
+ )
+ # Fuse gate and up proj
+ self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+ config,
+ prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+ weights=weights,
+ dim=0,
+ bias=False,
+ )
+ self.down_proj = TensorParallelRowLinear.load(
+ config,
+ prefix=f"{prefix}.down_proj",
+ weights=weights,
+ bias=False,
+ )
+ self.intermediate_size = (
+ config.intermediate_size // weights.process_group.size()
+ )
+
+ def forward(self, hidden_states):
+ gate_up_states = self.gate_up_proj(hidden_states)
+ gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+ return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+
+
+class MistralLayer(nn.Module):
+ def __init__(self, layer_id, config, weights):
+ super().__init__()
+ prefix = f"model.layers.{layer_id}"
+ self.self_attn = MistralAttention(
+ prefix=f"{prefix}.self_attn", config=config, weights=weights
+ )
+ self.mlp = MistralMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+ self.input_layernorm = MistralRMSNorm(
+ prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+ )
+ self.post_attention_layernorm = MistralRMSNorm(
+ prefix=f"{prefix}.post_attention_layernorm",
+ weights=weights,
+ eps=config.rms_norm_eps,
+ )
+
+ def forward(
+ self,
+ hidden_states,
+ residual,
+ cos,
+ sin,
+ cu_seqlen_prefill,
+ kv_cache,
+ block_tables,
+ slots,
+ input_lengths,
+ max_s,
+ prefill_cache_indices,
+ ):
+ normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+
+ # Self Attention
+ attn_output = self.self_attn(
+ normed_hidden_states,
+ cos,
+ sin,
+ cu_seqlen_prefill,
+ kv_cache,
+ block_tables,
+ slots,
+ input_lengths,
+ max_s,
+ prefill_cache_indices,
+ )
+
+ # faster post attention rms norm
+ normed_attn_res_output, attn_res = self.post_attention_layernorm(
+ attn_output, res
+ )
+
+ mlp_output = self.mlp(normed_attn_res_output)
+
+ return mlp_output, attn_res
+
+
+class MistralModel(torch.nn.Module):
+ def __init__(self, config, weights):
+ super().__init__()
+
+ process_group = weights.process_group
+ self.tp_rank = process_group.rank()
+ self.tp_world_size = process_group.size()
+ self.embed_tokens = TensorParallelEmbedding(
+ prefix="model.embed_tokens", weights=weights
+ )
+ self.layers = nn.ModuleList(
+ [
+ MistralLayer(
+ layer_id,
+ config,
+ weights,
+ )
+ for layer_id in range(config.num_hidden_layers)
+ ]
+ )
+ self.norm = MistralRMSNorm(
+ prefix="model.norm", weights=weights, eps=config.rms_norm_eps
+ )
+
+ self.gradient_checkpointing = False
+
+ self.head_size = self.layers[0].self_attn.head_size
+ self.num_heads = self.layers[0].self_attn.num_heads
+ self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ position_ids: torch.Tensor,
+ cu_seqlen_prefill: Optional[torch.Tensor],
+ kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+ block_tables: torch.Tensor,
+ slots: torch.Tensor,
+ input_lengths: torch.Tensor,
+ max_s: int,
+ prefill_cache_indices: Optional[torch.Tensor],
+ ) -> torch.Tensor:
+ hidden_states = self.embed_tokens(input_ids)
+
+ # Get rotary cos and sin for this forward
+ # Avoid indexing in each layer
+ cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+ position_ids, max_s, hidden_states.dtype
+ )
+
+ residual = None
+ for i, layer in enumerate(self.layers):
+ hidden_states, residual = layer(
+ hidden_states,
+ residual,
+ cos,
+ sin,
+ cu_seqlen_prefill,
+ kv_cache[i],
+ block_tables,
+ slots,
+ input_lengths,
+ max_s,
+ prefill_cache_indices,
+ )
+
+ hidden_states, _ = self.norm(hidden_states, residual)
+
+ return hidden_states
+
+
+class FlashMistralForCausalLM(torch.nn.Module):
+ def __init__(self, config, weights):
+ super().__init__()
+
+ self.model = MistralModel(config, weights)
+ self.lm_head = TensorParallelHead.load(
+ config,
+ prefix="lm_head",
+ weights=weights,
+ )
+ self.max_past = config.sliding_window
+ if self.max_past is None:
+ raise ValueError("max_past cannot be None")
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ position_ids: torch.Tensor,
+ cu_seqlen_prefill: Optional[torch.Tensor],
+ kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+ block_tables: torch.Tensor,
+ slots: torch.Tensor,
+ input_lengths: torch.Tensor,
+ max_s: int,
+ prefill_cache_indices: Optional[torch.Tensor],
+ lm_head_indices: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ if prefill_cache_indices is not None:
+ # Slots also need to be sliced as they have the same size as the whole kv tensor
+ slots = slots[prefill_cache_indices]
+ else:
+ # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
+ # kernel requires the true values
+ max_s = min(self.max_past, max_s)
+ input_lengths = torch.clamp(input_lengths, max=self.max_past)
+
+ hidden_states = self.model(
+ input_ids,
+ position_ids,
+ cu_seqlen_prefill,
+ kv_cache,
+ block_tables,
+ slots,
+ input_lengths,
+ max_s,
+ prefill_cache_indices,
+ )
+ if lm_head_indices is not None:
+ hidden_states = hidden_states[lm_head_indices]
+ logits = self.lm_head(hidden_states)
+ return logits
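Note on the new Mistral forward pass above: the sliding-window handling shows up in two places. During prefill, `prefill_cache_indices` keeps only the positions that fall inside the window, and `slots` is sliced the same way before `reshape_and_cache` writes to the paged KV cache; during decode, `max_s` and `input_lengths` are clamped to `sliding_window`, since the paged-attention kernel never sees more cached tokens than that. A minimal sketch of that bookkeeping, assuming a toy window size and a hypothetical helper (`prefill_cache_indices_for` is not part of this PR):

import torch

SLIDING_WINDOW = 4  # toy value, for illustration only


def prefill_cache_indices_for(seq_lens):
    # Keep, for every sequence, only its last SLIDING_WINDOW token positions:
    # those are the only KV pairs that ever need to live in the paged cache.
    indices, offset = [], 0
    for length in seq_lens:
        start = max(0, length - SLIDING_WINDOW)
        indices.extend(range(offset + start, offset + length))
        offset += length
    return torch.tensor(indices, dtype=torch.long)


seq_lens = [2, 7]                    # two sequences in one flattened batch
slots = torch.arange(sum(seq_lens))  # one cache slot per token position
keep = prefill_cache_indices_for(seq_lens)
print(slots[keep].tolist())          # [0, 1, 5, 6, 7, 8] -> at most 4 slots per sequence

# At decode time the cache holds at most SLIDING_WINDOW tokens per sequence,
# so the lengths handed to the paged-attention kernel are clamped accordingly:
input_lengths = torch.tensor(seq_lens)
print(torch.clamp(input_lengths, max=SLIDING_WINDOW).tolist())  # [2, 4]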
diff --git a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
index aec9a3dcc53..6fb0099992a 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
@@ -20,7 +20,12 @@
from PIL import Image
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
-from transformers.image_transforms import resize, to_channel_dimension_format, rescale, normalize
+from transformers.image_transforms import (
+ resize,
+ to_channel_dimension_format,
+ rescale,
+ normalize,
+)
from transformers.image_utils import (
ChannelDimension,
ImageInput,
@@ -121,7 +126,11 @@ def preprocess(
a PyTorch tensor of the processed images
"""
image_size = image_size if image_size is not None else self.image_size
- image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+ image_num_channels = (
+ image_num_channels
+ if image_num_channels is not None
+ else self.image_num_channels
+ )
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = (image_size, image_size)
@@ -160,9 +169,13 @@ def preprocess(
images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
images = [self.rescale(image=image, scale=1 / 255) for image in images]
images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
- images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+ images = [
+ to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images
+ ]
# TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
- images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
+ images = BatchFeature(
+ data={"pixel_values": images}, tensor_type=TensorType.PYTORCH
+ )["pixel_values"]
return images
@@ -185,7 +198,9 @@ def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
response.raise_for_status()
return Image.open(BytesIO(response.content))
else:
- raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
+ raise ValueError(
+ f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}"
+ )
def rescale(
self,
@@ -255,10 +270,9 @@ def normalize(
`np.ndarray`: The normalized image.
"""
# TODO 4.32
- return normalize(
- image, mean=mean, std=std, data_format=data_format, **kwargs
- )
+ return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
import transformers
+
transformers.IdeficsImageProcessor = IdeficsImageProcessor
diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index 8b43ae4d3c4..1ffe6276018 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -28,7 +28,11 @@
from transformers import PreTrainedModel
from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, dataclass
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ dataclass,
+)
from transformers.modeling_utils import PretrainedConfig
from transformers.utils import (
add_start_docstrings,
@@ -37,8 +41,12 @@
replace_return_docstrings,
)
from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
-from text_generation_server.models.custom_modeling.idefics_vision import IdeficsVisionTransformer
-from text_generation_server.models.custom_modeling.idefics_perceiver import IdeficsPerceiverResampler
+from text_generation_server.models.custom_modeling.idefics_vision import (
+ IdeficsVisionTransformer,
+)
+from text_generation_server.models.custom_modeling.idefics_perceiver import (
+ IdeficsPerceiverResampler,
+)
from text_generation_server.utils.layers import (
TensorParallelColumnLinear,
TensorParallelEmbedding,
@@ -49,10 +57,12 @@
)
import dropout_layer_norm
+
@dataclass
class BaseModelOutputWithPastImage(BaseModelOutputWithPast):
image_hidden_states: Optional[torch.FloatTensor] = None
+
@dataclass
class CausalLMOutputWithPastImage(CausalLMOutputWithPast):
image_hidden_states: Optional[torch.FloatTensor] = None
@@ -78,25 +88,39 @@ def expand_inputs_for_generation(
**model_kwargs,
):
expanded_return_idx = (
- torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
+ torch.arange(input_ids.shape[0])
+ .view(-1, 1)
+ .repeat(1, expand_size)
+ .view(-1)
+ .to(input_ids.device)
)
input_ids = input_ids.index_select(0, expanded_return_idx)
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
- model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)
+ model_kwargs["token_type_ids"] = token_type_ids.index_select(
+ 0, expanded_return_idx
+ )
if attention_mask is not None:
- model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
- model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
+ model_kwargs["attention_mask"] = attention_mask.index_select(
+ 0, expanded_return_idx
+ )
+ model_kwargs["image_attention_mask"] = model_kwargs[
+ "image_attention_mask"
+ ].index_select(0, expanded_return_idx)
+ model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(
0, expanded_return_idx
)
- model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
if is_encoder_decoder:
if encoder_outputs is None:
- raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
- encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select(
+ raise ValueError(
+ "If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
+ )
+ encoder_outputs[
+ "last_hidden_state"
+ ] = encoder_outputs.last_hidden_state.index_select(
0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
)
model_kwargs["encoder_outputs"] = encoder_outputs
@@ -120,14 +144,17 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder
# update token_type_ids with last value
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
- model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+ model_kwargs["token_type_ids"] = torch.cat(
+ [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1
+ )
# update attention masks
if not is_encoder_decoder:
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
- [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],
+ dim=-1,
)
if "image_attention_mask" in model_kwargs:
image_attention_mask = model_kwargs["image_attention_mask"]
@@ -180,8 +207,12 @@ def freeze_model(model, module_exceptions=[]):
}
module_exceptions_mapped = [mapping[m] for m in module_exceptions]
for module in model.modules():
- if module_exceptions and any([isinstance(module, t) for t in module_exceptions_mapped]):
- module.requires_grad_(True) # Explicitely setting it to true to avoid any mistakes
+ if module_exceptions and any(
+ [isinstance(module, t) for t in module_exceptions_mapped]
+ ):
+ module.requires_grad_(
+ True
+ ) # Explicitly setting it to true to avoid any mistakes
else:
module.requires_grad_(False)
return model
@@ -195,15 +226,21 @@ def __init__(
):
super().__init__()
self.num_embeddings = config.vocab_size
- self.weight = TensorParallelEmbedding(prefix="model.embed_tokens", weights=weights)
- self.additional_weight = nn.Parameter(weights.get_tensor(f"model.embed_tokens.additional_embedding.weight"))
+ self.weight = TensorParallelEmbedding(
+ prefix="model.embed_tokens", weights=weights
+ )
+ self.additional_weight = nn.Parameter(
+ weights.get_tensor(f"model.embed_tokens.additional_embedding.weight")
+ )
def forward(self, input_ids):
# Clone so that we don't modify the original input_ids later on
input_ids = input_ids.clone()
additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
input_ids_additional_vocab = input_ids[additional_vocab_indices]
- additional_embeddings = torch.nn.functional.embedding(input_ids_additional_vocab - self.num_embeddings, self.additional_weight)
+ additional_embeddings = torch.nn.functional.embedding(
+ input_ids_additional_vocab - self.num_embeddings, self.additional_weight
+ )
# for successful lookup replace input_ids with 0, the results of these will be discarded anyway
input_ids[additional_vocab_indices] = 0
@@ -234,7 +271,10 @@ def __init__(
config=config, prefix="lm_head", weights=weights
)
self.additional_fc = FastLinear.load(
- config=config, prefix="lm_head.additional_fc", weights=weights, bias=False,
+ config=config,
+ prefix="lm_head.additional_fc",
+ weights=weights,
+ bias=False,
)
def forward(self, input: torch.Tensor) -> torch.Tensor:
@@ -257,7 +297,10 @@ def extra_repr(self) -> str:
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
- input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+ input_ids_shape: torch.Size,
+ dtype: torch.dtype,
+ device: torch.device,
+ past_key_values_length: int = 0,
):
"""
Make causal mask used for bi-directional self-attention.
@@ -269,8 +312,18 @@ def _make_causal_mask(
mask = mask.to(dtype)
if past_key_values_length > 0:
- mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
- return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+ mask = torch.cat(
+ [
+ torch.zeros(
+ tgt_len, past_key_values_length, dtype=dtype, device=device
+ ),
+ mask,
+ ],
+ dim=-1,
+ )
+ return mask[None, None, :, :].expand(
+ bsz, 1, tgt_len, tgt_len + past_key_values_length
+ )
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
@@ -284,7 +337,9 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
inverted_mask = 1.0 - expanded_mask
- return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+ return inverted_mask.masked_fill(
+ inverted_mask.to(torch.bool), torch.finfo(dtype).min
+ )
class IdeficsRMSNorm(nn.Module):
@@ -346,7 +401,6 @@ def forward(self, hidden_states, residual=None):
if unwrap:
normed_hidden_states = normed_hidden_states.view(*shape)
-
return normed_hidden_states
@@ -367,7 +421,10 @@ def __init__(
bias=False,
)
self.down_proj = TensorParallelRowLinear.load(
- config, prefix=f"{prefix}.down_proj", weights=weights, bias=False,
+ config,
+ prefix=f"{prefix}.down_proj",
+ weights=weights,
+ bias=False,
)
self.act_fn = ACT2FN[config.hidden_act]
@@ -375,7 +432,9 @@ def forward(self, hidden_states):
gate_up_states = self.gate_up_proj(hidden_states)
shape = gate_up_states.shape
gate_up_states = gate_up_states.view(*shape[:-1], 2, shape[-1] // 2)
- return self.down_proj(self.act_fn(gate_up_states[:, :, 0]) * gate_up_states[:, :, 1])
+ return self.down_proj(
+ self.act_fn(gate_up_states[:, :, 0]) * gate_up_states[:, :, 1]
+ )
# this was adapted from LlamaAttention
@@ -445,14 +504,22 @@ def __init__(
self.qk_layer_norms = qk_layer_norms
if self.qk_layer_norms:
self.q_layer_norm = IdeficsRMSNorm(
- prefix=f"{prefix}.q_layer_norm", weights=weights, eps=config.rms_norm_eps
- )
+ prefix=f"{prefix}.q_layer_norm",
+ weights=weights,
+ eps=config.rms_norm_eps,
+ )
self.k_layer_norm = IdeficsRMSNorm(
- prefix=f"{prefix}.q_layer_norm", weights=weights, eps=config.rms_norm_eps
- )
+ prefix=f"{prefix}.q_layer_norm",
+ weights=weights,
+ eps=config.rms_norm_eps,
+ )
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+ return (
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+ .transpose(1, 2)
+ .contiguous()
+ )
def forward(
self,
@@ -470,20 +537,42 @@ def forward(
bsz, q_len, _ = hidden_states.size()
if is_cross_attention:
- query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim)# .transpose(1, 2)
+ query_states = self.q_proj(hidden_states).view(
+ bsz, q_len, self.num_heads, self.head_dim
+ ) # .transpose(1, 2)
query_states = query_states.transpose(1, 2)
- _, kv_len, _ = key_value_states.size() # Note that, in this case, `kv_len` == `kv_seq_len`
- key_states = self.k_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+ (
+ _,
+ kv_len,
+ _,
+ ) = (
+ key_value_states.size()
+ ) # Note that, in this case, `kv_len` == `kv_seq_len`
+ key_states = (
+ self.k_proj(key_value_states)
+ .view(bsz, kv_len, self.num_heads, self.head_dim)
+ .transpose(1, 2)
+ )
value_states = (
- self.v_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+ self.v_proj(key_value_states)
+ .view(bsz, kv_len, self.num_heads, self.head_dim)
+ .transpose(1, 2)
)
else:
qkv = self.qkv(hidden_states)
- query_states, key_states, value_states = qkv.split(self.num_heads * self.head_dim, dim=2)
+ query_states, key_states, value_states = qkv.split(
+ self.num_heads * self.head_dim, dim=2
+ )
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)# .transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim)# . transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim)# .transpose(1, 2)
+ query_states = query_states.view(
+ bsz, q_len, self.num_heads, self.head_dim
+ ) # .transpose(1, 2)
+ key_states = key_states.view(
+ bsz, q_len, self.num_heads, self.head_dim
+ ) # . transpose(1, 2)
+ value_states = value_states.view(
+ bsz, q_len, self.num_heads, self.head_dim
+ ) # .transpose(1, 2)
kv_seq_len = q_len
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
@@ -493,10 +582,14 @@ def forward(
)
shape = query_states.shape
- query_states = self.rotary_emb(query_states.view(-1, *shape[2:]), cos, sin).view(shape)
+ query_states = self.rotary_emb(
+ query_states.view(-1, *shape[2:]), cos, sin
+ ).view(shape)
shape = key_states.shape
- key_states = self.rotary_emb(key_states.reshape(-1, *shape[2:]), cos, sin).view(shape)
+ key_states = self.rotary_emb(
+ key_states.reshape(-1, *shape[2:]), cos, sin
+ ).view(shape)
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
@@ -571,8 +664,14 @@ def __init__(self, layer_id: int, config: IdeficsConfig, weights):
prefix=f"{prefix}.mlp",
weights=weights,
)
- self.input_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps)
- self.post_attention_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=config.rms_norm_eps)
+ self.input_layernorm = IdeficsRMSNorm(
+ prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+ )
+ self.post_attention_layernorm = IdeficsRMSNorm(
+ prefix=f"{prefix}.post_attention_layernorm",
+ weights=weights,
+ eps=config.rms_norm_eps,
+ )
self.dropout = config.dropout
def forward(
@@ -583,7 +682,9 @@ def forward(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ ) -> Tuple[
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+ ]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -650,14 +751,22 @@ def __init__(self, layer_id, config: IdeficsConfig, weights):
prefix=f"{prefix}.mlp",
weights=weights,
)
- self.input_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps)
- self.post_attention_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=config.rms_norm_eps)
+ self.input_layernorm = IdeficsRMSNorm(
+ prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+ )
+ self.post_attention_layernorm = IdeficsRMSNorm(
+ prefix=f"{prefix}.post_attention_layernorm",
+ weights=weights,
+ eps=config.rms_norm_eps,
+ )
self.config = config.dropout
self.act_cross_attn = nn.Tanh()
self.act_dense = nn.Tanh()
- self.alpha_cross_attn = nn.Parameter(weights.get_tensor(f"{prefix}.alpha_cross_attn"))
+ self.alpha_cross_attn = nn.Parameter(
+ weights.get_tensor(f"{prefix}.alpha_cross_attn")
+ )
self.alpha_dense = nn.Parameter(weights.get_tensor(f"{prefix}.alpha_dense"))
if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
@@ -673,7 +782,9 @@ def forward(
use_cache: Optional[bool] = False,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
no_images: Optional[bool] = False,
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ ) -> Tuple[
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+ ]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -695,7 +806,9 @@ def forward(
)
if past_key_value is not None:
- raise NotImplementedError("Past key value states are not implemented for Idefics cross attention module.")
+ raise NotImplementedError(
+ "Past key value states are not implemented for Idefics cross attention module."
+ )
residual = hidden_states
@@ -711,7 +824,9 @@ def forward(
# hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
# when there are no images the model is used in pure language mode
gate = 0 if no_images else 1
- hidden_states = residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+ hidden_states = (
+ residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+ )
# Fully Connected
residual = hidden_states
@@ -896,11 +1011,14 @@ def __init__(self, config: IdeficsConfig, weights):
self.gated_cross_attn_layers = nn.ModuleList(
[
IdeficsGatedCrossAttentionLayer(layer_id, config, weights)
- for layer_id in range(num_cross_layers)]
+ for layer_id in range(num_cross_layers)
+ ]
)
# self.gradient_checkpointing = False
- self.norm = IdeficsRMSNorm(prefix=f"model.norm", weights=weights, eps=config.rms_norm_eps)
+ self.norm = IdeficsRMSNorm(
+ prefix=f"model.norm", weights=weights, eps=config.rms_norm_eps
+ )
# self.gradient_checkpointing = False
# Initialize weights and apply final processing
@@ -932,7 +1050,9 @@ def __init__(self, config: IdeficsConfig, weights):
# self.embed_tokens = value
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
- def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ def _prepare_decoder_attention_mask(
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+ ):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
@@ -946,11 +1066,13 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
- inputs_embeds.device
- )
+ expanded_attn_mask = _expand_mask(
+ attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+ ).to(inputs_embeds.device)
combined_attention_mask = (
- expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ expanded_attn_mask
+ if combined_attention_mask is None
+ else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
@@ -974,23 +1096,35 @@ def forward(
) -> Union[Tuple, BaseModelOutputWithPastImage]:
device = input_ids.device if input_ids is not None else inputs_embeds.device
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ raise ValueError(
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+ )
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+ raise ValueError(
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+ )
seq_length_with_past = seq_length
past_key_values_length = 0
@@ -1006,7 +1140,10 @@ def forward(
elif position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ past_key_values_length,
+ seq_length + past_key_values_length,
+ dtype=torch.long,
+ device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
@@ -1016,29 +1153,52 @@ def forward(
if image_hidden_states is None:
if pixel_values is None and image_embeddings is None:
- raise ValueError("Either pixel_values and image_embeddings have to be not-None.")
+ raise ValueError(
+ "Either pixel_values and image_embeddings have to be not-None."
+ )
elif pixel_values is not None and image_embeddings is not None:
- raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
+ raise ValueError(
+ "You cannot specify both pixel_values and image_embeddings at the same time"
+ )
elif pixel_values is not None:
no_images = len(torch.nonzero(pixel_values)) == 0
- pixel_values = pixel_values.to(dtype=self.dtype, device=device) # fp16 compatibility
+ pixel_values = pixel_values.to(
+ dtype=self.dtype, device=device
+ ) # fp16 compatibility
batch_size, num_images = pixel_values.shape[:2]
- pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
+ pixel_values = pixel_values.contiguous().view(
+ batch_size * num_images, *pixel_values.shape[2:]
+ )
# Get sequence from the vision encoder
- image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
+ image_hidden_states = self.vision_model(
+ pixel_values=pixel_values
+ ).last_hidden_state
elif image_embeddings is not None:
- batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
- image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
- image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
+ (
+ batch_size,
+ num_images,
+ image_seq_len,
+ image_hidden_size,
+ ) = image_embeddings.size()
+ image_hidden_states = image_embeddings.to(
+ dtype=self.dtype, device=input_ids.device
+ )
+ image_hidden_states = image_hidden_states.view(
+ batch_size * num_images, image_seq_len, image_hidden_size
+ )
if self.config.use_resampler:
image_hidden_states = self.perceiver_resampler(image_hidden_states)
- image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
- image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
+ image_seq_len, image_hidden_size = image_hidden_states.size(
+ 1
+ ), image_hidden_states.size(2)
+ image_hidden_states = image_hidden_states.view(
+ batch_size, num_images * image_seq_len, image_hidden_size
+ )
else:
no_images = False
num_images = pixel_values.shape[1]
@@ -1050,7 +1210,9 @@ def forward(
text_seq_len = image_attention_mask.size(1)
image_attention_mask = image_attention_mask.unsqueeze(-1)
image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
- image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
+ image_attention_mask = image_attention_mask.view(
+ batch_size, text_seq_len, num_images * image_seq_len
+ )
image_batch_size, image_sequence_length, _ = image_hidden_states.size()
image_hidden_shape = (image_batch_size, image_sequence_length)
if image_attention_mask is None:
@@ -1060,7 +1222,6 @@ def forward(
# if list(image_attention_mask.shape) != [4, 1, 1024, 64]:
# raise ValueError(f"Image hidden_states {image_hidden_states.shape} - mask {image_attention_mask.shape} {num_images} {image_seq_len} {text_seq_len}")
-
# if image_hidden_states is not None:
# else:
# image_attention_mask = None
@@ -1070,10 +1231,15 @@ def forward(
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
- (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ (batch_size, seq_length_with_past),
+ dtype=torch.bool,
+ device=inputs_embeds.device,
)
attention_mask = self._prepare_decoder_attention_mask(
- attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
)
hidden_states = inputs_embeds
@@ -1094,7 +1260,9 @@ def forward(
if output_hidden_states:
all_hidden_states += (hidden_states,)
- past_key_value = past_key_values[idx] if past_key_values is not None else None
+ past_key_value = (
+ past_key_values[idx] if past_key_values is not None else None
+ )
def vblock(
main_block,
@@ -1194,7 +1362,11 @@ def vblock(
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return tuple(
+ v
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+ if v is not None
+ )
return BaseModelOutputWithPastImage(
last_hidden_state=hidden_states,
past_key_values=next_cache,
@@ -1230,7 +1402,7 @@ def forward(
inputs_embeds: Optional[torch.FloatTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
image_embeddings: Optional[torch.FloatTensor] = None,
- image_hidden_states: Optional[torch.FloatTensor] = None,
+ image_hidden_states: Optional[torch.FloatTensor] = None,
image_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
@@ -1264,11 +1436,19 @@ def forward(
"Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
```"""
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
)
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
@@ -1298,7 +1478,7 @@ def forward(
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
- image_hidden_states=outputs.image_hidden_states
+ image_hidden_states=outputs.image_hidden_states,
)
def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
@@ -1316,12 +1496,20 @@ def _expand_inputs_for_generation(
return expand_inputs_for_generation(*args, **model_kwargs)
@staticmethod
- def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
- return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder)
+ def _update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=False
+ ):
+ return update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
+ )
@staticmethod
def _reorder_cache(past, beam_idx):
reordered_past = ()
for layer_past in past:
- reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+ reordered_past += (
+ tuple(
+ past_state.index_select(0, beam_idx) for past_state in layer_past
+ ),
+ )
return reordered_past
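Note on the `expand_inputs_for_generation` reformat above: the arange/view/repeat/view chain only builds an index that repeats every batch row `expand_size` times before the `index_select` calls. A small sketch (not part of the diff) of what that index looks like:

import torch

batch_size, expand_size = 3, 2
expanded_return_idx = (
    torch.arange(batch_size).view(-1, 1).repeat(1, expand_size).view(-1)
)
print(expanded_return_idx.tolist())  # [0, 0, 1, 1, 2, 2]

input_ids = torch.tensor([[10, 11], [20, 21], [30, 31]])
print(input_ids.index_select(0, expanded_return_idx).tolist())
# [[10, 11], [10, 11], [20, 21], [20, 21], [30, 31], [30, 31]]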
diff --git a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
index def78390b80..477d4d70e26 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
@@ -46,7 +46,8 @@
TensorParallelRowLinear,
)
-EPS=1e-5
+EPS = 1e-5
+
class IdeficsPerceiverResampler(nn.Module):
def __init__(
@@ -78,7 +79,12 @@ def __init__(
"""
super().__init__()
- self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
+ self.embed_dim, self.n_heads, self.head_dim, self.n_latents = (
+ embed_dim,
+ n_heads,
+ head_dim,
+ n_latents,
+ )
self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver
# Create Latents for Perceiver
@@ -107,14 +113,16 @@ def __init__(
prefix=f"{prefix}.blocks.{layer_id}.1",
intermediate_size=self.intermediate_dim,
config=config,
- weights=weights
+ weights=weights,
),
]
)
for layer_id in range(depth)
]
)
- self.layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS)
+ self.layer_norm = nn.LayerNorm.load(
+ prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS
+ )
def forward(self, context: torch.Tensor) -> torch.Tensor:
"""Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
@@ -130,25 +138,34 @@ def forward(self, context: torch.Tensor) -> torch.Tensor:
class IdeficsPerceiverAttention(nn.Module):
- def __init__(self,
- prefix,
- config,
- embed_dim: int,
- n_heads: int,
- head_dim: int,
- qk_layer_norms: bool,
- weights
- ) -> None:
+ def __init__(
+ self,
+ prefix,
+ config,
+ embed_dim: int,
+ n_heads: int,
+ head_dim: int,
+ qk_layer_norms: bool,
+ weights,
+ ) -> None:
"""Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
super().__init__()
self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
self.qk_layer_norms = qk_layer_norms
# Normalization & Scaling
- self.context_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS)
- self.latents_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS)
+ self.context_layer_norm = nn.LayerNorm.load(
+ prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS
+ )
+ self.latents_layer_norm = nn.LayerNorm.load(
+ prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS
+ )
if self.qk_layer_norms:
- self.q_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS)
- self.k_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS)
+ self.q_layer_norm = nn.LayerNorm.load(
+ prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS
+ )
+ self.k_layer_norm = nn.LayerNorm.load(
+ prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS
+ )
self.qk_scale = self.head_dim**-0.5
@@ -164,10 +181,10 @@ def __init__(self,
self.q_proj = TensorParallelColumnLinear.load(
config=config, prefix=f"{prefix}.q_proj", weights=weights, bias=False
)
- self.k_proj = TensorParallelColumnLinear.load(
+ self.k_proj = TensorParallelColumnLinear.load(
config=config, prefix=f"{prefix}.k_proj", weights=weights, bias=False
)
- self.v_proj = TensorParallelColumnLinear.load(
+ self.v_proj = TensorParallelColumnLinear.load(
config=config, prefix=f"{prefix}.v_proj", weights=weights, bias=False
)
@@ -202,7 +219,12 @@ def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
# Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
# =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
# einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
- q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
+ q, k, v = [
+ x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(
+ 1, 2
+ )
+ for x in (q, k, v)
+ ]
if self.qk_layer_norms:
q = self.q_layer_norm(q)
@@ -219,25 +241,34 @@ def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
class IdeficsMLP(nn.Module):
- def __init__(self,
- prefix,
- intermediate_size,
- config,
- weights,
- ):
+ def __init__(
+ self,
+ prefix,
+ intermediate_size,
+ config,
+ weights,
+ ):
"""Simple MLP block with intermediate_size and embedding size"""
super().__init__()
self.embed_dim = config.vision_config.embed_dim
self.ln = nn.LayerNorm.load(prefix=f"{prefix}.ln", weights=weights, eps=EPS)
self.fc = TensorParallelColumnLinear.load(
- config=config, prefix=f"{prefix}.fc", weights=weights, bias=False,
+ config=config,
+ prefix=f"{prefix}.fc",
+ weights=weights,
+ bias=False,
)
self.act = nn.ReLU()
self.c_proj = TensorParallelRowLinear.load(
- config=config, prefix=f"{prefix}.c_proj", weights=weights, bias=False,
+ config=config,
+ prefix=f"{prefix}.c_proj",
+ weights=weights,
+ bias=False,
)
- def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+ def forward(
+ self, hidden_states: Optional[Tuple[torch.FloatTensor]]
+ ) -> torch.FloatTensor:
hidden_states = self.ln(hidden_states)
hidden_states = self.fc(hidden_states)
hidden_states = self.act(hidden_states)
diff --git a/server/text_generation_server/models/custom_modeling/idefics_processing.py b/server/text_generation_server/models/custom_modeling/idefics_processing.py
index e24fc7bd5b0..0fbcbeeba65 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_processing.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_processing.py
@@ -21,9 +21,16 @@
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from transformers.tokenization_utils_base import (
+ BatchEncoding,
+ PaddingStrategy,
+ TextInput,
+ TruncationStrategy,
+)
from transformers.utils import TensorType, is_torch_available
-from text_generation_server.models.custom_modeling.idefics_image_processing import IdeficsImageProcessor
+from text_generation_server.models.custom_modeling.idefics_image_processing import (
+ IdeficsImageProcessor,
+)
if is_torch_available():
@@ -124,7 +131,14 @@ class IdeficsProcessor(ProcessorMixin):
image_processor_class = "IdeficsImageProcessor"
tokenizer_class = "LlamaTokenizerFast"
- def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
+ def __init__(
+ self,
+ image_processor,
+ tokenizer=None,
+ image_size=224,
+ add_end_of_utterance_token=None,
+ **kwargs,
+ ):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
@@ -142,7 +156,8 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u
self.tokenizer_was_trained_with_end_of_utterance_token = (
True
- if "" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+ if ""
+ in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
else False
)
@@ -265,7 +280,9 @@ def __call__(
# if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
if add_end_of_utterance_token is None:
- add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
+ add_end_of_utterance_token = (
+ self.tokenizer_was_trained_with_end_of_utterance_token
+ )
# turn non-batched prompts into batched
if not any(isinstance(i, list) for i in prompts):
@@ -358,10 +375,14 @@ def image_tokens(last_was_image):
current_images = images[:local_max_num_images]
if len(current_images) > 0:
- padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
+ padded_image_tensor = torch.zeros(
+ max_num_images, *current_images.size()[1:]
+ )
padded_image_tensor[: current_images.size(0)] = current_images
else:
- padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
+ padded_image_tensor = torch.zeros(
+ max_num_images, *self.default_image_dims
+ )
output_images.append(padded_image_tensor)
output_input_ids.append(torch.tensor(padded_input_ids))
@@ -373,14 +394,19 @@ def image_tokens(last_was_image):
output_attention_masks = torch.stack(output_attention_masks)
if at_least_one_image:
- image_attention_mask, _ = image_attention_mask_for_packed_input_ids(output_input_ids, self.tokenizer)
+ image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
+ output_input_ids, self.tokenizer
+ )
image_attention_mask = incremental_to_binary_attention_mask(
image_attention_mask, num_classes=max_num_images
)
else:
# in full language mode we set the image mask to all-0s
image_attention_mask = torch.zeros(
- output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
+ output_input_ids.shape[0],
+ output_input_ids.shape[1],
+ 1,
+ dtype=torch.bool,
)
return BatchFeature(
diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py
index d933d7c12c0..c521dd0adbe 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_vision.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py
@@ -75,7 +75,9 @@ def __init__(self, prefix, config, weights):
self.image_size = config.image_size
self.patch_size = config.patch_size
- self.class_embedding = nn.Parameter(weights.get_tensor(f"{prefix}.class_embedding"))
+ self.class_embedding = nn.Parameter(
+ weights.get_tensor(f"{prefix}.class_embedding")
+ )
self.patch_embedding = nn.Conv2d.load_no_bias(
prefix=f"{prefix}.patch_embedding",
@@ -88,17 +90,19 @@ def __init__(self, prefix, config, weights):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
- # self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.position_embedding = TensorParallelEmbedding(
prefix="model.vision_model.embeddings.position_embedding", weights=weights
)
- # self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
- self.position_ids = weights.get_tensor(f"{prefix}.position_ids")
+ self.position_ids = (
+ torch.arange(self.num_positions).expand((1, -1)).to(device=weights.device)
+ )
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
- patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
+ patch_embeds = self.patch_embedding(
+ pixel_values.to(dtype=target_dtype)
+ ) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
@@ -134,7 +138,6 @@ def __init__(self, prefix, config, weights):
self.num_heads = self.num_heads // weights.process_group.size()
self.embed_dim = self.embed_dim // weights.process_group.size()
-
self.k_proj = TensorParallelColumnLinear.load(
config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
)
@@ -149,7 +152,11 @@ def __init__(self, prefix, config, weights):
)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+ return (
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+ .transpose(1, 2)
+ .contiguous()
+ )
def forward(
self,
@@ -188,7 +195,10 @@ def forward(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
f" {causal_attention_mask.size()}"
)
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+ attn_weights = (
+ attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ + causal_attention_mask
+ )
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if attention_mask is not None:
@@ -196,7 +206,10 @@ def forward(
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = (
+ attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ + attention_mask
+ )
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
@@ -206,12 +219,18 @@ def forward(
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# twice and have to be reused in the following
- attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
- attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ attn_weights_reshaped = attn_weights.view(
+ bsz, self.num_heads, tgt_len, src_len
+ )
+ attn_weights = attn_weights_reshaped.view(
+ bsz * self.num_heads, tgt_len, src_len
+ )
else:
attn_weights_reshaped = None
- attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+ attn_probs = nn.functional.dropout(
+ attn_weights, p=self.dropout, training=self.training
+ )
attn_output = torch.bmm(attn_probs, value_states)
@@ -255,11 +274,15 @@ class IdeficsVisionEncoderLayer(nn.Module):
def __init__(self, prefix, config, weights):
super().__init__()
self.embed_dim = config.hidden_size
- self.self_attn = IdeficsVisionAttention(prefix=f"{prefix}.self_attn", config=config, weights=weights)
+ self.self_attn = IdeficsVisionAttention(
+ prefix=f"{prefix}.self_attn", config=config, weights=weights
+ )
self.layer_norm1 = nn.LayerNorm.load(
prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
)
- self.mlp = IdeficsVisionMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+ self.mlp = IdeficsVisionMLP(
+ prefix=f"{prefix}.mlp", config=config, weights=weights
+ )
self.layer_norm2 = nn.LayerNorm.load(
prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
)
@@ -320,7 +343,11 @@ def __init__(self, prefix, config, weights):
self.config = config
self.layers = nn.ModuleList(
[
- IdeficsVisionEncoderLayer(prefix=f"{prefix}.encoder.layers.{layer_id}", config=config, weights=weights)
+ IdeficsVisionEncoderLayer(
+ prefix=f"{prefix}.encoder.layers.{layer_id}",
+ config=config,
+ weights=weights,
+ )
for layer_id in range(config.num_hidden_layers)
]
)
@@ -364,11 +391,19 @@ def forward(
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
)
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@@ -408,9 +443,15 @@ def forward(
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return tuple(
+ v
+ for v in [hidden_states, encoder_states, all_attentions]
+ if v is not None
+ )
return BaseModelOutput(
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ last_hidden_state=hidden_states,
+ hidden_states=encoder_states,
+ attentions=all_attentions,
)
@@ -421,13 +462,19 @@ def __init__(self, prefix, config, weights):
self.config = config
embed_dim = config.hidden_size
- self.embeddings = IdeficsVisionEmbeddings(prefix=f"{prefix}.embeddings", config=config, weights=weights)
+ self.embeddings = IdeficsVisionEmbeddings(
+ prefix=f"{prefix}.embeddings", config=config, weights=weights
+ )
self.pre_layrnorm = nn.LayerNorm.load(
prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
)
- self.encoder = IdeficsVisionEncoder(prefix=prefix, config=config, weights=weights)
+ self.encoder = IdeficsVisionEncoder(
+ prefix=prefix, config=config, weights=weights
+ )
self.post_layernorm = nn.LayerNorm.load(
- prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps
+ prefix=f"{prefix}.post_layernorm",
+ weights=weights,
+ eps=config.layer_norm_eps,
)
# copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
@@ -442,11 +489,19 @@ def forward(
Returns:
"""
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
)
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py
index 1951b171cf9..24ba6796227 100644
--- a/server/text_generation_server/models/custom_modeling/neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py
@@ -49,7 +49,10 @@
CUSTOM_KERNELS_ENABLED = False
-if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if (
+ torch.cuda.is_available()
+ and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
+):
try:
from custom_kernels import fused_attention_cuda
diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py
index fe6f1e525e9..ce3f5e21190 100644
--- a/server/text_generation_server/models/custom_modeling/opt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py
@@ -444,14 +444,14 @@ def __init__(self, config: OPTConfig, weights):
if config.word_embed_proj_dim != config.hidden_size:
self.project_out = FastLinear.load(
- config, prefix="model.decoder.project_out", bias=False
+ config, prefix="model.decoder.project_out", weights=weights, bias=False
)
else:
self.project_out = None
if config.word_embed_proj_dim != config.hidden_size:
self.project_in = FastLinear.load(
- config, prefix="model.decoder.project_in", bias=False
+ config, prefix="model.decoder.project_in", weights=weights, bias=False
)
else:
self.project_in = None
diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py
index 793f3a661d3..d3e4f53a6bc 100644
--- a/server/text_generation_server/models/custom_modeling/t5_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py
@@ -1032,9 +1032,17 @@ def __init__(self, config: T5Config, weights):
embed_tokens=self.shared,
)
- self.lm_head = TensorParallelHead.load(
- config, prefix="lm_head", weights=weights
- )
+ try:
+ self.lm_head = TensorParallelHead.load(
+ config, prefix="lm_head", weights=weights
+ )
+ except RuntimeError:
+ # Some models like t5-small were saved with shared weights unlike flan
+ # Since they are declared as the same arch we have no choice but hope
+ # that this is OK instead of using a proper flag.
+ self.lm_head = TensorParallelHead.load(
+ config, prefix="shared", weights=weights
+ )
def forward(
self,
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index d6af07f4c01..1fe40c0c520 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -19,99 +19,17 @@
GeneratedText,
TopTokens,
)
+from text_generation_server.models.cache_manager import (
+ get_cache_manager,
+ set_cache_manager,
+ BLOCK_SIZE,
+)
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
from text_generation_server.utils.dist import MEMORY_FRACTION
tracer = trace.get_tracer(__name__)
-BLOCK_SIZE = 16
-# Will be set in warmup
-CACHE_MANAGER: Optional["CacheManager"] = None
-
-
-class CacheManager:
- def __init__(
- self,
- num_blocks: int,
- num_layers: int,
- num_heads: int,
- head_size: int,
- dtype: torch.dtype,
- device: torch.device,
- ):
- self.block_size = BLOCK_SIZE
- self.num_blocks = num_blocks
-
- element_size = torch.tensor([], dtype=dtype).element_size()
- x = self.block_size // element_size
-
- self.kv_cache = [
- (
- torch.empty(
- (num_blocks, num_heads, head_size // x, self.block_size, x),
- dtype=dtype,
- device=device,
- ),
- torch.empty(
- (num_blocks, num_heads, head_size, self.block_size),
- dtype=dtype,
- device=device,
- ),
- )
- for _ in range(num_layers)
- ]
- self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
- self.slots = torch.arange(
- 0, num_blocks * self.block_size, dtype=torch.int32
- ).view(num_blocks, self.block_size)
-
- def allocate(self, batch: "FlashCausalLMBatch"):
- # Get free blocks indices by finding values in mask that are not set to 0
- free_block_indices = self.free_block_mask.nonzero()
- assert (
- len(free_block_indices) >= batch.blocks
- ), f"Out of available cache blocks: asked {batch.blocks}, only {len(free_block_indices)} free blocks"
-
- # Slice by the number of required blocks
- block_indices = free_block_indices[: batch.blocks]
- block_indices = block_indices.flatten()
-
- # Padded block tables
- block_tables_tensor = torch.zeros(
- (len(batch), batch.max_blocks), dtype=torch.int32
- )
-
- # Allocate paged attention blocks
- cumulative_blocks = 0
- slots = []
- block_tables = []
- for i, (needed_blocks, needed_slots) in enumerate(batch.needed_blocks_slots):
- # Get allocated blocks for this sequence
- allocated_blocks = block_indices[
- cumulative_blocks : cumulative_blocks + needed_blocks
- ]
- # Get slots for the allocated blocks
- allocated_slots = self.slots[allocated_blocks].flatten()[:needed_slots]
-
- slots.append(allocated_slots)
- block_tables.append(allocated_blocks.tolist())
- block_tables_tensor[i, :needed_blocks] = allocated_blocks
- cumulative_blocks += needed_blocks
-
- batch.needed_blocks_slots = None
- batch.block_tables = block_tables
- batch.block_tables_tensor = block_tables_tensor.to(batch.input_ids.device)
- batch.slots = torch.concat(slots).to(batch.input_ids.device)
-
- # Allocate the required number of blocks by setting the mask to 0
- self.free_block_mask[block_indices] = 0
-
- def free(self, block_indices: Optional[List[int]]):
- if block_indices is not None and block_indices:
- # Reset mask
- self.free_block_mask[block_indices] = 1
-
@dataclass
class FlashCausalLMBatch(Batch):
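Note on the hunk above: the in-file `CacheManager` and `BLOCK_SIZE` are replaced by a shared `text_generation_server.models.cache_manager` module that is not shown in this diff. The sketch below is therefore only an assumption about the surface the new call sites rely on, with names and argument order taken from the imports and calls in this file (the boolean passed as `self.sliding_window is not None` is called `repeat_slots` here, which is a guess):

from typing import List, Optional

BLOCK_SIZE: int = 16

CACHE_MANAGER: Optional["CacheManager"] = None


class CacheManager:
    def __init__(self, num_blocks, num_layers, num_heads, head_size,
                 repeat_slots, dtype, device):
        self.num_blocks = num_blocks
        self.repeat_slots = repeat_slots
        self.kv_cache = []  # one (key_cache, value_cache) pair per layer

    def allocate(self, needed_blocks_slots, blocks, max_blocks, device):
        # Must return (block_tables, block_tables_tensor, slots), matching the
        # unpacking done in generate_token further down in this file.
        raise NotImplementedError

    def free(self, block_indices: Optional[List[int]]):
        raise NotImplementedError


def set_cache_manager(num_blocks, num_layers, num_heads, head_size,
                      repeat_slots, dtype, device) -> CacheManager:
    global CACHE_MANAGER
    CACHE_MANAGER = CacheManager(
        num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device
    )
    return CACHE_MANAGER


def get_cache_manager() -> CacheManager:
    assert CACHE_MANAGER is not None, "set_cache_manager must be called during warmup"
    return CACHE_MANAGER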
@@ -481,7 +399,6 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
max_blocks = max(max_blocks, len(request_block_table))
- global CACHE_MANAGER
block_indices_to_free = []
# Iterate on all requests
for i, r in enumerate(self.requests):
@@ -489,7 +406,7 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
if r.id not in requests_idx_mapping.keys():
block_indices_to_free.extend(self.block_tables[i])
# Free blocks
- CACHE_MANAGER.free(block_indices_to_free)
+ get_cache_manager().free(block_indices_to_free)
# Needed to avoid dropping blocks when the batches will go out of scope
self.block_tables = None
@@ -508,7 +425,7 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
# Move to GPU now that we have the whole tensor
slot_indices = slot_indices.to(device)
- return FlashCausalLMBatch(
+ return type(self)(
batch_id=self.batch_id,
requests=requests,
requests_idx_mapping=requests_idx_mapping,
@@ -665,7 +582,7 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch
b.block_tables = None
del b
- return FlashCausalLMBatch(
+ return cls(
batch_id=batches[0].batch_id,
requests=requests,
requests_idx_mapping=requests_idx_mapping,
@@ -698,9 +615,10 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch
def __del__(self):
if self.block_tables is not None and self.block_tables:
- global CACHE_MANAGER
# Free blocks
- CACHE_MANAGER.free(list(itertools.chain.from_iterable(self.block_tables)))
+ get_cache_manager().free(
+ list(itertools.chain.from_iterable(self.block_tables))
+ )
def __len__(self):
return len(self.requests)
@@ -718,6 +636,7 @@ def __init__(
device: torch.device,
rank: int = 0,
world_size: int = 1,
+ sliding_window: Optional[int] = None,
):
self.num_layers = num_layers
self.num_kv_heads = num_kv_heads
@@ -731,6 +650,7 @@ def __init__(
device=device,
rank=rank,
world_size=world_size,
+ sliding_window=sliding_window,
)
@property
@@ -738,15 +658,14 @@ def batch_type(self) -> Type[FlashCausalLMBatch]:
return FlashCausalLMBatch
def warmup(self, batch: FlashCausalLMBatch):
- global CACHE_MANAGER
-
torch.cuda.empty_cache()
try:
- CACHE_MANAGER = CacheManager(
+ cache_manager = set_cache_manager(
batch.blocks,
self.num_layers,
self.num_kv_heads,
self.head_size,
+ self.sliding_window is not None,
self.dtype,
self.device,
)
@@ -775,53 +694,36 @@ def warmup(self, batch: FlashCausalLMBatch):
num_blocks = (
int(free_memory // total_cache_size)
# Add batch.blocks as we allocated it above, so it is included in the peak memory.
- + CACHE_MANAGER.num_blocks
+ + cache_manager.num_blocks
)
- del CACHE_MANAGER
del batch
- torch.cuda.empty_cache()
+ del cache_manager
- CACHE_MANAGER = CacheManager(
+ set_cache_manager(
num_blocks,
self.num_layers,
self.num_kv_heads,
self.head_size,
+ self.sliding_window is not None,
self.dtype,
self.device,
)
return int(num_blocks * BLOCK_SIZE)
- def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str:
- return self.tokenizer.decode(
- generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
- )
-
- def forward(
- self,
- input_ids: torch.Tensor,
- position_ids: torch.Tensor,
- cu_seqlen_prefill: Optional[torch.Tensor],
- block_tables: torch.Tensor,
- slots: torch.Tensor,
- input_lengths: torch.Tensor,
- max_s: int,
- lm_head_indices: Optional[torch.Tensor] = None,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
- global CACHE_MANAGER
-
+ def forward(self, batch: FlashCausalLMBatch) -> Tuple[torch.Tensor, torch.Tensor]:
# Model Forward
return self.model.forward(
- input_ids=input_ids,
- position_ids=position_ids,
- cu_seqlen_prefill=cu_seqlen_prefill,
- kv_cache=CACHE_MANAGER.kv_cache,
- block_tables=block_tables,
- slots=slots,
- input_lengths=input_lengths,
- max_s=max_s,
- lm_head_indices=lm_head_indices,
+ input_ids=batch.input_ids,
+ position_ids=batch.position_ids,
+ cu_seqlen_prefill=batch.cu_seqlen_prefill,
+ kv_cache=get_cache_manager().kv_cache,
+ block_tables=batch.block_tables_tensor,
+ slots=batch.slots[batch.slot_indices],
+ input_lengths=batch.input_lengths_tensor,
+ max_s=batch.max_seqlen,
+ lm_head_indices=batch.prefill_head_indices,
)
@tracer.start_as_current_span("generate_token")
@@ -833,19 +735,19 @@ def generate_token(
if batch.needed_blocks_slots:
# Allocate blocks to this batch
- CACHE_MANAGER.allocate(batch)
+ block_tables, block_tables_tensor, slots = get_cache_manager().allocate(
+ batch.needed_blocks_slots,
+ batch.blocks,
+ batch.max_blocks,
+ batch.input_ids.device,
+ )
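+            # The allocation is returned by the cache manager and attached to the batch below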
+ batch.needed_blocks_slots = None
+ batch.block_tables = block_tables
+ batch.block_tables_tensor = block_tables_tensor
+ batch.slots = slots
try:
- out = self.forward(
- batch.input_ids,
- batch.position_ids,
- batch.cu_seqlen_prefill,
- batch.block_tables_tensor,
- batch.slots[batch.slot_indices],
- batch.input_lengths_tensor,
- batch.max_seqlen,
- batch.prefill_head_indices,
- )
+ out = self.forward(batch)
except Exception as e:
del batch
raise e
@@ -1008,8 +910,14 @@ def generate_token(
if i % self.world_size == self.rank:
if stop:
# Decode generated tokens
- output_text = self.decode(
- all_input_ids[-stopping_criteria.current_tokens :]
+ output_text, _, _ = self.decode_token(
+ all_input_ids,
+ prefix_offset=len(all_input_ids)
+ - stopping_criteria.current_tokens
+ - 1,
+ read_offset=len(all_input_ids)
+ - stopping_criteria.current_tokens,
+ skip_special_tokens=True,
)
generated_text = GeneratedText(
output_text,
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
index 063aa01e1af..d2ed0b15a75 100644
--- a/server/text_generation_server/models/flash_llama.py
+++ b/server/text_generation_server/models/flash_llama.py
@@ -62,7 +62,7 @@ def __init__(
filenames = weight_files(model_id, revision=revision, extension=".safetensors")
weights = Weights(filenames, device, dtype, process_group=self.process_group)
- if config.quantize == "gptq":
+ if config.quantize in ["gptq", "awq"]:
weights._set_gptq_params(model_id)
model = FlashLlamaForCausalLM(config, weights)
diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
new file mode 100644
index 00000000000..919e4625a9d
--- /dev/null
+++ b/server/text_generation_server/models/flash_mistral.py
@@ -0,0 +1,357 @@
+import math
+import torch
+import torch.distributed
+
+import numpy as np
+
+from dataclasses import dataclass
+from opentelemetry import trace
+from transformers import PreTrainedTokenizerBase
+from transformers.models.llama import LlamaTokenizerFast
+from typing import Optional, Tuple, Type
+
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models import FlashCausalLM
+from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch, BLOCK_SIZE
+from text_generation_server.models.cache_manager import (
+ get_cache_manager,
+ set_cache_manager,
+)
+from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
+ FlashMistralForCausalLM,
+ MistralConfig,
+)
+from text_generation_server.utils import (
+ initialize_torch_distributed,
+ weight_files,
+ Weights,
+ HeterogeneousNextTokenChooser,
+ StoppingCriteria,
+)
+
+tracer = trace.get_tracer(__name__)
+
+# Will be set in init
+SLIDING_WINDOW: Optional[int] = None
+SLIDING_WINDOW_BLOCKS: Optional[int] = None
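+# SLIDING_WINDOW_BLOCKS = ceil(SLIDING_WINDOW / BLOCK_SIZE): the number of paged attention blocks needed to cover the window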
+
+
+# Adds windowing logic to FlashCausalLMBatch
+@dataclass
+class FlashMistralBatch(FlashCausalLMBatch):
+    # Prefill cache indices are used to slice into the kv tensor before caching it into the paged attention buffers
+ # as we only keep SLIDING_WINDOW values instead of the whole tensor
+ prefill_cache_indices: Optional[torch.Tensor] = None
+
+ @classmethod
+ def from_pb(
+ cls,
+ pb: generate_pb2.Batch,
+ tokenizer: PreTrainedTokenizerBase,
+ dtype: torch.dtype,
+ device: torch.device,
+ ) -> "FlashCausalLMBatch":
+ global SLIDING_WINDOW
+ global SLIDING_WINDOW_BLOCKS
+
+ batch_inputs = []
+ max_truncation = 0
+ for r in pb.requests:
+ batch_inputs.append(r.inputs)
+ max_truncation = max(max_truncation, r.truncate)
+
+ batch_tokenized_inputs = tokenizer(
+ batch_inputs, truncation=True, max_length=max_truncation
+ )["input_ids"]
+
+ position_ids = []
+ cu_seqlen_prefill = [0]
+ needed_blocks_slots = []
+ start_slots = []
+ slot_indices = []
+ prefill_cache_indices = []
+
+ input_lengths = []
+ prefix_offsets = []
+ read_offsets = []
+ all_input_ids = []
+ requests_idx_mapping = {}
+
+ all_prefill_logprobs = True
+ no_prefill_logprobs = True
+ prefill_head_indices = []
+ prefill_next_token_indices = []
+ prefill_cu_outlens = [0]
+
+ next_token_chooser_parameters = []
+ stopping_criterias = []
+ top_n_tokens = []
+
+ # Cumulative length
+ cumulative_length = 0
+ cumulative_max_length = 0
+ prefill_out_cumulative_length = 0
+
+ blocks = 0
+ max_seqlen = 0
+ max_length = 0
+ max_blocks = 0
+
+ # Parse batch
+ for i, (r, tokenized_input) in enumerate(
+ zip(pb.requests, batch_tokenized_inputs)
+ ):
+ # request id -> idx in list mapping
+ requests_idx_mapping[r.id] = i
+
+ tokenized_input = tokenized_input[-r.truncate :]
+
+ input_length = len(tokenized_input)
+ input_lengths.append(input_length)
+
+ prefix_offsets.append(input_length - 5)
+ read_offsets.append(input_length)
+
+ all_input_ids.append(tokenized_input)
+
+ # Position ids
+ request_position_ids = torch.arange(0, input_length, dtype=torch.int32)
+ position_ids.append(request_position_ids)
+
+ # Add cumulative lengths of all previous inputs
+ cu_seqlen_prefill.append(cumulative_length + input_length)
+
+ next_token_chooser_parameters.append(r.parameters)
+
+ stopping_criteria = StoppingCriteria.from_pb(
+ r.stopping_parameters, tokenizer
+ )
+ max_new_tokens = stopping_criteria.max_new_tokens
+ stopping_criterias.append(stopping_criteria)
+ top_n_tokens.append(r.top_n_tokens)
+
+ # Paged attention
+            # Remove one as the first token does not have a past
+ total_tokens = input_length + max_new_tokens - 1
+
+            # Needed blocks cannot exceed SLIDING_WINDOW_BLOCKS
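+            # e.g. with BLOCK_SIZE = 16 and a 4096-token sliding window, a sequence never needs more than 256 blocks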
+ needed_blocks = min(
+ math.ceil(total_tokens / BLOCK_SIZE), SLIDING_WINDOW_BLOCKS
+ )
+ blocks += needed_blocks
+
+ needed_blocks_slots.append((needed_blocks, total_tokens))
+ start_slots.append(cumulative_max_length)
+
+ request_slot_indices = torch.arange(
+ cumulative_max_length,
+ cumulative_max_length + input_length,
+ dtype=torch.int64,
+ )
+ slot_indices.append(request_slot_indices)
+
+ # Create tensor to slice into the kv tensor in prefill
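+            # e.g. with input_length = 6000 and SLIDING_WINDOW = 4096, only the last 4096 positions (1904..5999) of this request are cached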
+ request_prefill_cache_indices = torch.arange(
+ cumulative_length + max(0, input_length - SLIDING_WINDOW),
+ cumulative_length + input_length,
+ dtype=torch.int64,
+ )
+ prefill_cache_indices.append(request_prefill_cache_indices)
+
+ all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs
+ no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs
+
+ if r.prefill_logprobs:
+ prefill_head_indices.append(request_position_ids + cumulative_length)
+ prefill_next_token_indices.append(
+ prefill_out_cumulative_length + input_length - 1
+ )
+ prefill_cu_outlens.append(prefill_out_cumulative_length + input_length)
+ prefill_out_cumulative_length += input_length
+ else:
+ prefill_head_indices.append(
+ torch.tensor(
+ [cumulative_length + input_length - 1], dtype=torch.int32
+ )
+ )
+ prefill_next_token_indices.append(prefill_out_cumulative_length)
+ prefill_cu_outlens.append(prefill_out_cumulative_length + 1)
+ prefill_out_cumulative_length += 1
+
+ # Update
+ cumulative_length += input_length
+ cumulative_max_length += total_tokens
+ max_seqlen = max(max_seqlen, input_length)
+ max_blocks = max(max_blocks, needed_blocks)
+ max_length = max(max_length, input_length + max_new_tokens)
+
+ next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
+ next_token_chooser_parameters, dtype, device
+ )
+ start_slots = torch.tensor(start_slots, dtype=torch.int64)
+
+ # Padded all_input_ids_tensor
+ all_input_ids_tensor = np.zeros(
+ (len(all_input_ids), max_length), dtype=np.int64
+ )
+ for i, input_ids in enumerate(all_input_ids):
+ all_input_ids_tensor[i, : len(input_ids)] = input_ids
+
+ # Create tensors on device
+ all_input_ids_tensor = torch.tensor(
+ all_input_ids_tensor, dtype=torch.int64, device=device
+ )
+
+ if len(pb.requests) > 1:
+ input_ids = np.concatenate(all_input_ids, dtype=np.int64)
+ position_ids = torch.cat(position_ids)
+ slot_indices = torch.cat(slot_indices)
+ prefill_cache_indices = torch.cat(prefill_cache_indices)
+ else:
+ input_ids = all_input_ids[0]
+ position_ids = position_ids[0]
+ slot_indices = slot_indices[0]
+ prefill_cache_indices = prefill_cache_indices[0]
+
+ cu_seqlen_prefill = torch.tensor(
+ cu_seqlen_prefill, device=device, dtype=torch.int32
+ )
+
+ position_ids = position_ids.to(device)
+ slot_indices = slot_indices.to(device)
+ prefill_cache_indices = prefill_cache_indices.to(device)
+ input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
+ input_lengths_tensor = torch.tensor(
+ input_lengths, dtype=torch.int32, device=device
+ )
+
+ if all_prefill_logprobs:
+ prefill_head_indices = None
+ prefill_next_token_indices = cu_seqlen_prefill[1:] - 1
+ elif no_prefill_logprobs:
+ prefill_head_indices = cu_seqlen_prefill[1:] - 1
+ prefill_next_token_indices = None
+ else:
+ prefill_head_indices = torch.tensor(
+ torch.cat(prefill_head_indices), dtype=torch.int64, device=device
+ )
+ prefill_next_token_indices = torch.tensor(
+ prefill_next_token_indices, dtype=torch.int64, device=device
+ )
+ top_n_tokens_tensor = torch.tensor(
+ top_n_tokens, device=device, dtype=torch.int64
+ )
+
+ return cls(
+ batch_id=pb.id,
+ requests=pb.requests,
+ requests_idx_mapping=requests_idx_mapping,
+ input_ids=input_ids,
+ position_ids=position_ids,
+ cu_seqlen_prefill=cu_seqlen_prefill,
+ start_slots=start_slots,
+ slot_indices=slot_indices,
+ needed_blocks_slots=needed_blocks_slots,
+ block_tables=None,
+ block_tables_tensor=None,
+ slots=None,
+ max_seqlen=max_seqlen,
+ prefill_head_indices=prefill_head_indices,
+ prefill_next_token_indices=prefill_next_token_indices,
+ prefill_cu_outlens=prefill_cu_outlens,
+ input_lengths=input_lengths,
+ input_lengths_tensor=input_lengths_tensor,
+ prefix_offsets=prefix_offsets,
+ read_offsets=read_offsets,
+ all_input_ids=all_input_ids,
+ all_input_ids_tensor=all_input_ids_tensor,
+ next_token_chooser=next_token_chooser,
+ stopping_criterias=stopping_criterias,
+ top_n_tokens=top_n_tokens,
+ top_n_tokens_tensor=top_n_tokens_tensor,
+ blocks=blocks,
+ max_blocks=max_blocks,
+ prefill_cache_indices=prefill_cache_indices,
+ )
+
+
+class FlashMistral(FlashCausalLM):
+ def __init__(
+ self,
+ model_id: str,
+ revision: Optional[str] = None,
+ quantize: Optional[str] = None,
+ dtype: Optional[torch.dtype] = None,
+ trust_remote_code: bool = False,
+ ):
+ global SLIDING_WINDOW
+ global SLIDING_WINDOW_BLOCKS
+
+ self.process_group, rank, world_size = initialize_torch_distributed()
+ if torch.cuda.is_available():
+ device = torch.device(f"cuda:{rank}")
+ dtype = torch.float16 if dtype is None else dtype
+ else:
+            raise NotImplementedError("FlashMistral is only available on GPU")
+
+ tokenizer = LlamaTokenizerFast.from_pretrained(
+ model_id,
+ revision=revision,
+ padding_side="left",
+ truncation_side="left",
+ trust_remote_code=trust_remote_code,
+ )
+
+ config = MistralConfig.from_pretrained(
+ model_id, revision=revision, trust_remote_code=trust_remote_code
+ )
+ config.quantize = quantize
+
+ # Set context windows
+ SLIDING_WINDOW = config.sliding_window
+ SLIDING_WINDOW_BLOCKS = math.ceil(config.sliding_window / BLOCK_SIZE)
+
+ torch.distributed.barrier(group=self.process_group)
+
+ filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+ weights = Weights(filenames, device, dtype, process_group=self.process_group)
+ if config.quantize in ["gptq", "awq"]:
+ weights._set_gptq_params(model_id)
+
+ model = FlashMistralForCausalLM(config, weights)
+
+ torch.distributed.barrier(group=self.process_group)
+ super(FlashMistral, self).__init__(
+ model=model,
+ tokenizer=tokenizer,
+ num_layers=len(model.model.layers),
+ num_kv_heads=model.model.num_key_value_heads,
+ head_size=model.model.head_size,
+ dtype=dtype,
+ device=device,
+ rank=rank,
+ world_size=world_size,
+ sliding_window=config.sliding_window,
+ )
+
+ @property
+ def batch_type(self) -> Type[FlashMistralBatch]:
+ return FlashMistralBatch
+
+ def forward(self, batch: FlashMistralBatch) -> Tuple[torch.Tensor, torch.Tensor]:
+ # Model Forward
+ logits = self.model.forward(
+ input_ids=batch.input_ids,
+ position_ids=batch.position_ids,
+ cu_seqlen_prefill=batch.cu_seqlen_prefill,
+ kv_cache=get_cache_manager().kv_cache,
+ block_tables=batch.block_tables_tensor,
+ slots=batch.slots[batch.slot_indices],
+ input_lengths=batch.input_lengths_tensor,
+ max_s=batch.max_seqlen,
+ prefill_cache_indices=batch.prefill_cache_indices,
+ lm_head_indices=batch.prefill_head_indices,
+ )
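+        # prefill_cache_indices is only needed for the prefill pass; clear it so decode steps skip the slicing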
+ if batch.prefill_cache_indices is not None:
+ batch.prefill_cache_indices = None
+ return logits
diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py
index d42117342d0..b296c96ebf9 100644
--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@@ -80,6 +80,7 @@ def from_pb(
next_token_choosers = []
stopping_criterias = []
prefix_offsets = []
+ top_n_tokens = []
read_offsets = []
requests_idx_mapping = {}
@@ -96,6 +97,7 @@ def from_pb(
r.stopping_parameters, tokenizer
)
stopping_criterias.append(stopping_criteria)
+ top_n_tokens.append(r.top_n_tokens)
max_truncation = max(max_truncation, r.truncate)
max_decode_tokens += stopping_criteria.max_new_tokens
padding_right_offset = max(
@@ -129,6 +131,9 @@ def from_pb(
position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)
+ top_n_tokens_tensor = torch.tensor(
+ top_n_tokens, device=device, dtype=torch.int64
+ )
max_tokens = len(inputs) * max_input_length + max_decode_tokens
@@ -146,6 +151,8 @@ def from_pb(
read_offsets=read_offsets,
next_token_choosers=next_token_choosers,
stopping_criterias=stopping_criterias,
+ top_n_tokens=top_n_tokens,
+ top_n_tokens_tensor=top_n_tokens_tensor,
max_input_length=max_input_length.item(),
padding_right_offset=padding_right_offset,
max_tokens=max_tokens,
@@ -167,7 +174,7 @@ def __init__(
dtype = torch.float16 if dtype is None else dtype
else:
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py
index accedf14950..d4c64dfe77b 100644
--- a/server/text_generation_server/models/gpt_neox.py
+++ b/server/text_generation_server/models/gpt_neox.py
@@ -33,7 +33,7 @@ def __init__(
dtype = torch.float16 if dtype is None else dtype
else:
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py
index c54b539b0cd..fa23d1f9cd0 100644
--- a/server/text_generation_server/models/idefics.py
+++ b/server/text_generation_server/models/idefics.py
@@ -42,7 +42,7 @@ def __init__(
dtype = torch.bfloat16 if dtype is None else dtype
else:
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
self.device, self.dtype = device, dtype
config = IdeficsConfig.from_pretrained(
diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index 2dac87bc787..2472caf6223 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -8,7 +8,13 @@
from dataclasses import dataclass
from opentelemetry import trace
-from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase, ProcessorMixin
+from transformers import (
+ AutoProcessor,
+ AutoTokenizer,
+ AutoModelForCausalLM,
+ PreTrainedTokenizerBase,
+ ProcessorMixin,
+)
from typing import Optional, Tuple, List, Type, Dict
from text_generation_server.models import Model
@@ -23,7 +29,8 @@
import re
-IMAGES = re.compile(r'!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)')
+IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)")
+
def split(string):
parts = []
@@ -41,6 +48,7 @@ def split(string):
return parts
+
tracer = trace.get_tracer(__name__)
@@ -94,7 +102,7 @@ def from_pb(
cls,
pb: generate_pb2.Batch,
tokenizer: PreTrainedTokenizerBase,
- processor: ProcessorMixin, # Hack
+ processor: ProcessorMixin, # Hack
dtype: torch.dtype,
device: torch.device,
) -> "IdeficsCausalLMBatch":
@@ -137,12 +145,16 @@ def from_pb(
padding=True,
truncation=True,
max_length=max_truncation,
- add_end_of_utterance_token=False, # Already taken care of inside the prompts, so bypassing the processor's handling of this token
+ add_end_of_utterance_token=False, # Already taken care of inside the prompts, so bypassing the processor's handling of this token
).to(device)
for _ in pb.requests:
input_len = tokenized_inputs["input_ids"].shape[1]
- prefix_offsets.append(input_len - 5) # To decode without potential fallbacks errors
- read_offsets.append(input_len) # To decode without potential fallbacks errors
+            prefix_offsets.append(
+                input_len - 5
+            )  # To decode without potential fallback errors
+            read_offsets.append(
+                input_len
+            )  # To decode without potential fallback errors
input_lengths = tokenized_inputs["attention_mask"].sum(1)
max_input_length = input_lengths.max()
@@ -158,14 +170,21 @@ def from_pb(
attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]
# Do the same for image_attention_mask
image_attention_mask = input_ids.new_zeros(
- (pb.size, max_input_length + padding_right_offset, tokenized_inputs["pixel_values"].size(1))
+ (
+ pb.size,
+ max_input_length + padding_right_offset,
+ tokenized_inputs["pixel_values"].size(1),
+ )
)
- image_attention_mask[:, :max_input_length, :] = tokenized_inputs["image_attention_mask"]
-
+ image_attention_mask[:, :max_input_length, :] = tokenized_inputs[
+ "image_attention_mask"
+ ]
position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
- all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1) # It's input_ids but splitted into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list
+ all_input_ids = tokenized_inputs["input_ids"].T.split(
+ 1, dim=1
+        )  # It's input_ids but split into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list
max_tokens = len(inputs) * (max_input_length + max_decode_tokens)
@@ -259,7 +278,7 @@ def filter(self, request_ids: List[int]) -> Optional["IdeficsCausalLMBatch"]:
self.image_attention_mask.shape[1] - self.padding_right_offset
)
+ new_padding_right_offset,
- :
+ :,
]
if self.image_hidden_states is None:
image_hidden_states = None
@@ -308,7 +327,9 @@ def filter(self, request_ids: List[int]) -> Optional["IdeficsCausalLMBatch"]:
@classmethod
@tracer.start_as_current_span("concatenate")
- def concatenate(cls, batches: List["IdeficsCausalLMBatch"]) -> "IdeficsCausalLMBatch":
+ def concatenate(
+ cls, batches: List["IdeficsCausalLMBatch"]
+ ) -> "IdeficsCausalLMBatch":
# It adds new requests to the batch
# Used for padding
total_batch_size = 0
@@ -383,12 +404,20 @@ def concatenate(cls, batches: List["IdeficsCausalLMBatch"]) -> "IdeficsCausalLMB
curr_batch_max_num_images = batch.pixel_values.size(1)
if pixel_values is None:
- pixel_values = batch.pixel_values.new_zeros((total_batch_size, max_num_images, 3, 224, 224))
- pixel_values[start_index:end_index, :curr_batch_max_num_images] = batch.pixel_values
+ pixel_values = batch.pixel_values.new_zeros(
+ (total_batch_size, max_num_images, 3, 224, 224)
+ )
+ pixel_values[
+ start_index:end_index, :curr_batch_max_num_images
+ ] = batch.pixel_values
if image_attention_mask is None:
image_attention_mask = batch.image_attention_mask.new_zeros(
- (total_batch_size, max_input_length + padding_right_offset, max_num_images)
+ (
+ total_batch_size,
+ max_input_length + padding_right_offset,
+ max_num_images,
+ )
)
# We need to slice the attention mask to remove padding from previous steps
@@ -409,11 +438,9 @@ def concatenate(cls, batches: List["IdeficsCausalLMBatch"]) -> "IdeficsCausalLMB
image_attention_mask[
start_index:end_index,
left_offset:-padding_right_offset,
- :curr_batch_max_num_images
+ :curr_batch_max_num_images,
] = batch.image_attention_mask[
- :,
- batch_left_offset : - batch.padding_right_offset,
- :
+ :, batch_left_offset : -batch.padding_right_offset, :
]
# Create empty tensor
@@ -550,7 +577,9 @@ def __init__(
dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
):
- from text_generation_server.models.custom_modeling.idefics_modeling import IdeficsForVisionText2Text
+ from text_generation_server.models.custom_modeling.idefics_modeling import (
+ IdeficsForVisionText2Text,
+ )
if torch.cuda.is_available():
device = torch.device("cuda")
@@ -560,7 +589,7 @@ def __init__(
raise ValueError("quantization is not available on CPU")
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
@@ -611,11 +640,6 @@ def __init__(
def batch_type(self) -> Type[IdeficsCausalLMBatch]:
return IdeficsCausalLMBatch
- def decode(self, generated_ids: List[int]) -> str:
- return self.tokenizer.decode(
- generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
- )
-
def forward(
self,
input_ids,
@@ -655,9 +679,13 @@ def generate_token(
# this is due to the nature IDEFICS: it's an encoder decoder, and so when decoding, only the currently generated
# token need to attend to the encoder hidden states (i.e. the vision encoder)
# Also see seq2seq_lm.Seq2SeqLM.generate_token which has roughly the same logic
- image_attention_mask = batch.image_attention_mask[:, -(batch.padding_right_offset+1)].unsqueeze(1)
+ image_attention_mask = batch.image_attention_mask[
+ :, -(batch.padding_right_offset + 1)
+ ].unsqueeze(1)
else:
- image_attention_mask = batch.image_attention_mask[:, : -batch.padding_right_offset]
+ image_attention_mask = batch.image_attention_mask[
+ :, : -batch.padding_right_offset
+ ]
logits, past, image_hidden_states = self.forward(
input_ids=batch.input_ids,
@@ -728,8 +756,14 @@ def generate_token(
if i % self.world_size == self.rank:
if stop:
# Decode generated tokens
- output_text = self.decode(
- all_input_ids[-stopping_criteria.current_tokens :, 0]
+ output_text, _, _ = self.decode_token(
+ all_input_ids[:, 0],
+ prefix_offset=len(all_input_ids)
+ - stopping_criteria.current_tokens
+ - 1,
+ read_offset=len(all_input_ids)
+ - stopping_criteria.current_tokens,
+ skip_special_tokens=True,
)
# Get seed
if isinstance(next_token_chooser.choice, Sampling):
@@ -763,7 +797,7 @@ def generate_token(
else:
prefill_tokens = None
- top_tokens=None
+ top_tokens = None
generation = Generation(
request.id,
@@ -773,7 +807,7 @@ def generate_token(
next_token_text,
next_token_id_squeezed.item() in self.all_special_ids,
generated_text,
- top_tokens
+ top_tokens,
)
generations.append(generation)
@@ -795,7 +829,9 @@ def generate_token(
# Update attention_mask as we added a new token to input_ids
batch.attention_mask[:, -batch.padding_right_offset] = 1
- batch.image_attention_mask[:, -batch.padding_right_offset, :] = batch.image_attention_mask[:, -(batch.padding_right_offset+1), :]
+ batch.image_attention_mask[
+ :, -batch.padding_right_offset, :
+ ] = batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :]
# Decrease right offset
batch.padding_right_offset -= 1
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index 806e98332b0..17d2ea9b433 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -21,6 +21,7 @@ def __init__(
device: torch.device,
rank: int = 0,
world_size: int = 1,
+ sliding_window: Optional[int] = None,
):
self.model = model.eval()
self.tokenizer = tokenizer
@@ -30,6 +31,7 @@ def __init__(
self.device = device
self.rank = rank
self.world_size = world_size
+ self.sliding_window = sliding_window
self.has_position_ids = (
inspect.signature(model.forward).parameters.get("position_ids", None)
@@ -40,10 +42,14 @@ def __init__(
@property
def info(self) -> InfoResponse:
+ if self.requires_padding and self.sliding_window is not None:
+ raise NotImplementedError("sliding_window is not implemented with padding")
+
return InfoResponse(
requires_padding=self.requires_padding,
dtype=str(self.dtype),
device_type=self.device.type,
+ window_size=self.sliding_window,
)
@property
@@ -64,16 +70,18 @@ def decode_token(
all_input_ids: List[int],
prefix_offset: int = 0,
read_offset: int = 0,
+ skip_special_tokens: bool = False,
) -> Tuple[str, int, int]:
"""Hack to hopefully support generate_stream for the maximum number of tokenizers"""
# The prefix text is necessary only to defeat cleanup algorithms in the decode
# which decide to add a space or not depending on the surrounding ids.
prefix_text = self.tokenizer.decode(
- all_input_ids[prefix_offset:read_offset], skip_special_tokens=False
+ all_input_ids[prefix_offset:read_offset],
+ skip_special_tokens=skip_special_tokens,
)
new_text = self.tokenizer.decode(
- all_input_ids[prefix_offset:], skip_special_tokens=False
+ all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
)
if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py
index 909d985231a..19de497c517 100644
--- a/server/text_generation_server/models/mpt.py
+++ b/server/text_generation_server/models/mpt.py
@@ -43,14 +43,16 @@ def __init__(
model_id: str,
revision: Optional[str] = None,
quantize: Optional[str] = None,
+ dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
):
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
- dtype = torch.float16
+ dtype = torch.float16 if dtype is None else dtype
else:
- raise NotImplementedError("MPTSharded is only available on GPU")
+ device = torch.device("cpu")
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py
index f3a23d0733e..b2b87246375 100644
--- a/server/text_generation_server/models/opt.py
+++ b/server/text_generation_server/models/opt.py
@@ -31,7 +31,7 @@ def __init__(
dtype = torch.float16 if dtype is None else dtype
else:
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py
index d97c1c73657..802a4aa676e 100644
--- a/server/text_generation_server/models/rw.py
+++ b/server/text_generation_server/models/rw.py
@@ -23,7 +23,7 @@ def __init__(
raise ValueError("quantization is not available on CPU")
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py
index 81928c1d7e7..7b269d8e6a9 100644
--- a/server/text_generation_server/models/santacoder.py
+++ b/server/text_generation_server/models/santacoder.py
@@ -30,7 +30,7 @@ def __init__(
raise ValueError("quantization is not available on CPU")
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
tokenizer = AutoTokenizer.from_pretrained(
model_id,
diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
index 361453fb479..d4d3cd19f97 100644
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -541,7 +541,7 @@ def __init__(
raise ValueError("quantization is not available on CPU")
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
model = AutoModelForSeq2SeqLM.from_pretrained(
model_id,
@@ -642,7 +642,7 @@ def generate_token(
batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
batch.top_n_tokens,
batch.top_n_tokens_tensor,
- torch.softmax(logits[:, -1], -1),
+ torch.log_softmax(logits[:, -1], -1),
)
# Finished requests
@@ -710,8 +710,13 @@ def generate_token(
if stop:
# Slice with decoder_input_length to remove padding
# Decode all tokens
- output_text = self.decode(
- all_decoder_input_ids[-decoder_input_length:]
+ output_text, _, _ = self.decode_token(
+ all_decoder_input_ids,
+ prefix_offset=len(all_decoder_input_ids)
+ - decoder_input_length
+ - 1,
+ read_offset=len(all_decoder_input_ids) - decoder_input_length,
+ skip_special_tokens=True,
)
# Get seed
diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py
index 133aafd8082..161e69ba891 100644
--- a/server/text_generation_server/models/t5.py
+++ b/server/text_generation_server/models/t5.py
@@ -34,7 +34,7 @@ def __init__(
dtype = torch.float16 if dtype is None else dtype
else:
device = torch.device("cpu")
- dtype = torch.float32
+ dtype = torch.float32 if dtype is None else dtype
config = AutoConfig.from_pretrained(
model_id,
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 67137aaa516..75d2b159ceb 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -16,6 +16,7 @@
from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor
from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch
+
class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
def __init__(self, model: Model, cache: Cache, server_urls: List[str]):
self.cache = cache
@@ -26,7 +27,6 @@ def __init__(self, model: Model, cache: Cache, server_urls: List[str]):
# Force inference mode for the lifetime of TextGenerationService
self._inference_mode_raii_guard = torch._C._InferenceMode(True)
-
async def Info(self, request, context):
return self.model.info
@@ -55,9 +55,15 @@ async def FilterBatch(self, request, context):
return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
async def Warmup(self, request, context):
- if self.model.batch_type == IdeficsCausalLMBatch: #Hack, i would rather use kwargs in the `from_pb` call
+ if (
+ self.model.batch_type == IdeficsCausalLMBatch
+        ):  # Hack, I would rather use kwargs in the `from_pb` call
batch = self.model.batch_type.from_pb(
- request.batch, self.model.tokenizer, self.model.processor, self.model.dtype, self.model.device
+ request.batch,
+ self.model.tokenizer,
+ self.model.processor,
+ self.model.dtype,
+ self.model.device,
)
else:
batch = self.model.batch_type.from_pb(
@@ -70,9 +76,15 @@ async def Warmup(self, request, context):
)
async def Prefill(self, request, context):
- if self.model.batch_type == IdeficsCausalLMBatch: #Hack, i would rather use kwargs in the `from_pb` call
+ if (
+ self.model.batch_type == IdeficsCausalLMBatch
+        ):  # Hack, I would rather use kwargs in the `from_pb` call
batch = self.model.batch_type.from_pb(
- request.batch, self.model.tokenizer, self.model.processor, self.model.dtype, self.model.device
+ request.batch,
+ self.model.tokenizer,
+ self.model.processor,
+ self.model.dtype,
+ self.model.device,
)
else:
batch = self.model.batch_type.from_pb(
diff --git a/server/text_generation_server/utils/awq/quantize/qmodule.py b/server/text_generation_server/utils/awq/quantize/qmodule.py
new file mode 100644
index 00000000000..ca8caf5080c
--- /dev/null
+++ b/server/text_generation_server/utils/awq/quantize/qmodule.py
@@ -0,0 +1,50 @@
+# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py
+
+import math
+import torch
+import torch.nn as nn
+import awq_inference_engine # with CUDA kernels
+
+
+# class ScaledActivation(nn.Module):
+# def __init__(self, module, scales):
+# super().__init__()
+# self.act = module
+# self.scales = nn.Parameter(scales.data)
+#
+# def forward(self, x):
+# return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
+
+
+class WQLinear(nn.Module):
+ def __init__(self, w_bit, group_size, qweight, qzeros, scales, bias):
+ super().__init__()
+
+ if w_bit not in [4]:
+            raise NotImplementedError("Only 4-bit is supported for now.")
+
+ self.in_features = qweight.shape[0]
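+        # each int32 column of qweight packs 32 // w_bit quantized values, hence the unpacked out_features below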
+ self.out_features = qweight.shape[1] * 32 // w_bit
+
+ self.w_bit = w_bit
+ self.group_size = group_size if group_size != -1 else self.in_features
+        # quick sanity check (make sure the alignment is correct)
+ assert self.in_features % self.group_size == 0
+ assert self.out_features % (32 // self.w_bit) == 0
+
+ self.qweight = qweight
+ self.qzeros = qzeros
+ self.scales = scales
+ if bias:
+ self.bias = bias
+ else:
+ self.bias = None
+
+ @torch.no_grad()
+ def forward(self, x):
+ out_shape = x.shape[:-1] + (self.out_features,)
+ out = awq_inference_engine.gemm_forward_cuda(
+ x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
+ )
+ out = out + self.bias if self.bias is not None else out
+ return out.reshape(out_shape)
diff --git a/server/text_generation_server/utils/convert.py b/server/text_generation_server/utils/convert.py
index 8d414ecac91..0b62f520836 100644
--- a/server/text_generation_server/utils/convert.py
+++ b/server/text_generation_server/utils/convert.py
@@ -29,9 +29,15 @@ def _remove_duplicate_names(
[name for name in shared if _is_complete(state_dict[name])]
)
if not complete_names:
- raise RuntimeError(
- f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue."
- )
+ if len(shared) == 1:
+ # Force contiguous
+ name = list(shared)[0]
+ state_dict[name] = state_dict[name].clone()
+ complete_names = {name}
+ else:
+ raise RuntimeError(
+                f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage. Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue."
+ )
keep_name = sorted(list(complete_names))[0]
diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py
index c472d1fceab..caf072b764d 100644
--- a/server/text_generation_server/utils/flash_attn.py
+++ b/server/text_generation_server/utils/flash_attn.py
@@ -57,6 +57,7 @@ def attention(
cu_seqlens,
max_s,
softmax_scale,
+ window_size_left=-1,
):
if HAS_FLASH_ATTN_V2:
return flash_attn_2_cuda.varlen_fwd(
@@ -72,11 +73,18 @@ def attention(
softmax_scale,
False,
True,
+ window_size_left,
+ 0,
False,
None,
)
if HAS_FLASH_ATTN:
+        if window_size_left != -1:
+ raise NotImplementedError(
+ "window_size_left is only available with flash attn v2"
+ )
+
# Flash attention v1 requires q, k and v to have the same number of heads
if k.shape[1] != q.shape[1]:
# MQA expand
diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/utils/gptq/exllama.py
index 6a1cf117a4c..7353afb57c6 100644
--- a/server/text_generation_server/utils/gptq/exllama.py
+++ b/server/text_generation_server/utils/gptq/exllama.py
@@ -69,10 +69,11 @@ def create_exllama_buffers():
TEMP_STATE, TEMP_DQ = temp_state, temp_dq
-class Ex4bitLinear:
+class Ex4bitLinear(torch.nn.Module):
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
+ super().__init__()
global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
assert bits == 4
diff --git a/server/text_generation_server/utils/gptq/quantize.py b/server/text_generation_server/utils/gptq/quantize.py
index 9547d5341c0..ca113d8fe99 100644
--- a/server/text_generation_server/utils/gptq/quantize.py
+++ b/server/text_generation_server/utils/gptq/quantize.py
@@ -578,7 +578,9 @@ def __init__(self, input_ids):
return trainloader, valenc
-def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False):
+def get_loaders(
+ name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False
+):
if "wikitext2" in name:
return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code)
if "ptb" in name:
@@ -927,7 +929,7 @@ def _unload():
seed=seed,
model_id=model_id,
seqlen=model.seqlen,
- trust_remote_code=trust_remote_code
+ trust_remote_code=trust_remote_code,
)
tick = time.time()
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 6be54048901..cf61e47b98b 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -5,6 +5,8 @@
from torch import nn
from torch.nn import functional as F
from typing import List
+from loguru import logger
+from functools import lru_cache
HAS_BITS_AND_BYTES = True
try:
@@ -18,6 +20,13 @@
from text_generation_server.utils.gptq.quant_linear import QuantLinear
+
+HAS_AWQ = True
+try:
+ from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+except ImportError:
+ HAS_AWQ = False
+
try:
major, _minor = torch.cuda.get_device_capability()
except Exception:
@@ -27,14 +36,24 @@
if os.getenv("DISABLE_EXLLAMA") == "True":
HAS_EXLLAMA = False
elif CAN_EXLLAMA:
- try:
- from text_generation_server.utils.gptq.exllama import Ex4bitLinear
- HAS_EXLLAMA = True
- except ImportError:
- pass
+ try:
+ from text_generation_server.utils.gptq.exllama import Ex4bitLinear
+
+ HAS_EXLLAMA = True
+ except ImportError:
+ pass
from typing import Optional
+HAS_EETQ = False
+try:
+ from EETQ import quant_weights, w8_a16_gemm
+
+ HAS_EETQ = True
+except ImportError:
+ pass
+
+
# Monkey patching
@classmethod
def load_layer_norm(cls, prefix, weights, eps):
@@ -58,12 +77,18 @@ def load_layer_norm_no_bias(cls, prefix, weights, eps):
ln.bias = None
return ln
+
@classmethod
def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
weight = weights.get_tensor(f"{prefix}.weight")
bias = weights.get_tensor(f"{prefix}.bias")
with init_empty_weights():
- conv2d = cls(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
+ conv2d = cls(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ )
conv2d.weight = nn.Parameter(weight)
conv2d.bias = nn.Parameter(bias)
@@ -71,10 +96,17 @@ def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, st
@classmethod
-def load_conv2d_no_bias(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
+def load_conv2d_no_bias(
+ cls, prefix, weights, in_channels, out_channels, kernel_size, stride
+):
weight = weights.get_tensor(f"{prefix}.weight")
with init_empty_weights():
- conv2d = cls(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
+ conv2d = cls(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ )
conv2d.weight = nn.Parameter(weight)
conv2d.bias = None
@@ -113,6 +145,30 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.linear(input, self.weight, self.bias)
+class EETQLinear(nn.Module):
+ def __init__(
+ self,
+ weight,
+ bias,
+ ) -> None:
+ super().__init__()
+ device = weight.device
+ weight = torch.t(weight).contiguous().cpu()
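+        # quant_weights returns the int8-quantized weight and the scales consumed by w8_a16_gemm in forward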
+ weight, scale = quant_weights(weight, torch.int8, False)
+        # bias is passed in directly (a tensor or None); nothing to re-load here
+ self.weight = weight.cuda(device)
+ self.scale = scale.cuda(device)
+ self.bias = bias.cuda(device) if bias is not None else None
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ output = w8_a16_gemm(input, self.weight, self.scale)
+ output = output + self.bias if self.bias is not None else output
+ return output
+
+
class Linear8bitLt(nn.Module):
def __init__(
self,
@@ -175,7 +231,10 @@ class Linear4bit(nn.Module):
def __init__(self, weight, bias, quant_type):
super().__init__()
self.weight = Params4bit(
- weight.data, requires_grad=False, compress_statistics=True, quant_type=quant_type
+ weight.data,
+ requires_grad=False,
+ compress_statistics=True,
+ quant_type=quant_type,
)
self.compute_dtype = None
self.weight.cuda(weight.device)
@@ -204,10 +263,25 @@ def forward(self, x: torch.Tensor):
return out
+@lru_cache(1)
+def warn_deprecate_bnb():
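+    # lru_cache(1) makes this a one-shot warning: subsequent calls hit the cache and do not log again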
+ logger.warning(
+        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance"
+ )
+
+
def get_linear(weight, bias, quantize):
if quantize is None:
linear = FastLinear(weight, bias)
+ elif quantize == "eetq":
+ if HAS_EETQ:
+ linear = EETQLinear(weight, bias)
+ else:
+ raise ImportError(
+ "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
+ )
elif quantize == "bitsandbytes":
+ warn_deprecate_bnb()
linear = Linear8bitLt(
weight,
bias,
@@ -248,6 +322,21 @@ def get_linear(weight, bias, quantize):
bits,
groupsize,
)
+ elif quantize == "awq":
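+        # weight is the tuple built by Weights.get_multi_weights_*: (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)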
+ try:
+ qweight, qzeros, scales, _, bits, groupsize, _ = weight
+ except Exception:
+ raise NotImplementedError(
+                "The passed weight is not `awq` compatible, the loader needs to be updated."
+ )
+ linear = WQLinear(
+ w_bit=bits,
+ group_size=groupsize,
+ qweight=qweight,
+ qzeros=qzeros,
+ scales=scales,
+ bias=bias is not None,
+ )
else:
raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
return linear
@@ -283,8 +372,8 @@ def load(config, prefix: str, weights):
weight = weights.get_tensor(f"{prefix}.weight")
should_gather = False
- # GPTQ doesn't quantize heads (nor embeddings)
- if config.quantize == "gptq":
+        # GPTQ, AWQ and EETQ don't quantize heads (nor embeddings)
+ if config.quantize in ["gptq", "awq", "eetq"]:
quantize = None
else:
quantize = config.quantize
@@ -331,6 +420,17 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
class TensorParallelColumnLinear(SuperLayer):
+ @classmethod
+ def load_qkv(cls, config, prefix: str, weights, bias: bool):
+ """Specific method when the QKV was joined after the fact"""
+ weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize)
+ if bias:
+ raise NotImplementedError("packed_qkv only implemented for baichuan")
+ else:
+ bias = None
+ linear = get_linear(weight, bias, config.quantize)
+ return cls(linear)
+
@classmethod
def load(cls, config, prefix: str, weights, bias: bool):
return cls.load_multi(config, [prefix], weights, bias, dim=0)
@@ -459,14 +559,16 @@ def forward(self, hidden_states, residual=None):
def _create_inv_freq(dim, base, device):
inv_freq = 1.0 / (
- base
- ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
+ base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
)
return inv_freq
def _get_rope_config(config):
if os.getenv("ROPE_SCALING", None) is not None:
- rope_scaling = {"type": os.environ["ROPE_SCALING"], "factor": float(os.environ["ROPE_FACTOR"])}
+ rope_scaling = {
+ "type": os.environ["ROPE_SCALING"],
+ "factor": float(os.environ["ROPE_FACTOR"]),
+ }
return rope_scaling
return getattr(config, "rope_scaling", None)
@@ -492,9 +594,17 @@ def static(cls, config, dim, base, device):
if rope_scaling["type"] == "linear":
pass
elif rope_scaling["type"] == "dynamic":
- return DynamicPositionRotaryEmbedding(dim=dim, max_position_embeddings=config.max_position_embeddings, base=base, device=inv_freq.device, scaling_factor=scaling_factor)
+ return DynamicPositionRotaryEmbedding(
+ dim=dim,
+ max_position_embeddings=config.max_position_embeddings,
+ base=base,
+ device=inv_freq.device,
+ scaling_factor=scaling_factor,
+ )
else:
- raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
+ raise NotImplementedError(
+ f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
+ )
return cls(inv_freq, scaling_factor)
@classmethod
@@ -512,9 +622,17 @@ def load(cls, config, prefix, weights):
if rope_scaling["type"] == "linear":
pass
elif rope_scaling["type"] == "dynamic":
- return DynamicPositionRotaryEmbedding(dim=2*inv_freq.shape[0], max_position_embeddings=config.max_position_embeddings, base=10000.0, device=inv_freq.device, scaling_factor=scaling_factor)
+ return DynamicPositionRotaryEmbedding(
+ dim=2 * inv_freq.shape[0],
+ max_position_embeddings=config.max_position_embeddings,
+ base=10000.0,
+ device=inv_freq.device,
+ scaling_factor=scaling_factor,
+ )
else:
- raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
+ raise NotImplementedError(
+ f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
+ )
return cls(inv_freq, scaling_factor)
def _update_cos_sin_cache(self, dtype, device, seqlen):
@@ -574,8 +692,13 @@ def _update_cos_sin_cache(self, dtype, device, seqlen):
or self._cos_cached.dtype != dtype
):
if seqlen > self.max_position_embeddings:
- newbase = self.base * ((self.scaling_factor * seqlen / self.max_position_embeddings) - (self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
- self.inv_freq = _create_inv_freq(self.dim, newbase, self.inv_freq.device)
+ newbase = self.base * (
+ (self.scaling_factor * seqlen / self.max_position_embeddings)
+ - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ self.inv_freq = _create_inv_freq(
+ self.dim, newbase, self.inv_freq.device
+ )
self._seq_len_cached = seqlen
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
# Don't do einsum, it converts fp32 to fp16
@@ -585,6 +708,5 @@ def _update_cos_sin_cache(self, dtype, device, seqlen):
self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype)
-
except ImportError:
pass
diff --git a/server/text_generation_server/utils/peft.py b/server/text_generation_server/utils/peft.py
index be1f944422b..e37447dcc7b 100644
--- a/server/text_generation_server/utils/peft.py
+++ b/server/text_generation_server/utils/peft.py
@@ -6,6 +6,7 @@
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
+
def download_and_unload_peft(model_id, revision, trust_remote_code):
torch_dtype = torch.float16
@@ -33,7 +34,7 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
base_model_id = model.peft_config["default"].base_model_name_or_path
model = model.merge_and_unload()
-
+
os.makedirs(model_id, exist_ok=True)
cache_dir = model_id
logger.info(f"Saving the newly created merged model to {cache_dir}")
@@ -41,6 +42,3 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
model.save_pretrained(cache_dir, safe_serialization=True)
model.config.save_pretrained(cache_dir)
tokenizer.save_pretrained(cache_dir)
-
-
-
diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py
index 7b003f1dc65..f6339d7c1f7 100644
--- a/server/text_generation_server/utils/tokens.py
+++ b/server/text_generation_server/utils/tokens.py
@@ -363,7 +363,7 @@ def batch_top_tokens(
# Find the new "fuzzy" top n values
top_n_indices = (logprobs >= nth_highest).nonzero()
_, top_n_ishes = torch.unique_consecutive(top_n_indices[:, 0], return_counts=True)
-
+
k = 1 if top_n_ishes.numel() == 0 else top_n_ishes.max()
# Take a new topk for these new max n values
top_k = torch.topk(logprobs, k=k, dim=1, sorted=True)
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index 261456bd467..8a19fd9f722 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -62,7 +62,7 @@ def _get_slice(self, tensor_name: str):
def get_shape(self, tensor_name: str):
return self._get_slice(tensor_name).get_shape()
- def get_tensor(self, tensor_name: str):
+ def get_tensor(self, tensor_name: str, to_device=True):
filename, tensor_name = self.get_filename(tensor_name)
f = self._get_handle(filename)
tensor = f.get_tensor(tensor_name)
@@ -70,16 +70,17 @@ def get_tensor(self, tensor_name: str):
# u4 which are disguised as int32
if tensor.dtype not in [torch.int32, torch.int64]:
tensor = tensor.to(dtype=self.dtype)
- tensor = tensor.to(device=self.device)
+ if to_device:
+ tensor = tensor.to(device=self.device)
return tensor
def get_partial_sharded(self, tensor_name: str, dim: int):
filename, tensor_name = self.get_filename(tensor_name)
+ f = self._get_handle(filename)
+ slice_ = f.get_slice(tensor_name)
world_size = self.process_group.size()
rank = self.process_group.rank()
- f = self._get_handle(filename)
- slice_ = f.get_slice(tensor_name)
size = slice_.get_shape()[dim]
block_size = size // world_size
start = rank * block_size
@@ -109,15 +110,81 @@ def get_sharded(self, tensor_name: str, dim: int):
), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
return self.get_partial_sharded(tensor_name, dim)
+ def _get_qweight(self, name: str):
+ slice_ = self._get_slice(name)
+ total_size = slice_.get_shape()[1]
+ assert total_size % 3 == 0, "Prepacked quantized qkv is not divisible by 3"
+ single_size = total_size // 3
+ world_size = self.process_group.size()
+ rank = self.process_group.rank()
+
+ assert (
+ single_size % world_size == 0
+ ), f"Prepacked quantized qkv cannot be sharded across {world_size} shards"
+ block_size = single_size // world_size
+ start = rank * block_size
+ stop = (rank + 1) * block_size
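+        # the packed tensor is laid out as [Q | K | V] along dim 1; each rank takes its slice from every third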
+ q = slice_[:, start:stop]
+ k = slice_[:, start + single_size : stop + single_size]
+ v = slice_[:, start + 2 * single_size : stop + 2 * single_size]
+ weight = torch.cat([q, k, v], dim=1)
+ weight = weight.to(device=self.device)
+ return weight
+
+ def get_weights_col_packed_qkv(self, prefix: str, quantize: str):
+ """
+        Highly specific loader for when the underlying tensor is a simple concatenation of Q, K, V
+        instead of the usual interleaved Q, K, V layout within the main tensor
+ """
+ if quantize in ["gptq", "awq"]:
+ try:
+ qweight = self._get_qweight(f"{prefix}.qweight")
+ except RuntimeError:
+ raise RuntimeError(
+ f"Cannot load `{quantize}` weight, make sure the model is already quantized."
+ )
+
+ qzeros = self._get_qweight(f"{prefix}.qzeros")
+ scales = self._get_qweight(f"{prefix}.scales")
+ scales = scales.to(dtype=self.dtype)
+ if quantize == "gptq":
+ g_idx = self.get_tensor(f"{prefix}.g_idx")
+ else:
+ g_idx = None
+
+ bits, groupsize = self._get_gptq_params()
+ weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
+ else:
+ slice_ = self._get_slice(f"{prefix}.weight")
+ total_size = slice_.get_shape()[0]
+ assert total_size % 3 == 0, "Prepacked qkv is not divisible by 3"
+ single_size = total_size // 3
+ world_size = self.process_group.size()
+ rank = self.process_group.rank()
+
+ assert (
+ single_size % world_size == 0
+ ), f"Prepacked qkv cannot be sharded across {world_size} shards"
+ block_size = single_size // world_size
+ start = rank * block_size
+ stop = (rank + 1) * block_size
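+            # the unquantized weight is laid out as [Q | K | V] along dim 0; take this rank's slice from each third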
+ q = slice_[start:stop]
+ k = slice_[start + single_size : stop + single_size]
+ v = slice_[start + 2 * single_size : stop + 2 * single_size]
+ weight = torch.cat([q, k, v], dim=0)
+ weight = weight.to(device=self.device)
+ weight = weight.to(dtype=self.dtype)
+ return weight
+
def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
- if quantize == "gptq":
+ if quantize in ["gptq", "awq"]:
try:
qweight = torch.cat(
[self.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
)
except RuntimeError:
raise RuntimeError(
- "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
+ f"Cannot load `{quantize}` weight, make sure the model is already quantized"
)
qzeros = torch.cat(
@@ -126,10 +193,14 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
scales = torch.cat(
[self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
)
- w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
- for w2 in w[1:]:
- torch.testing.assert_close(w2, w[0])
- g_idx = w[0]
+
+ if quantize == "gptq":
+ w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
+ for w2 in w[1:]:
+ torch.testing.assert_close(w2, w[0])
+ g_idx = w[0]
+ else:
+ g_idx = None
bits, groupsize = self._get_gptq_params()
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
@@ -138,6 +209,22 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
weight = torch.cat(w, dim=dim)
return weight
+ def get_tensor_shard(self, var, dim):
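+        # Shards an already materialized tensor across the process group along `dim` (0 or 1)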
+ world_size = self.process_group.size()
+ rank = self.process_group.rank()
+ block_size = var.size()[dim] // world_size
+ start = rank * block_size
+ stop = (rank + 1) * block_size
+ if dim == 0:
+ tensor = var[start:stop]
+ elif dim == 1:
+ tensor = var[:, start:stop]
+ else:
+ raise NotImplementedError("Let's make that generic when needed")
+ tensor = tensor.to(dtype=self.dtype)
+ tensor = tensor.to(device=self.device)
+ return tensor
+
def get_multi_weights_row(self, prefix: str, quantize: str):
if quantize == "gptq":
use_exllama = True
@@ -173,10 +260,11 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
if use_exllama:
- if not HAS_EXLLAMA and CAN_EXLLAMA:
- logger.warning(
- "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
- )
+ if not HAS_EXLLAMA:
+ if CAN_EXLLAMA:
+ logger.warning(
+ "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
+ )
use_exllama = False
else:
logger.info("Using exllama kernels")
@@ -203,6 +291,22 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
scales = self.get_tensor(f"{prefix}.scales")
g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
+ weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
+ elif quantize == "awq":
+ bits, groupsize = self._get_gptq_params()
+
+ try:
+ qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
+ except RuntimeError:
+ raise RuntimeError(
+ "Cannot load `awq` weight, make sure the model is already quantized"
+ )
+
+ qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
+ scales = self.get_sharded(f"{prefix}.scales", dim=0)
+ g_idx = None
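+ # AWQ provides no g_idx and does not use the exllama GPTQ kernels.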
+ use_exllama = False
+
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
else:
weight = self.get_sharded(f"{prefix}.weight", dim=1)
@@ -222,7 +326,7 @@ def _get_gptq_params(self) -> Tuple[int, int]:
return bits, groupsize
def _set_gptq_params(self, model_id):
- filename = "quantize_config.json"
+ filename = "config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
@@ -230,7 +334,29 @@ def _set_gptq_params(self, model_id):
filename = hf_hub_download(model_id, filename=filename)
with open(filename, "r") as f:
data = json.load(f)
- self.gptq_bits = data["bits"]
- self.gptq_groupsize = data["group_size"]
+ self.gptq_bits = data["quantization_config"]["bits"]
+ self.gptq_groupsize = data["quantization_config"]["group_size"]
except Exception:
- pass
+ filename = "quantize_config.json"
+ try:
+ if os.path.exists(os.path.join(model_id, filename)):
+ filename = os.path.join(model_id, filename)
+ else:
+ filename = hf_hub_download(model_id, filename=filename)
+ with open(filename, "r") as f:
+ data = json.load(f)
+ self.gptq_bits = data["bits"]
+ self.gptq_groupsize = data["group_size"]
+ except Exception:
+ filename = "quant_config.json"
+ try:
+ if os.path.exists(os.path.join(model_id, filename)):
+ filename = os.path.join(model_id, filename)
+ else:
+ filename = hf_hub_download(model_id, filename=filename)
+ with open(filename, "r") as f:
+ data = json.load(f)
+ self.gptq_bits = data["w_bit"]
+ self.gptq_groupsize = data["q_group_size"]
+ except Exception:
+ pass
diff --git a/update_doc.py b/update_doc.py
new file mode 100644
index 00000000000..81e6a94ef5e
--- /dev/null
+++ b/update_doc.py
@@ -0,0 +1,38 @@
+import subprocess
+import argparse
+
+
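+ # Regenerates docs/source/basic_tutorials/launcher.md from the output of
+ # `text-generation-launcher --help`; with --check, only verify that the
+ # committed file is up-to-date and print a diff when it is not.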
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--check", action="store_true")
+
+ args = parser.parse_args()
+
+ output = subprocess.check_output(["text-generation-launcher", "--help"]).decode(
+ "utf-8"
+ )
+ wrap_code_blocks_flag = ""
+ final_doc = f"# Text-generation-launcher arguments\n\n{wrap_code_blocks_flag}\n\n```\n{output}\n```"
+
+ filename = "docs/source/basic_tutorials/launcher.md"
+ if args.check:
+ with open(filename, "r") as f:
+ doc = f.read()
+ if doc != final_doc:
+ tmp = "launcher.md"
+ with open(tmp, "w") as g:
+ g.write(final_doc)
+ diff = subprocess.run(
+ ["diff", tmp, filename], capture_output=True
+ ).stdout.decode("utf-8")
+ print(diff)
+ raise Exception(
+ "Doc is not up-to-date, run `python update_doc.py` in order to update it"
+ )
+ else:
+ with open(filename, "w") as f:
+ f.write(final_doc)
+
+
+if __name__ == "__main__":
+ main()