From c8c7ccd31e1e760d216c9d2f2b17b0d984ed033b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 17 Jun 2024 16:40:44 +0200
Subject: [PATCH] Set maximum gRPC message receive size to 2GiB (#2075)

* Set maximum gRPC message receive size to 2GiB

The limit is now i32::MAX bytes (just under 2GiB). The previous default
of 4MiB is too small for multi-modal models.

* Update to Rust 1.79.0

* Fixup formatting to make PR pass
---
 .github/workflows/tests.yaml            |  6 +++---
 CODE_OF_CONDUCT.md                      |  2 +-
 CONTRIBUTING.md                         | 22 +++++++++++-----------
 Dockerfile                              |  2 +-
 Dockerfile_amd                          |  2 +-
 Dockerfile_intel                        |  2 +-
 benchmark/src/app.rs                    | 12 ++++++------
 benchmark/src/table.rs                  |  6 +++---
 benchmark/src/utils.rs                  |  2 +-
 rust-toolchain.toml                     |  6 +++---
 server/text_generation_server/server.py |  6 +++++-
 11 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 74479cc6c0b..83fff1967a3 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -33,9 +33,9 @@ jobs:
       - name: Install Rust
         uses: actions-rs/toolchain@v1
         with:
-          # Released on: 02 May, 2024
-          # https://releases.rs/docs/1.78.0/
-          toolchain: 1.78.0
+          # Released on: June 13, 2024
+          # https://releases.rs/docs/1.79.0/
+          toolchain: 1.79.0
           override: true
           components: rustfmt, clippy
       - name: Install Protoc
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index ef09fa1375a..b23f3150a5a 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -130,4 +130,4 @@ For answers to common questions about this code of conduct, see the FAQ at
 [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
 [Mozilla CoC]: https://github.com/mozilla/diversity
 [FAQ]: https://www.contributor-covenant.org/faq
-[translations]: https://www.contributor-covenant.org/translations
\ No newline at end of file
+[translations]: https://www.contributor-covenant.org/translations
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 39b57c194d9..d541e47f3dd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -55,10 +55,10 @@ feedback.
 The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter.
 
 Before you report an issue, we would really appreciate it if you could **make sure the bug was not
-already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the 
-library itself, and not your code. 
+already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the
+library itself, and not your code.
 
-Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so 
+Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so
 we can quickly resolve it:
 
 * Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies).
@@ -79,20 +79,20 @@ that in your issue report.
 
 If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe:
 
-1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it 
-   a feature related to something you need for a project? Is it something you worked on and think it could benefit 
+1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it
+   a feature related to something you need for a project? Is it something you worked on and think it could benefit
    the community?
 
    Whatever it is, we'd love to hear about it!
 
-2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better 
+2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better
    we'll be able to help you.
 3. Provide a *code snippet* that demonstrates the feature's usage.
 4. If the feature is related to a paper, please include a link.
 
 If your issue is well written we're already 80% of the way there by the time you create it.
 
-We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE) 
+We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE)
 to help you get started with your issue.
 
 ## Do you want to implement a new model?
@@ -107,14 +107,14 @@ If you are willing to contribute the model yourself, let us know so we can help
 
 ## Do you want to add documentation?
 
-We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know 
-how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be 
+We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know
+how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be
 happy to make the changes or help you make a contribution if you're interested!
 
 ## I want to become a maintainer of the project. How do I get there?
 
 TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have
 motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference
-service. 
+service.
 
-If you are such an individual (or organization), please reach out to us and let's collaborate.
\ No newline at end of file
+If you are such an individual (or organization), please reach out to us and let's collaborate.
diff --git a/Dockerfile b/Dockerfile
index 1462833934c..c93372a2f1d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
diff --git a/Dockerfile_amd b/Dockerfile_amd
index c79bc03c5b3..55da92046b7 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
diff --git a/Dockerfile_intel b/Dockerfile_intel
index cb0e84bb23c..35362fc91cf 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -1,4 +1,4 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
diff --git a/benchmark/src/app.rs b/benchmark/src/app.rs
index 48ac976a0c2..a0a9313a198 100644
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@@ -497,7 +497,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
                 "Lowest:  {:.2} {unit}",
                 data.iter()
                     .min_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
             ),
             Style::default().fg(Color::Reset),
         )]),
@@ -506,7 +506,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
                 "Highest: {:.2} {unit}",
                 data.iter()
                     .max_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
             ),
             Style::default().fg(Color::Reset),
         )]),
@@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>(
     let min_latency: f64 = *latency_iter
         .clone()
         .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let max_latency: f64 = *latency_iter
         .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let min_throughput: f64 = *throughput_iter
         .clone()
         .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let max_throughput: f64 = *throughput_iter
         .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
 
     // Char min max values
     let min_x = if zoom {
diff --git a/benchmark/src/table.rs b/benchmark/src/table.rs
index e18d7310a35..1585a25f4fc 100644
--- a/benchmark/src/table.rs
+++ b/benchmark/src/table.rs
@@ -156,17 +156,17 @@ fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
     let min = data
         .iter()
         .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let max = data
         .iter()
         .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     (average, *min, *max)
 }
 
 fn px(data: &[f64], p: u32) -> f64 {
     let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
-    *data.get(i).unwrap_or(&std::f64::NAN)
+    *data.get(i).unwrap_or(&f64::NAN)
 }
 
 fn format_value(value: f64, unit: &'static str) -> String {
diff --git a/benchmark/src/utils.rs b/benchmark/src/utils.rs
index d096d65510f..20469991c39 100644
--- a/benchmark/src/utils.rs
+++ b/benchmark/src/utils.rs
@@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f
         .iter()
         .map(|&p| {
             let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
-            (format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
+            (format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
         })
         .collect()
 }
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 507ee859411..8c77896e9e0 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-# Released on: 02 May, 2024
-# https://releases.rs/docs/1.78.0/
-channel = "1.78.0"
+# Released on: June 13, 2024
+# https://releases.rs/docs/1.79.0/
+channel = "1.79.0"
 components = ["rustfmt", "clippy"]
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 569b6925a0e..a0347cd8e73 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -240,7 +240,11 @@ async def serve_inner(
             interceptors=[
                 ExceptionInterceptor(),
                 UDSOpenTelemetryAioServerInterceptor(),
-            ]
+            ],
+            options=[
+                # Set the maximum possible message length: i32::MAX (2GiB - 1 byte)
+                ("grpc.max_receive_message_length", (1 << 31) - 1)
+            ],
         )
         generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
             TextGenerationService(model, Cache(), quantize, server_urls), server