Skip to content

Commit

Permalink
fix: refactor and move changes to v3 proto
Browse files: browse the repository at this point in the history
  • Loading branch information
drbh committed Jun 6, 2024
1 parent d103264 commit d0f1470
Show file tree
Hide file tree
Showing 9 changed files with 7 additions and 6 deletions.
2 changes: 0 additions & 2 deletions proto/generate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,6 @@ message Request {
bool prefill_logprobs = 6;
/// Return most likely n tokens
uint32 top_n_tokens = 7;
/// LORA adapter index
optional uint32 adapter_index = 8;
}

message Batch {
Expand Down
2 changes: 2 additions & 0 deletions proto/v3/generate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ message Request {
repeated uint32 blocks = 9;
/// Paged attention slots
repeated uint32 slots = 10;
/// LORA adapter index
optional uint32 adapter_index = 11;
}

message Batch {
Expand Down
1 change: 0 additions & 1 deletion router/client/src/v2/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ impl Client {
}),
prefill_logprobs: true,
top_n_tokens: 20,
adapter_index: None,
});
n_tokens += max_input_length;

Expand Down
1 change: 1 addition & 0 deletions router/client/src/v3/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ impl Client {
}),
prefill_logprobs: true,
top_n_tokens: 20,
adapter_index: None,
});
n_tokens += max_input_length;

Expand Down
1 change: 1 addition & 0 deletions router/client/src/v3/sharded_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ impl Health for ShardedClient {
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
adapter_index: None,
};
let batch = Batch {
id: u64::MAX,
Expand Down
1 change: 0 additions & 1 deletion router/src/infer/v2/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,6 @@ impl State {
entry.request.stopping_parameters.clone(),
)),
top_n_tokens: entry.request.top_n_tokens,
adapter_index: entry.request.adapter_index,
});
// Set batch_time
entry.batch_time = Some(Instant::now());
Expand Down
2 changes: 2 additions & 0 deletions router/src/infer/v3/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ impl State {
top_n_tokens: entry.request.top_n_tokens,
blocks,
slots,
adapter_index: entry.request.adapter_index,
});
// Set batch_time
entry.batch_time = Some(Instant::now());
Expand Down Expand Up @@ -491,6 +492,7 @@ mod tests {
stop_sequences: vec![],
},
top_n_tokens: 0,
adapter_index: None,
},
response_tx,
span: info_span!("entry"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
DOWN_PROJ = "down_proj"


def load_attention(config, prefix, weights):
def load_attention(config, prefix, weights, layer_id):
# Only defined in granite.
bias = getattr(config, "attention_bias", False)

Expand Down
1 change: 0 additions & 1 deletion server/text_generation_server/utils/weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path
from typing import List, Dict, Optional, Set, Tuple, Union
from safetensors import safe_open, SafetensorError
from safetensors.torch import load_file
import torch
from loguru import logger
from huggingface_hub import hf_hub_download
Expand Down

0 comments on commit d0f1470

Please sign in to comment.