Skip to content

Commit

Permalink
delete --kv-cache-storage arg.
Browse files Browse the repository at this point in the history
  • Loading branch information
b4rtaz committed Nov 18, 2024
1 parent c61ac19 commit 6e5824a
Show file tree
Hide file tree
Showing 8 changed files with 6 additions and 51 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Python 3 and C++ compiler required. The command will download the model and the

### 🛠️ Convert Model Manually

Supported architectures: Llama, Mixtral, Grok
Supported architectures: Llama, Mixtral

* [How to Convert Llama 2, Llama 3, Llama 3.1](./docs/LLAMA.md)
* [How to Convert Hugging Face Model](./docs/HUGGINGFACE.md)
Expand Down
5 changes: 0 additions & 5 deletions src/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ AppArgs AppArgs::parse(int argc, char** argv, bool hasMode) {
args.chatTemplateType = TEMPLATE_UNKNOWN;
args.maxSeqLen = 0;
args.packetAlignment = 0;
args.useDiscForKvCache = false;

int i = 1;
if (hasMode && argc > 1) {
args.mode = argv[1];
Expand Down Expand Up @@ -103,8 +101,6 @@ AppArgs AppArgs::parse(int argc, char** argv, bool hasMode) {
args.chatTemplateType = parseChatTemplateType(value);
} else if (strcmp(name, "--max-seq-len") == 0) {
args.maxSeqLen = (unsigned int)atoi(value);
} else if (strcmp(name, "--kv-cache-storage") == 0) {
args.useDiscForKvCache = strcmp(value, "disc") == 0;
} else if (strcmp(name, "--packet-alignment") == 0) {
args.packetAlignment = (size_t)atoi(value);
} else {
Expand Down Expand Up @@ -142,7 +138,6 @@ void App::run(AppArgs* args, void (*program)(Inference* inference, SocketPool* s
}

TransformerConfig config;
config.useDiscForKvCache = args->useDiscForKvCache;

Transformer transformer = Transformer::loadRootFromFile(args->modelPath, &spec, &config, socketPool);
socketPool->setTurbo(true);
Expand Down
1 change: 0 additions & 1 deletion src/app.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ class AppArgs {
public:
char* mode;
int nThreads;
bool useDiscForKvCache;
size_t packetAlignment;

// inference
Expand Down
1 change: 0 additions & 1 deletion src/apps/dllama/dllama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,6 @@ void worker(AppArgs* args) {
}

TransformerConfig config;
config.useDiscForKvCache = args->useDiscForKvCache;

SocketPool* socketPool = SocketPool::serve(args->port);
TransformerSpec spec;
Expand Down
18 changes: 4 additions & 14 deletions src/transformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,13 +310,8 @@ TransformerBlock::TransformerBlock(TransformerSpec* spec, TransformerConfig* con
#endif

kvCacheSlice = new KvCacheSlice(spec->kvDim, spec->seqLen, spec->nSlices);
if (config->useDiscForKvCache) {
keyCache = (float*)newMmapFileBuffer(sliceIndex, kvCacheSlice->keyCacheSize);
valueCache = (float*)newMmapFileBuffer(sliceIndex, kvCacheSlice->valueCacheSize);
} else {
keyCache = (float*)newBuffer(kvCacheSlice->keyCacheSize);
valueCache = (float*)newBuffer(kvCacheSlice->valueCacheSize);
}
keyCache = (float*)newBuffer(kvCacheSlice->keyCacheSize);
valueCache = (float*)newBuffer(kvCacheSlice->valueCacheSize);

multiHeadAttSlice = new MultiHeadAttSlice(spec->nHeads, spec->seqLen, spec->nSlices, sliceIndex);
att = (float*)newBuffer(multiHeadAttSlice->attSize);
Expand Down Expand Up @@ -376,13 +371,8 @@ TransformerBlock::~TransformerBlock() {
#endif

delete kvCacheSlice;
if (config->useDiscForKvCache) {
freeMmapFileBuffer(keyCache);
freeMmapFileBuffer(valueCache);
} else {
freeBuffer(keyCache);
freeBuffer(valueCache);
}
freeBuffer(keyCache);
freeBuffer(valueCache);
delete multiHeadAttSlice;
freeBuffer(att);

Expand Down
2 changes: 1 addition & 1 deletion src/transformer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ struct TransformerSpec {
};

// Runtime options handed to Transformer::loadRootFromFile alongside the
// TransformerSpec (callers build one on the stack and pass its address).
struct TransformerConfig {
// When true, TransformerBlock allocates its key/value caches with
// newMmapFileBuffer (file-backed mmap) instead of newBuffer. Populated from
// AppArgs::useDiscForKvCache, i.e. `--kv-cache-storage disc`.
bool useDiscForKvCache;
// Not read anywhere in the visible code; presumably a placeholder so the
// struct keeps a member once the flag above is dropped — TODO confirm.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation in C++; consider a different name.
short __unused__;
};

class TransformerBlock {
Expand Down
25 changes: 0 additions & 25 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,31 +45,6 @@ void freeBuffer(void* buffer) {
#endif
}

// Process-wide counter appended to every mmap buffer file name so that
// successive buffers created by the same instance get distinct files.
unsigned int lastMmapFileBufferIndex = 0;

/**
 * Allocates a buffer backed by a file in the current working directory.
 *
 * The file is named "mmap-buffer-<appInstanceId>-<index>.temp", where <index>
 * is taken from (and then increments) lastMmapFileBufferIndex. The file is
 * created if missing, resized to `size` bytes and mapped read/write with
 * MAP_SHARED. The backing file is left on disc.
 *
 * @param appInstanceId  Identifier embedded in the file name.
 * @param size           Length of the mapping in bytes.
 * @return Address of the mapped region.
 * @throws std::runtime_error* (thrown by pointer, matching the existing
 *         convention callers rely on) when the file cannot be created,
 *         resized or mapped; always thrown on Windows.
 */
void* newMmapFileBuffer(unsigned int appInstanceId, size_t size) {
#ifdef _WIN32
    throw new std::runtime_error("Mmap file buffer is not supported on Windows yet");
#else
    char path[256];
    // Both arguments are unsigned, so use %u rather than %d.
    snprintf(path, sizeof(path), "mmap-buffer-%u-%u.temp", appInstanceId, lastMmapFileBufferIndex++);
    int fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
    if (fd == -1)
        throw new std::runtime_error("Cannot create mmap buffer file");
    if (ftruncate(fd, size) == -1) {
        // Release the descriptor before reporting the failure; otherwise it leaks.
        close(fd);
        throw new std::runtime_error("Cannot truncate mmap buffer file. Not enough disk space?");
    }
    void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    // The mapping (if any) stays valid after the descriptor is closed, so the
    // descriptor is closed on both the success and the failure path.
    close(fd);
    if (addr == MAP_FAILED)
        throw new std::runtime_error("Cannot mmap buffer file");
    return addr;
#endif
}

// Counterpart of newMmapFileBuffer. Currently a no-op: the region at `addr`
// is neither unmapped nor is its backing file removed, so both outlive this
// call.
void freeMmapFileBuffer(void* addr) {
// TODO: release the mapping. munmap() needs the mapping length, which this
// signature does not receive, so the size would have to be tracked or passed.
}

unsigned long timeMs() {
struct timeval te;
gettimeofday(&te, NULL);
Expand Down
3 changes: 0 additions & 3 deletions src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@
void* newBuffer(size_t size);
void freeBuffer(void* buffer);

// Creates (or reuses) the file "mmap-buffer-<appInstanceId>-<n>.temp" in the
// working directory, resizes it to `size` bytes and returns a read/write
// MAP_SHARED mapping of it. Throws a std::runtime_error pointer on failure,
// and unconditionally on Windows.
void* newMmapFileBuffer(unsigned int appInstanceId, size_t size);
// Intended to release a buffer obtained from newMmapFileBuffer; the current
// implementation is an empty TODO and does nothing.
void freeMmapFileBuffer(void* addr);

unsigned long timeMs();
unsigned int randomU32(unsigned long long *state);
float randomF32(unsigned long long *state);
Expand Down

0 comments on commit 6e5824a

Please sign in to comment.