diff --git a/example/apiserver/apiserver.cpp b/example/apiserver/apiserver.cpp index 145ae967..70645452 100644 --- a/example/apiserver/apiserver.cpp +++ b/example/apiserver/apiserver.cpp @@ -252,7 +252,11 @@ struct WorkQueue { while (true) { std::unique_lock lock(ts->locker); if (ts->activateQueryNumber >= ts->maxActivateQueryNumber) { +#ifdef WIN32 + Sleep(0); +#else sleep(0); +#endif continue; } if (ts->q.empty()) { @@ -435,7 +439,11 @@ int main(int argc, char** argv) { buff[size] = 0; while (workQueue.q.size() > workQueue.maxActivateQueryNumber) { - sleep(0); +#ifdef WIN32 + Sleep(0); +#else + sleep(0); +#endif } workQueue.Push(buff, client); } diff --git a/src/models/glm.cpp b/src/models/glm.cpp index a5481698..0833687a 100644 --- a/src/models/glm.cpp +++ b/src/models/glm.cpp @@ -78,7 +78,7 @@ namespace fastllm { Data mlpOutput; Data middle, middle2; Data toSave; - Data mem2; + Data mem2,mem3; std::vector lastRet; // GLMBlock std::string weightPre, weightMiddle; @@ -131,8 +131,8 @@ namespace fastllm { Split(qkv, -1, per * 2, per * 3, v); }else{ LayerNorm(mem, weight[inputLNWeightName], weight[inputLNBiasName], -1, mem2); - CatDirect(mem2,attenInput,1); - Linear(mem2, weight[qkvWeightName], weight[qkvBiasName], qkv); + Cat(mem2,attenInput,1,mem3); + Linear(mem3, weight[qkvWeightName], weight[qkvBiasName], qkv); int per = qkv.dims.back() / 3; Split(qkv, -1, 0, per, q0); Split(qkv, -1, per, per * 2, k); @@ -296,10 +296,8 @@ namespace fastllm { for(unsigned int i=0;i(); weight.tokenizer.spProcessor->LoadFromSerializedProto(decoded); - printf("GetPieceSize=%d\n",weight.tokenizer.spProcessor->GetPieceSize()); } } #endif