diff --git a/common/arg.cpp b/common/arg.cpp index a35792a610556..96c7f3879c926 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2215,6 +2215,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; params.vocoder.hf_repo = "ggml-org/WavTokenizer"; params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + params.ctx_shift = false; // for better results } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); diff --git a/examples/server/public_tts/index.html b/examples/server/public_tts/index.html index 10beafd5e64c3..3431cffa91f12 100644 --- a/examples/server/public_tts/index.html +++ b/examples/server/public_tts/index.html @@ -6,7 +6,7 @@ llama.cpp TTS @@ -16,10 +16,15 @@

llama.cpp TTS

Input text:

- +
+
+

Status: ready


+

diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 14dae741ee2b2..d99d100ca99b3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -942,6 +942,7 @@ struct server_task_result_embd : server_task_result { struct server_task_result_tts_embd : server_task_result { int index = 0; std::vector embd; + double t_ms = 0.0; virtual int get_index() override { return index; // unused @@ -1749,11 +1750,15 @@ struct server_context { if (!params.vocoder.model.empty()) { common_params v_params = params_base; - v_params.model = params.vocoder.model; - v_params.model_url = params.vocoder.model_url; - v_params.hf_repo = params.vocoder.hf_repo; - v_params.hf_file = params.vocoder.hf_file; - v_params.embedding = true; + v_params.model = params.vocoder.model; + v_params.model_url = params.vocoder.model_url; + v_params.hf_repo = params.vocoder.hf_repo; + v_params.hf_file = params.vocoder.hf_file; + v_params.embedding = true; + v_params.pooling_type = LLAMA_POOLING_TYPE_NONE; + // make sure the vocoder has the sufficient batch size + v_params.n_batch = v_params.n_ctx; + v_params.n_ubatch = v_params.n_ctx; llama_init_vocoder = common_init_from_params(v_params); } @@ -2606,9 +2611,18 @@ struct server_context { } break; case SERVER_TASK_TYPE_TTS_EMBD: { + const auto ctx_cts = llama_init_vocoder.context.get(); + const int n_ubatch = llama_n_ubatch(ctx_cts); + const int n_codes = (int) task.prompt_tokens.size(); + if (n_codes > n_ubatch) { + send_error(task, string_format("Number of codes (%d) exceeds the maximum ubatch of vocoder model (%d)", n_codes, n_ubatch), ERROR_TYPE_INVALID_REQUEST); + break; + } + std::vector embd; - SRV_DBG("tts_get_embd with %d tokens", (int) task.prompt_tokens.size()); - int status = tts_get_embd(llama_init_vocoder.context.get(), task.prompt_tokens, embd); + uint64_t t_start = ggml_time_us(); + SRV_DBG("tts_get_embd with %d codes", n_codes); + int status = tts_get_embd(ctx_cts, task.prompt_tokens, embd); if (status != 0) { send_error(task, string_format("Failed to get TTS embedding, status code = %d", status), ERROR_TYPE_SERVER); break; @@ -2620,6 +2634,7 @@ struct server_context { auto res = std::make_unique(); res->id = task.id; res->embd = std::move(embd); + res->t_ms = (ggml_time_us() - t_start) / 1e3; queue_results.send(std::move(res)); } break; } @@ -4149,8 +4164,9 @@ int main(int argc, char ** argv) { return; } - llama_tokens audio_tokens; - // convert text to audio token + llama_tokens codes; + result_timings ttc_timings; + // convert text to codes { server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); task.id = ctx_server.queue_tasks.get_new_id(); @@ -4174,32 +4190,35 @@ int main(int argc, char ** argv) { const server_task_result_cmpl_final * result = dynamic_cast(raw_result.get()); GGML_ASSERT(result != nullptr); GGML_ASSERT(!result->tokens.empty()); - audio_tokens = std::move(result->tokens); + codes = std::move(result->tokens); // debug - SRV_DBG("codes str (before filter) = %s\n", common_detokenize(ctx_server.ctx, audio_tokens, true).c_str()); + // SRV_DBG("codes str (before filter) = %s\n", common_detokenize(ctx_server.ctx, codes, true).c_str()); - // post-process audio tokens + // post-process codes // remove all non-audio tokens (i.e. < 151672 || > 155772) - audio_tokens.erase(std::remove_if( - audio_tokens.begin(), - audio_tokens.end(), + codes.erase(std::remove_if( + codes.begin(), + codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), - audio_tokens.end()); - SRV_DBG("codes size = %d\n", (int) audio_tokens.size()); + codes.end()); + SRV_DBG("codes size = %d\n", (int) codes.size()); + + ttc_timings = std::move(result->timings); } // debug - SRV_DBG("codes str = %s\n", common_detokenize(ctx_server.ctx, audio_tokens, true).c_str()); + // SRV_DBG("codes str = %s\n", common_detokenize(ctx_server.ctx, codes, true).c_str()); - // convert audio token to embeddings - int n_embd = llama_n_embd(ctx_server.model); + // convert codes to embeddings + int n_embd = llama_n_embd(ctx_server.llama_init_vocoder.model.get()); int n_codes = -1; + double t_voc_ms = 0.0; std::vector embd; { server_task task = server_task(SERVER_TASK_TYPE_TTS_EMBD); task.id = ctx_server.queue_tasks.get_new_id(); - task.prompt_tokens = std::move(audio_tokens); + task.prompt_tokens = std::move(codes); ctx_server.queue_results.add_waiting_tasks({task}); ctx_server.queue_tasks.post(task); @@ -4215,16 +4234,25 @@ int main(int argc, char ** argv) { GGML_ASSERT(!result->embd.empty()); // flatten the array - n_codes = result->embd.size() / n_embd; - embd = std::move(result->embd); - SRV_DBG("tts embd n_code = %d\n", n_codes); - SRV_DBG("tts embd size = %zu\n", embd.size()); + n_codes = result->embd.size() / n_embd; + embd = std::move(result->embd); + t_voc_ms = result->t_ms; + SRV_DBG("tts embd n_code = %d\n", n_codes); + SRV_DBG("tts embd size = %zu\n", embd.size()); + SRV_DBG("tts embd t_voc_ms = %lf\n", t_voc_ms); GGML_ASSERT(n_codes > 0); } // convert embeddings to wav // will be freed by chunked_content_provider + const auto t_spec_start = ggml_time_us(); std::vector audio = tts_embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads); + double t_spec_ms = (ggml_time_us() - t_spec_start) / 1e3; + + // for now, we can only leave timings in response headers, mostly for debugging + res.set_header("X-timings-ttc", ttc_timings.to_json().dump()); + res.set_header("X-timings-voc", (json{{ "t_voc_ms", t_voc_ms }}).dump()); + res.set_header("X-timings-spec", (json{{ "t_spec_ms", t_spec_ms }}).dump()); const auto chunked_content_provider = [audio = std::move(audio)](size_t, httplib::DataSink & sink) mutable { // TODO: some how reuse save_wav16 instead of duplicating the code here diff --git a/examples/tts/tts-impl.cpp b/examples/tts/tts-impl.cpp index e3d602123ab4a..50d3a15e9ed54 100644 --- a/examples/tts/tts-impl.cpp +++ b/examples/tts/tts-impl.cpp @@ -14,6 +14,7 @@ #include #include #include +#include // // Terminal utils diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 3de21c1e6ea6b..680a3cc60a6e7 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -346,28 +346,14 @@ int main(int argc, char ** argv) { LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size()); } - for (auto & token : codes) { - token -= 151672; - } - const auto t_voc_start = ggml_time_us(); - const int n_codes = codes.size(); - - llama_batch batch = llama_batch_init(n_codes, 0, 1); - - for (size_t i = 0; i < codes.size(); ++i) { - common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits? - } - GGML_ASSERT(batch.n_tokens == n_codes); - - if (llama_decode(ctx_cts, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + std::vector embd; + if (tts_get_embd(ctx_cts, codes, embd) != 0) { + LOG_ERR("%s: tts_get_embd() failed\n", __func__); return 1; } - llama_synchronize(ctx_cts); - LOG_INF("%s: time for vocoder: %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f); const auto t_spec_start = ggml_time_us(); @@ -375,10 +361,9 @@ int main(int argc, char ** argv) { #if 1 // spectral operations const int n_embd = llama_n_embd(model_cts); - const float * embd = llama_get_embeddings(ctx_cts); - - auto audio = tts_embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads); + const int n_codes = codes.size(); + auto audio = tts_embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads); #else // read the spectrogram from a file for debugging purposes std::vector audio;