-
Notifications
You must be signed in to change notification settings - Fork 795
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for
text-to-speech
(w/ Speecht5) (#345)
* Add vocoder to export * Add tokenizer.json export for speecht5 models * Update speecht5 supported models * Create `SpeechT5Tokenizer` * Add `ones` and `ones_like` tensor functions * Add support for speecht5 text-to-speech * Disambiguate `SpeechSeq2Seq` and `Seq2SeqLM` * Create `TextToAudioPipeline` * Add listed support for `text-to-audio` / `text-to-speech` * Use unquantized vocoder by default * Skip speecht5 unit tests for now Due to bug in transformers: huggingface/transformers#26547 * Update example pipeline output * Create simple in-browser TTS demo * Add template README * Delete package-lock.json * Update required transformers.js version * Add link to Transformers.js * Double -> Single quotes * Add link to text-to-speech demo * Update sample speaker embeddings
- Loading branch information
Showing
28 changed files
with
988 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
module.exports = { | ||
root: true, | ||
env: { browser: true, es2020: true }, | ||
extends: [ | ||
'eslint:recommended', | ||
'plugin:react/recommended', | ||
'plugin:react/jsx-runtime', | ||
'plugin:react-hooks/recommended', | ||
], | ||
ignorePatterns: ['dist', '.eslintrc.cjs'], | ||
parserOptions: { ecmaVersion: 'latest', sourceType: 'module' }, | ||
settings: { react: { version: '18.2' } }, | ||
plugins: ['react-refresh'], | ||
rules: { | ||
'react-refresh/only-export-components': [ | ||
'warn', | ||
{ allowConstantExport: true }, | ||
], | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Logs | ||
logs | ||
*.log | ||
npm-debug.log* | ||
yarn-debug.log* | ||
yarn-error.log* | ||
pnpm-debug.log* | ||
lerna-debug.log* | ||
|
||
node_modules | ||
dist | ||
dist-ssr | ||
*.local | ||
|
||
# Editor directories and files | ||
.vscode/* | ||
!.vscode/extensions.json | ||
.idea | ||
.DS_Store | ||
*.suo | ||
*.ntvs* | ||
*.njsproj | ||
*.sln | ||
*.sw? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# React + Vite | ||
|
||
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules. | ||
|
||
Currently, two official plugins are available: | ||
|
||
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh | ||
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta charset="UTF-8" /> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> | ||
<title>Transformers.js - Text-to-speech demo</title> | ||
</head> | ||
<body> | ||
<div id="root"></div> | ||
<script type="module" src="/src/main.jsx"></script> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
{ | ||
"name": "text-to-speech-client", | ||
"private": true, | ||
"version": "0.0.0", | ||
"type": "module", | ||
"scripts": { | ||
"dev": "vite", | ||
"build": "vite build", | ||
"lint": "eslint . --ext js,jsx --report-unused-disable-directives --max-warnings 0", | ||
"preview": "vite preview" | ||
}, | ||
"dependencies": { | ||
"@xenova/transformers": "^2.7.0", | ||
"react": "^18.2.0", | ||
"react-dom": "^18.2.0" | ||
}, | ||
"devDependencies": { | ||
"@types/react": "^18.2.15", | ||
"@types/react-dom": "^18.2.7", | ||
"@vitejs/plugin-react": "^4.0.3", | ||
"autoprefixer": "^10.4.16", | ||
"eslint": "^8.45.0", | ||
"eslint-plugin-react": "^7.32.2", | ||
"eslint-plugin-react-hooks": "^4.6.0", | ||
"eslint-plugin-react-refresh": "^0.4.3", | ||
"postcss": "^8.4.31", | ||
"tailwindcss": "^3.3.3", | ||
"vite": "^4.4.5" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
// PostCSS pipeline for the demo: Tailwind CSS first, then Autoprefixer,
// both with their default (empty) options.
const plugins = {
  tailwindcss: {},
  autoprefixer: {},
};

export default { plugins }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
import React, { useState, useEffect, useRef } from 'react'; | ||
|
||
import AudioPlayer from './components/AudioPlayer'; | ||
import Progress from './components/Progress'; | ||
import { SPEAKERS, DEFAULT_SPEAKER } from './constants'; | ||
|
||
const App = () => { | ||
|
||
// Model loading | ||
const [ready, setReady] = useState(null); | ||
const [disabled, setDisabled] = useState(false); | ||
const [progressItems, setProgressItems] = useState([]); | ||
|
||
// Inputs and outputs | ||
const [text, setText] = useState('I love Hugging Face!'); | ||
const [selectedSpeaker, setSelectedSpeaker] = useState(DEFAULT_SPEAKER); | ||
const [output, setOutput] = useState(null); | ||
|
||
// Create a reference to the worker object. | ||
const worker = useRef(null); | ||
|
||
// We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted. | ||
useEffect(() => { | ||
if (!worker.current) { | ||
// Create the worker if it does not yet exist. | ||
worker.current = new Worker(new URL('./worker.js', import.meta.url), { | ||
type: 'module' | ||
}); | ||
} | ||
|
||
// Create a callback function for messages from the worker thread. | ||
const onMessageReceived = (e) => { | ||
switch (e.data.status) { | ||
case 'initiate': | ||
// Model file start load: add a new progress item to the list. | ||
setReady(false); | ||
setProgressItems(prev => [...prev, e.data]); | ||
break; | ||
|
||
case 'progress': | ||
// Model file progress: update one of the progress items. | ||
setProgressItems( | ||
prev => prev.map(item => { | ||
if (item.file === e.data.file) { | ||
return { ...item, progress: e.data.progress } | ||
} | ||
return item; | ||
}) | ||
); | ||
break; | ||
|
||
case 'done': | ||
// Model file loaded: remove the progress item from the list. | ||
setProgressItems( | ||
prev => prev.filter(item => item.file !== e.data.file) | ||
); | ||
break; | ||
|
||
case 'ready': | ||
// Pipeline ready: the worker is ready to accept messages. | ||
setReady(true); | ||
break; | ||
|
||
case 'complete': | ||
// Generation complete: re-enable the "Translate" button | ||
setDisabled(false); | ||
|
||
const blobUrl = URL.createObjectURL(e.data.output); | ||
setOutput(blobUrl); | ||
break; | ||
} | ||
}; | ||
|
||
// Attach the callback function as an event listener. | ||
worker.current.addEventListener('message', onMessageReceived); | ||
|
||
// Define a cleanup function for when the component is unmounted. | ||
return () => worker.current.removeEventListener('message', onMessageReceived); | ||
}); | ||
|
||
|
||
const handleGenerateSpeech = () => { | ||
setDisabled(true); | ||
worker.current.postMessage({ | ||
text, | ||
speaker_id: selectedSpeaker, | ||
}); | ||
}; | ||
|
||
const isLoading = ready === false; | ||
return ( | ||
<div className='min-h-screen flex items-center justify-center bg-gray-100'> | ||
<div className='absolute gap-1 z-50 top-0 left-0 w-full h-full transition-all px-8 flex flex-col justify-center text-center' style={{ | ||
opacity: isLoading ? 1 : 0, | ||
pointerEvents: isLoading ? 'all' : 'none', | ||
background: 'rgba(0, 0, 0, 0.9)', | ||
backdropFilter: 'blur(8px)', | ||
}}> | ||
{isLoading && ( | ||
<label className='text-white text-xl p-3'>Loading models... (only run once)</label> | ||
)} | ||
{progressItems.map(data => ( | ||
<div key={`${data.name}/${data.file}`}> | ||
<Progress text={`${data.name}/${data.file}`} percentage={data.progress} /> | ||
</div> | ||
))} | ||
</div> | ||
<div className='bg-white p-8 rounded-lg shadow-lg w-full max-w-xl m-2'> | ||
<h1 className='text-3xl font-semibold text-gray-800 mb-1 text-center'>In-browser Text to Speech</h1> | ||
<h2 className='text-base font-medium text-gray-700 mb-2 text-center'>Made with <a href='https://huggingface.co/docs/transformers.js'>🤗 Transformers.js</a></h2> | ||
<div className='mb-4'> | ||
<label htmlFor='text' className='block text-sm font-medium text-gray-600'> | ||
Text | ||
</label> | ||
<textarea | ||
id='text' | ||
className='border border-gray-300 rounded-md p-2 w-full' | ||
rows='4' | ||
placeholder='Enter text here' | ||
value={text} | ||
onChange={(e) => setText(e.target.value)} | ||
></textarea> | ||
</div> | ||
<div className='mb-4'> | ||
<label htmlFor='speaker' className='block text-sm font-medium text-gray-600'> | ||
Speaker | ||
</label> | ||
<select | ||
id='speaker' | ||
className='border border-gray-300 rounded-md p-2 w-full' | ||
value={selectedSpeaker} | ||
onChange={(e) => setSelectedSpeaker(e.target.value)} | ||
> | ||
{Object.entries(SPEAKERS).map(([key, value]) => ( | ||
<option key={key} value={value}> | ||
{key} | ||
</option> | ||
))} | ||
</select> | ||
</div> | ||
<div className='flex justify-center'> | ||
<button | ||
className={`${disabled | ||
? 'bg-gray-400 cursor-not-allowed' | ||
: 'bg-blue-500 hover:bg-blue-600' | ||
} text-white rounded-md py-2 px-4`} | ||
onClick={handleGenerateSpeech} | ||
disabled={disabled} | ||
> | ||
{disabled ? 'Generating...' : 'Generate'} | ||
</button> | ||
</div> | ||
{output && <AudioPlayer | ||
audioUrl={output} | ||
mimeType={'audio/wav'} | ||
/>} | ||
</div> | ||
</div> | ||
); | ||
}; | ||
|
||
export default App; |
26 changes: 26 additions & 0 deletions
26
examples/text-to-speech-client/src/components/AudioPlayer.jsx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import { useEffect, useRef } from "react"; | ||
|
||
export default function AudioPlayer({ audioUrl, mimeType }) { | ||
const audioPlayer = useRef(null); | ||
const audioSource = useRef(null); | ||
|
||
// Updates src when url changes | ||
useEffect(() => { | ||
if (audioPlayer.current && audioSource.current) { | ||
audioSource.current.src = audioUrl; | ||
audioPlayer.current.load(); | ||
} | ||
}, [audioUrl]); | ||
|
||
return ( | ||
<div className='flex relative z-10 my-4 w-full'> | ||
<audio | ||
ref={audioPlayer} | ||
controls | ||
className='w-full h-14 rounded-lg bg-white shadow-xl shadow-black/5 ring-1 ring-slate-700/10' | ||
> | ||
<source ref={audioSource} type={mimeType}></source> | ||
</audio> | ||
</div> | ||
); | ||
} |
12 changes: 12 additions & 0 deletions
12
examples/text-to-speech-client/src/components/Progress.jsx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
|
||
export default function Progress({ text, percentage }) { | ||
percentage ??= 0; | ||
return ( | ||
<div className="relative text-black bg-white rounded-lg text-left overflow-hidden"> | ||
<div className='px-2 w-[1%] h-full bg-blue-500 whitespace-nowrap' style={{ width: `${percentage}%` }}> | ||
{text} ({`${percentage.toFixed(2)}%`}) | ||
</div> | ||
</div> | ||
); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
/**
 * Speaker voices offered by the demo, keyed by the label shown in the
 * speaker dropdown. Each value is the speaker id string that the app sends
 * to the worker as `speaker_id`.
 *
 * Frozen because it is a shared lookup table that should not be mutated.
 */
export const SPEAKERS = Object.freeze({
  "US female 1": "cmu_us_slt_arctic-wav-arctic_a0001",
  "US female 2": "cmu_us_clb_arctic-wav-arctic_a0001",
  "US male 1": "cmu_us_bdl_arctic-wav-arctic_a0003",
  "US male 2": "cmu_us_rms_arctic-wav-arctic_a0003",
  "Canadian male": "cmu_us_jmk_arctic-wav-arctic_a0002",
  "Scottish male": "cmu_us_awb_arctic-wav-arctic_b0002",
  "Indian male": "cmu_us_ksp_arctic-wav-arctic_a0007",
});

/**
 * Speaker id selected when the app first loads. Looked up from `SPEAKERS`
 * so it cannot drift from the table if an id is ever updated.
 */
export const DEFAULT_SPEAKER = SPEAKERS["US female 1"];
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
@tailwind base; | ||
@tailwind components; | ||
@tailwind utilities; | ||
|
||
:root { | ||
font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif; | ||
line-height: 1.5; | ||
font-weight: 400; | ||
color: #213547; | ||
background-color: #ffffff; | ||
|
||
font-synthesis: none; | ||
text-rendering: optimizeLegibility; | ||
-webkit-font-smoothing: antialiased; | ||
-moz-osx-font-smoothing: grayscale; | ||
-webkit-text-size-adjust: 100%; | ||
} | ||
|
||
audio::-webkit-media-controls-panel { | ||
background-color: white; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import React from 'react' | ||
import ReactDOM from 'react-dom/client' | ||
import App from './App.jsx' | ||
import './index.css' | ||
|
||
ReactDOM.createRoot(document.getElementById('root')).render( | ||
<React.StrictMode> | ||
<App /> | ||
</React.StrictMode>, | ||
) |
Oops, something went wrong.