Skip to content

Commit

Permalink
feat: example how to transcribe youtube videos
Browse files Browse the repository at this point in the history
  • Loading branch information
daulet committed Aug 26, 2024
1 parent 668a366 commit c5d2f7a
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 1 deletion.
16 changes: 16 additions & 0 deletions examples/transcribe/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Transcribe YouTube videos

```
youtube-dl -f "worst" "https://www.youtube.com/watch?v=l8pRSuU81PU"
ffmpeg \
-i video.mp4 \
-ar 16000 \
-ac 1 \
-map 0:a: \
audio.mp3
# Split the audio into 15-minute chunks; with too many files we run into rate limits.
ffmpeg -i audio.mp3 -f segment -segment_time 900 -c copy intervals/out%03d.mp3
go run ./examples/transcribe ./intervals
```
109 changes: 109 additions & 0 deletions examples/transcribe/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package main

import (
"context"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"sync"
"time"

"github.com/daulet/cmd/config"
"github.com/daulet/cmd/provider"
)

// MAX_CONCURRENT_REQUESTS is the number of worker goroutines that run starts
// to transcribe files in parallel.
const MAX_CONCURRENT_REQUESTS = 10

// work is a single transcription job handed to a worker goroutine.
type work struct {
// idx is the position of file in the list of globbed files; it is copied
// into the matching result.
idx int
// file is the path of the audio file to transcribe.
file string
}

// result is the transcript produced for one work item.
type result struct {
// idx is the idx of the work item this transcript belongs to; it is used to
// store the transcript at the matching position in the output slice.
idx int
// content is the text returned by the provider's Transcribe call.
content string
}

// defaultRetryDelay is how long a worker waits before retrying when the
// provider's error does not say when to try again.
const defaultRetryDelay = time.Minute

// retryAfterRe extracts the suggested wait from error messages such as
// "Please try again in 6m6.125s." or "Please try again in 7.5s.".
// Group 1 is the optional minutes part, group 2 the seconds part.
// It is compiled once here rather than on every failed request.
var retryAfterRe = regexp.MustCompile(`Please try again in (?:(\d+)m)?(\d+(?:\.\d+)?)s\.`)

// retryDelay returns how long to wait before retrying after err. It uses the
// wait suggested in the error message when one is present and falls back to
// defaultRetryDelay otherwise.
func retryDelay(err error) time.Duration {
	m := retryAfterRe.FindStringSubmatch(err.Error())
	if m == nil {
		return defaultRetryDelay
	}
	minutes := 0
	if m[1] != "" {
		v, convErr := strconv.Atoi(m[1])
		if convErr != nil {
			return defaultRetryDelay
		}
		minutes = v
	}
	seconds, convErr := strconv.ParseFloat(m[2], 64)
	if convErr != nil {
		return defaultRetryDelay
	}
	// Scale before converting so that fractional seconds are kept;
	// time.Duration(seconds) alone would truncate 6.125 to 6.
	return time.Duration(minutes)*time.Minute + time.Duration(seconds*float64(time.Second))
}

// run transcribes every *.mp3 file in the directory named by the first
// command-line argument and prints the transcripts to stdout in file order.
//
// Files are processed by MAX_CONCURRENT_REQUESTS workers through a Groq
// provider (API key read from GROQ_API_KEY) wrapped in a cache stored at
// .cache/cache.json. A failed transcription is retried indefinitely, waiting
// for the delay computed by retryDelay between attempts.
//
// It returns an error if the directory argument is missing, the glob pattern
// is invalid, or the cache provider cannot be created.
func run() error {
	if len(os.Args) < 2 {
		return fmt.Errorf("Usage: transcribe <directory>")
	}
	dir := os.Args[1]

	files, err := filepath.Glob(filepath.Join(dir, "*.mp3"))
	if err != nil {
		return fmt.Errorf("failed to glob files: %w", err)
	}

	prov := provider.NewGroqProvider(os.Getenv("GROQ_API_KEY"))
	{
		cache, err := provider.NewCacheProvider(prov, ".cache/cache.json")
		if err != nil {
			return fmt.Errorf("failed to create cache provider: %w", err)
		}
		defer func() {
			// Report a failed Close instead of dropping its error silently.
			if err := cache.Close(); err != nil {
				log.Printf("failed to close cache: %v", err)
			}
		}()
		prov = cache
	}

	ctx := context.Background()
	var wg sync.WaitGroup
	resCh := make(chan *result)
	workCh := make(chan *work)
	for range MAX_CONCURRENT_REQUESTS {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range workCh {
				for {
					fmt.Println("transcribing", item.file)
					content, err := prov.Transcribe(ctx, &config.Config{}, item.file)
					if err == nil {
						resCh <- &result{idx: item.idx, content: content}
						break
					}
					// Surface the failure on stderr so an endless retry
					// (e.g. a bad API key) is visible to the user.
					log.Printf("transcribing %s: %v", item.file, err)
					waitTime := retryDelay(err)
					fmt.Printf("waiting for %s\n", waitTime)
					time.Sleep(waitTime)
				}
			}
		}()
	}

	// Feed the workers, then close workCh so they exit once it is drained.
	go func() {
		for idx, file := range files {
			workCh <- &work{idx: idx, file: file}
		}
		close(workCh)
	}()

	// Close resCh once every worker is done. This must happen concurrently
	// with the collection loop below: closing it only after the loop would
	// leave the loop waiting forever for a close that is never reached.
	go func() {
		wg.Wait()
		close(resCh)
	}()

	transcripts := make([]string, len(files))
	for res := range resCh {
		transcripts[res.idx] = res.content
	}

	for idx, transcript := range transcripts {
		fmt.Printf("%d: %s\n", idx, transcript)
	}
	return nil
}

// main is the program entry point: it invokes run and, if run reports an
// error, logs it and exits via log.Fatal.
func main() {
	err := run()
	if err == nil {
		return
	}
	log.Fatal(err)
}
2 changes: 1 addition & 1 deletion provider/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (c *cacheProvider) Close() error {
if err := os.MkdirAll(filepath.Dir(c.cachePath), 0755); err != nil {
return err
}
data, err := json.Marshal(c.c)
data, err := json.MarshalIndent(c.c, "", " ")
if err != nil {
return err
}
Expand Down

0 comments on commit c5d2f7a

Please sign in to comment.