// Token is one lexical element of a note: its classification and the exact
// piece of the input it covers.
type Token struct {
	Type TokenType
	Text string
}

// Token types produced by Lexer.
var (
	TokenTypeText      = TokenType{"text"}
	TokenTypeLink      = TokenType{"link"}
	TokenTypeNostrLink = TokenType{"nostrLink"}
)

// TokenType identifies the kind of a Token. Values are comparable with ==.
type TokenType struct {
	s string
}

// Lexer splits note content into text, http(s) link and nostr link tokens.
//
// The input string is never modified or copied; the lexer only moves two byte
// offsets over it, so lexing is linear in the length of the input.
type Lexer struct {
	in     string  // complete input
	start  int     // byte offset at which the pending (not yet emitted) token begins
	pos    int     // byte offset of the next unread rune
	tokens []Token // tokens emitted so far
}

// NewLexer returns a Lexer for s. The input is validated by Lex.
func NewLexer(s string) *Lexer {
	return &Lexer{in: s}
}

// Lex tokenizes the input and returns the resulting tokens. It returns an
// error if the input is not valid UTF-8. Empty input yields no tokens.
func (l *Lexer) Lex() ([]Token, error) {
	if !utf8.ValidString(l.in) {
		return nil, errors.New("invalid utf-8")
	}

	for state := stateFn(stateText); state != nil; {
		state = state(l)
	}

	return l.tokens, nil
}

// next consumes and returns the next rune. The boolean is false at the end of
// the input.
func (l *Lexer) next() (rune, bool) {
	if l.pos >= len(l.in) {
		return 0, false
	}
	r, size := utf8.DecodeRuneInString(l.in[l.pos:])
	if r == utf8.RuneError && size == 1 {
		panic("invalid utf-8") // Lex validates the input before running any state
	}
	l.pos += size
	return r, true
}

// peek returns the next rune without consuming it.
func (l *Lexer) peek() (rune, bool) {
	mark := l.pos
	r, ok := l.next()
	l.pos = mark
	return r, ok
}

// comesNext reports whether the unread input starts with s. Nothing is
// consumed.
func (l *Lexer) comesNext(s string) bool {
	return strings.HasPrefix(l.in[l.pos:], s)
}

// consume advances past s if the unread input starts with it and reports
// whether it did. On a mismatch the position is left untouched.
func (l *Lexer) consume(s string) bool {
	if !l.comesNext(s) {
		return false
	}
	l.pos += len(s)
	return true
}

// emit appends the pending input as a token of the given type and starts a new
// pending token. Nothing is appended if no input is pending.
func (l *Lexer) emit(typ TokenType) {
	if l.pos > l.start {
		l.tokens = append(l.tokens, Token{Type: typ, Text: l.in[l.start:l.pos]})
	}
	l.start = l.pos
}

// destroyPrefix drops prefix from the front of the pending token, if present.
func (l *Lexer) destroyPrefix(prefix string) {
	if strings.HasPrefix(l.in[l.start:l.pos], prefix) {
		l.start += len(prefix)
	}
}

// beginAfter flushes the pending input as text, consumes prefix (which the
// caller has just verified with comesNext) and hands over to next.
func (l *Lexer) beginAfter(prefix string, next stateFn) stateFn {
	l.emit(TokenTypeText)
	l.pos += len(prefix)
	return next
}

// stateFn is one state of the lexer. It returns the state to run next, or nil
// when lexing is finished.
type stateFn func(l *Lexer) stateFn

const (
	httpColonSlashSlash  = "http://"
	httpsColonSlashSlash = "https://"
	nostrColon           = "nostr:"
	nevent               = "nevent"
	npub                 = "npub"
)

// stateText accumulates plain text until a link protocol, the "nostr:" scheme
// or a raw nevent/npub prefix comes next, or until the input ends.
func stateText(l *Lexer) stateFn {
	for {
		switch {
		case l.comesNext(httpColonSlashSlash):
			return l.beginAfter(httpColonSlashSlash, stateLinkAddress)
		case l.comesNext(httpsColonSlashSlash):
			return l.beginAfter(httpsColonSlashSlash, stateLinkAddress)
		case l.comesNext(nostrColon):
			return l.beginAfter(nostrColon, stateNostrLinkType)
		case l.comesNext(nevent), l.comesNext(npub):
			// The type prefix is part of the link, stateNostrLinkType consumes it.
			return l.beginAfter("", stateNostrLinkType)
		}

		if _, ok := l.next(); !ok {
			l.emit(TokenTypeText)
			return nil
		}
	}
}

// stateLinkAddress consumes the address that follows an already consumed
// "http://" or "https://".
//
// A dot belongs to the address only when a valid link character follows it, so
// a sentence-ending dot is left to the surrounding text both in the middle and
// at the very end of the input. A protocol that is not followed by any address
// character is emitted as text rather than as a link.
func stateLinkAddress(l *Lexer) stateFn {
	addressStart := l.pos
	for {
		mark := l.pos
		r, ok := l.next()
		if ok && l.continuesLink(r) {
			continue
		}

		// r (if any) is not part of the link, leave it for the next state.
		l.pos = mark
		if l.pos == addressStart {
			l.emit(TokenTypeText)
		} else {
			l.emit(TokenTypeLink)
		}

		if !ok {
			return nil
		}
		return stateText
	}
}

// continuesLink reports whether r, which has just been consumed, is part of
// the link address being read.
func (l *Lexer) continuesLink(r rune) bool {
	if isValidLinkCharacterExcludingDot(r) {
		return true
	}
	if r != '.' {
		return false
	}
	following, ok := l.peek()
	return ok && isValidLinkCharacterExcludingDot(following)
}

// stateNostrLinkType consumes the entity type ("nevent" or "npub"). If neither
// comes next, whatever is pending (possibly "nostr:") continues as plain text.
func stateNostrLinkType(l *Lexer) stateFn {
	if !l.consume(nevent) && !l.consume(npub) {
		return stateText
	}
	return stateNostrLinkData
}

// stateNostrLinkData consumes the bech32 characters following the entity type.
// Without any data the pending input is emitted as text; otherwise a nostr
// link is emitted with the "nostr:" scheme stripped.
func stateNostrLinkData(l *Lexer) stateFn {
	dataStart := l.pos
	for {
		mark := l.pos
		r, ok := l.next()
		if ok && isBech32(r) {
			continue
		}

		l.pos = mark
		if l.pos == dataStart {
			l.emit(TokenTypeText)
		} else {
			l.destroyPrefix(nostrColon)
			l.emit(TokenTypeNostrLink)
		}
		return stateText
	}
}

// isValidLinkCharacterExcludingDot reports whether r may appear in a link
// address. Dots are handled separately by the caller.
func isValidLinkCharacterExcludingDot(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsNumber(r) || r == '%'
}

// isBech32 reports whether r is an ASCII letter or digit other than 'b', 'i'
// and 'o' (in either case).
//
// Case folding is deliberately limited to ASCII: unicode.ToLower maps some
// non-ASCII runes onto ASCII letters (U+212A KELVIN SIGN becomes 'k'), which
// would let them slip into a link.
func isBech32(r rune) bool {
	if r >= 'A' && r <= 'Z' {
		r += 'a' - 'A'
	}
	switch r {
	case 'b', 'i', 'o':
		return false
	}
	return (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
}
"github.com/planetary-social/nos-crossposting-service/service/domain/content" + "github.com/stretchr/testify/require" +) + +func TestLexer(t *testing.T) { + testCases := []struct { + Name string + In string + Out []content.Token + }{ + { + Name: "complex note", + In: `This is a nostr note which contains a link: https://example.com. It also has a nostr:nevent1qqs2jy48cze6962ezp9jmj2380v8s9d3ah9ymk7hvvtxsztcruhkjwgzyrnnrjjz0svqt8txvdkalthwk9gp90pdk0xay7u7fjk72pt6d6pw6uue5n2 link and a raw nevent1qqs2jy48cze6962ezp9jmj2380v8s9d3ah9ymk7hvvtxsztcruhkjwgzyrnnrjjz0svqt8txvdkalthwk9gp90pdk0xay7u7fjk72pt6d6pw6uue5n2 link.`, + Out: []content.Token{ + { + Type: content.TokenTypeText, + Text: "This is a nostr note which contains a link: ", + }, + { + Type: content.TokenTypeLink, + Text: "https://example.com", + }, + { + Type: content.TokenTypeText, + Text: ". It also has a ", + }, + { + Type: content.TokenTypeNostrLink, + Text: "nevent1qqs2jy48cze6962ezp9jmj2380v8s9d3ah9ymk7hvvtxsztcruhkjwgzyrnnrjjz0svqt8txvdkalthwk9gp90pdk0xay7u7fjk72pt6d6pw6uue5n2", + }, + { + Type: content.TokenTypeText, + Text: " link and a raw ", + }, + { + Type: content.TokenTypeNostrLink, + Text: "nevent1qqs2jy48cze6962ezp9jmj2380v8s9d3ah9ymk7hvvtxsztcruhkjwgzyrnnrjjz0svqt8txvdkalthwk9gp90pdk0xay7u7fjk72pt6d6pw6uue5n2", + }, + { + Type: content.TokenTypeText, + Text: " link.", + }, + }, + }, + { + Name: "malformed_link_with_just_protocol", + In: `nostr:`, + Out: []content.Token{ + { + Type: content.TokenTypeText, + Text: "nostr:", + }, + }, + }, + { + Name: "malformed_link_with_just_protocol_and_type", + In: `nostr:nevent`, + Out: []content.Token{ + { + Type: content.TokenTypeText, + Text: "nostr:nevent", + }, + }, + }, + { + Name: "malformed_link_with_incorrect_data", + In: `nostr:neventl1Ii`, + Out: []content.Token{ + { + Type: content.TokenTypeNostrLink, + Text: "neventl1", + }, + { + Type: content.TokenTypeText, + Text: "Ii", + }, + }, + }, + { + Name: "links_with_protocol", + In: `nostr:npubac 
nostr:neventac`, + Out: []content.Token{ + { + Type: content.TokenTypeNostrLink, + Text: "npubac", + }, + { + Type: content.TokenTypeText, + Text: " ", + }, + { + Type: content.TokenTypeNostrLink, + Text: "neventac", + }, + }, + }, + { + Name: "links_without_protocol", + In: `npubac neventac`, + Out: []content.Token{ + { + Type: content.TokenTypeNostrLink, + Text: "npubac", + }, + { + Type: content.TokenTypeText, + Text: " ", + }, + { + Type: content.TokenTypeNostrLink, + Text: "neventac", + }, + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Name, func(t *testing.T) { + l := content.NewLexer(testCase.In) + + tokens, err := l.Lex() + require.NoError(t, err) + + require.Equal(t, testCase.Out, tokens) + }) + } +} diff --git a/service/domain/content/transformer.go b/service/domain/content/transformer.go new file mode 100644 index 0000000..8b30ca9 --- /dev/null +++ b/service/domain/content/transformer.go @@ -0,0 +1,37 @@ +package content + +import ( + "fmt" + + "github.com/boreq/errors" +) + +type Transformer struct { +} + +func NewTransformer() *Transformer { + return &Transformer{} +} + +func (t *Transformer) BreakdownAndTransform(content string) ([]string, error) { + tokens, err := NewLexer(content).Lex() + if err != nil { + return nil, errors.Wrap(err, "error lexing") + } + + var elements []string + for _, token := range tokens { + switch token.Type { + case TokenTypeText: + elements = append(elements, token.Text) + case TokenTypeLink: + elements = append(elements, token.Text) + case TokenTypeNostrLink: + elements = append(elements, fmt.Sprintf("https://njump.me/%s", token.Text)) + default: + return nil, fmt.Errorf("unknown token '%+v'", token.Type) + } + } + + return elements, nil +} diff --git a/service/domain/content/transformer_test.go b/service/domain/content/transformer_test.go new file mode 100644 index 0000000..c99f9e8 --- /dev/null +++ b/service/domain/content/transformer_test.go @@ -0,0 +1,39 @@ +package content_test + +import ( + 
"testing" + + "github.com/planetary-social/nos-crossposting-service/service/domain/content" + "github.com/stretchr/testify/require" +) + +func TestTransformer(t *testing.T) { + testCases := []struct { + Name string + In string + Out []string + }{ + { + Name: "complex note", + In: `This is a nostr note which contains a link: https://example.com. It also has a nostr:nevent1qqs2jy48cze6962ezp9jmj2380v8s9d3ah9ymk7hvvtxsztcruhkjwgzyrnnrjjz0svqt8txvdkalthwk9gp90pdk0xay7u7fjk72pt6d6pw6uue5n2 link.`, + Out: []string{ + "This is a nostr note which contains a link: ", + "https://example.com", + ". It also has a ", + "https://njump.me/nevent1qqs2jy48cze6962ezp9jmj2380v8s9d3ah9ymk7hvvtxsztcruhkjwgzyrnnrjjz0svqt8txvdkalthwk9gp90pdk0xay7u7fjk72pt6d6pw6uue5n2", + " link.", + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Name, func(t *testing.T) { + transformer := content.NewTransformer() + + out, err := transformer.BreakdownAndTransform(testCase.In) + require.NoError(t, err) + + require.Equal(t, testCase.Out, out) + }) + } +} diff --git a/service/domain/tweet_generator.go b/service/domain/tweet_generator.go index 89bb501..b320a55 100644 --- a/service/domain/tweet_generator.go +++ b/service/domain/tweet_generator.go @@ -3,18 +3,19 @@ package domain import ( "fmt" "strings" - "unicode/utf8" "github.com/boreq/errors" + "github.com/planetary-social/nos-crossposting-service/service/domain/content" ) const noteContentMaxLengthInRunes = 200 type TweetGenerator struct { + transformer *content.Transformer } -func NewTweetGenerator() *TweetGenerator { - return &TweetGenerator{} +func NewTweetGenerator(transformer *content.Transformer) *TweetGenerator { + return &TweetGenerator{transformer: transformer} } func (g *TweetGenerator) Generate(event Event) ([]Tweet, error) { @@ -31,33 +32,36 @@ func (g *TweetGenerator) Generate(event Event) ([]Tweet, error) { return nil, nil } - tweetText := g.createText(event) + tweetText, err := g.createText(event) + if err != nil { + 
return nil, errors.Wrap(err, "error creating text") + } return []Tweet{ NewTweet(tweetText), }, nil } -func (g *TweetGenerator) createText(event Event) string { +func (g *TweetGenerator) createText(event Event) (string, error) { + elements, err := g.transformer.BreakdownAndTransform(event.Content()) + if err != nil { + return "", errors.Wrap(err, "error transforming") + } + var builder strings.Builder - if utf8.RuneCountInString(event.Content()) <= noteContentMaxLengthInRunes { - builder.WriteString(event.Content()) - } else { - runeCounter := 0 - for _, rune := range event.Content() { - builder.WriteRune(rune) - runeCounter++ - if runeCounter >= noteContentMaxLengthInRunes { - break - } + for _, element := range elements { + futureTotalLength := builder.Len() + len(element) + if futureTotalLength > noteContentMaxLengthInRunes { + builder.WriteString("...") + break } - builder.WriteString("...") + + builder.WriteString(element) } builder.WriteString(fmt.Sprintf("\n\n%s", g.njumpLinkEvent(event))) - - return builder.String() + return builder.String(), nil } func (g *TweetGenerator) njumpLinkEvent(event Event) string { diff --git a/service/domain/tweet_generator_test.go b/service/domain/tweet_generator_test.go index fee7e9e..dffb525 100644 --- a/service/domain/tweet_generator_test.go +++ b/service/domain/tweet_generator_test.go @@ -7,6 +7,7 @@ import ( "github.com/nbd-wtf/go-nostr" "github.com/planetary-social/nos-crossposting-service/internal/fixtures" "github.com/planetary-social/nos-crossposting-service/service/domain" + "github.com/planetary-social/nos-crossposting-service/service/domain/content" "github.com/stretchr/testify/require" ) @@ -39,6 +40,14 @@ func TestTweetGenerator(t *testing.T) { }, GeneratesTweet: false, }, + { + Name: "event_with_nostr_link", + Event: nostr.Event{ + Kind: domain.EventKindNote.Int(), + Content: "The content marketing on social can be totally crazy. Just imagine once it is mostly created by LLMs? 
\n\nnostr:note14aj40jvqs3auq2488c9qxgsqh79zdl0vyhzvzp275g44hhe4etxss9ncxd", + }, + GeneratesTweet: true, + }, } for _, testCase := range testCases { @@ -53,7 +62,8 @@ func TestTweetGenerator(t *testing.T) { event, err := domain.NewEvent(libevent) require.NoError(t, err) - g := domain.NewTweetGenerator() + transformer := content.NewTransformer() + g := domain.NewTweetGenerator(transformer) tweets, err := g.Generate(event) require.NoError(t, err) @@ -62,9 +72,8 @@ func TestTweetGenerator(t *testing.T) { []domain.Tweet{ domain.NewTweet( fmt.Sprintf( - `Some text. - -https://njump.me/%s`, + "%s\n\nhttps://njump.me/%s", + event.Content(), event.Nevent(), ), ),