Skip to content

Commit

Permalink
Detect links and nostr links in notes
Browse files Browse the repository at this point in the history
Links will not be broken up, so as to avoid breaking them. Nostr links will
be detected and converted to njump links.
  • Loading branch information
boreq committed Nov 13, 2023
1 parent c498ebc commit 306bd12
Show file tree
Hide file tree
Showing 6 changed files with 533 additions and 22 deletions.
290 changes: 290 additions & 0 deletions service/domain/content/lexer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
package content

import (
"strings"
"unicode"
"unicode/utf8"

"github.com/boreq/errors"
)

// Token is a single fragment of lexed input together with its classification.
type Token struct {
// Type says what kind of fragment this token holds.
Type TokenType
// Text is the text that had been accumulated when the token was emitted.
Text string
}

// Token types assigned by the lexer to the tokens it emits.
var (
// TokenTypeText marks plain text.
TokenTypeText = TokenType{"text"}
// TokenTypeLink marks a fragment that starts with "http://" or "https://".
TokenTypeLink = TokenType{"link"}
// TokenTypeNostrLink marks a fragment that starts with "nevent" or "npub".
TokenTypeNostrLink = TokenType{"nostrLink"}
)

// TokenType identifies the kind of a Token. The wrapped string is unexported,
// so other packages can only use the predeclared TokenType values or the zero
// value.
type TokenType struct {
s string
}

// Lexer splits a string into text, link and nostr link tokens.
type Lexer struct {
// in is the part of the input that has not been consumed yet.
in string
// out holds the runes that were consumed but not yet emitted as a token.
out string
// tokens collects the tokens emitted so far.
tokens []Token
}

// NewLexer returns a Lexer that will tokenize s. The input is not inspected
// here; validation happens when Lex is called.
func NewLexer(s string) *Lexer {
	lexer := new(Lexer)
	lexer.in = s
	return lexer
}

// Lex runs the state machine over the input and returns the emitted tokens.
//
// It returns an error if the input is not valid UTF-8 or if any state
// function reports an error; in both cases no tokens are returned.
func (l *Lexer) Lex() ([]Token, error) {
	if !utf8.ValidString(l.in) {
		return nil, errors.New("invalid utf-8")
	}

	// Start in the text state and keep following the returned states until
	// one of them signals the end by returning nil.
	for state := stateFn(stateText); state != nil; {
		following, err := state(l)
		if err != nil {
			return nil, errors.Wrap(err, "state fn error")
		}
		state = following
	}

	return l.tokens, nil
}

// next consumes one rune from the front of the input, appends it to the
// pending output and returns it. It returns false when the input is exhausted.
//
// It panics on malformed UTF-8; Lex validates the input before any state
// function runs, so this is not expected to happen.
func (l *Lexer) next() (rune, bool) {
	r, size := utf8.DecodeRuneInString(l.in)
	if r == utf8.RuneError {
		// A size of 3 is a legitimately encoded U+FFFD and is consumed
		// like any other rune.
		switch size {
		case 0:
			return 0, false
		case 1:
			panic("invalid utf-8") // validated in Lex
		}
	}

	consumed := l.in[:size]
	l.in = l.in[size:]
	l.out += consumed
	return r, true
}

// back undoes the most recent next: the last rune of the pending output is
// moved back to the front of the input. It returns an error if the pending
// output is empty or does not end in a valid rune.
func (l *Lexer) back() error {
	r, size := utf8.DecodeLastRuneInString(l.out)
	if r == utf8.RuneError {
		switch size {
		case 0:
			return errors.New("empty out")
		case 1:
			return errors.New("this should be impossible, out is malformed")
		}
	}

	cut := len(l.out) - size
	l.in = l.out[cut:] + l.in
	l.out = l.out[:cut]
	return nil
}

// backN calls back n times, stopping at the first failure. The returned error
// names the zero-based index of the call that failed.
func (l *Lexer) backN(n int) error {
	for done := 0; done < n; done++ {
		err := l.back()
		if err == nil {
			continue
		}
		return errors.Wrapf(err, "error calling back for index '%d'", done)
	}
	return nil
}

// tryOrBack consumes s from the front of the input if, and only if, the input
// starts with s. On success s is moved from the input to the pending output
// and true is returned. On failure the lexer is left exactly as it was and
// false is returned.
//
// The prefix is checked up front instead of consuming runes one by one and
// rewinding on a mismatch. The rune-by-rune approach rewound one rune too
// many when the input ended in the middle of s, which could move an unrelated
// rune out of the pending output or panic when the pending output was empty.
func (l *Lexer) tryOrBack(s string) bool {
	if !strings.HasPrefix(l.in, s) {
		return false
	}

	l.in = l.in[len(s):]
	l.out += s
	return true
}

// comesNext reports whether the unconsumed input starts with s. It never
// changes the state of the lexer.
//
// This is a plain prefix check rather than a consume-and-rewind sequence:
// every consumed or rewound rune copies the input or the pending output, and
// stateText probes several prefixes for every rune of text, which made lexing
// quadratic in the length of the input.
func (l *Lexer) comesNext(s string) bool {
	return strings.HasPrefix(l.in, s)
}

// emit turns the pending output into a token of the given type and clears the
// pending output. Nothing is emitted when the pending output is empty.
func (l *Lexer) emit(typ TokenType) {
	if l.out == "" {
		return
	}

	tok := Token{Type: typ, Text: l.out}
	l.tokens = append(l.tokens, tok)
	l.out = ""
}

// peek returns the next rune of the input without consuming it. It returns
// false when the input is exhausted.
func (l *Lexer) peek() (rune, bool) {
	r, ok := l.next()
	if !ok {
		return r, false
	}

	// Put the rune we have just consumed back where it came from.
	if err := l.back(); err != nil {
		panic(err)
	}
	return r, true
}

// destroyPrefix drops prefix from the start of the pending output. The
// pending output is left untouched if it does not start with prefix.
func (l *Lexer) destroyPrefix(prefix string) {
	if strings.HasPrefix(l.out, prefix) {
		l.out = l.out[len(prefix):]
	}
}

// stateFn is a single state of the lexer's state machine. It advances the
// lexer and returns the state to run next; Lex stops when the returned state
// is nil and aborts when the returned error is not nil.
type stateFn func(l *Lexer) (stateFn, error)

// Literal prefixes the lexer looks for in the input.
const (
// httpColonSlashSlash and httpsColonSlashSlash are the protocol prefixes
// that start a link.
httpColonSlashSlash = "http://"
httpsColonSlashSlash = "https://"
// nostrColon is the optional scheme prefix of a nostr link; nostr links
// are also recognised without it.
nostrColon = "nostr:"
// nevent and npub are the entity prefixes that start the body of a nostr
// link.
nevent = "nevent"
npub = "npub"
)

// stateText consumes plain text rune by rune. As soon as the remaining input
// starts with a link protocol, with "nostr:" or with a bare "nevent"/"npub",
// the text collected so far is emitted and the matching state takes over. At
// the end of the input the remaining text is emitted and lexing stops.
func stateText(l *Lexer) (stateFn, error) {
	for {
		var handoff stateFn
		switch {
		case l.comesNext(httpColonSlashSlash), l.comesNext(httpsColonSlashSlash):
			handoff = stateLinkProtocol
		case l.comesNext(nostrColon):
			handoff = stateNostrLinkProtocol
		case l.comesNext(nevent), l.comesNext(npub):
			handoff = stateNostrLinkType
		}

		if handoff != nil {
			l.emit(TokenTypeText)
			return handoff, nil
		}

		if _, ok := l.next(); !ok {
			l.emit(TokenTypeText)
			return nil, nil
		}
	}
}

// stateLinkProtocol consumes the "http://" or "https://" prefix that the
// previous state saw and hands over to stateLinkAddress. It is an error for
// neither prefix to be present.
func stateLinkProtocol(l *Lexer) (stateFn, error) {
	if l.tryOrBack(httpColonSlashSlash) || l.tryOrBack(httpsColonSlashSlash) {
		return stateLinkAddress, nil
	}
	return nil, errors.New("where did the protocol go?")
}

// stateLinkAddress consumes the part of a link that follows the protocol.
//
// The address is made of runes accepted by isValidLinkCharacterExcludingDot
// and of dots that are directly followed by such a rune. The first rune that
// does not qualify ends the address and is left in the input for stateText.
//
// If at least one address rune was consumed the pending output is emitted as
// a link; a protocol without any address is emitted as text. This holds no
// matter how the address ends, so a bare "http://" followed by a space is
// text just like a bare "http://" at the end of the input, and a trailing dot
// is kept out of the link both before a space and at the end of the input.
//
// NOTE(review): characters such as '/', '?', '-' or '=' are not accepted by
// isValidLinkCharacterExcludingDot, so a link currently ends at the first
// path separator. Confirm whether that is intended.
func stateLinkAddress(l *Lexer) (stateFn, error) {
	// addressLen counts the runes accepted as part of the address, not
	// counting the protocol.
	addressLen := 0

	// finish emits whatever was collected and moves on to following.
	finish := func(following stateFn) (stateFn, error) {
		if addressLen == 0 {
			l.emit(TokenTypeText)
		} else {
			l.emit(TokenTypeLink)
		}
		return following, nil
	}

	for {
		r, ok := l.next()
		if !ok {
			return finish(nil)
		}

		if isValidLinkCharacterExcludingDot(r) {
			addressLen++
			continue
		}

		if r == '.' {
			// A dot belongs to the address only when the address
			// continues right after it; otherwise it is most likely
			// punctuation that ends a sentence.
			if following, ok := l.peek(); ok && isValidLinkCharacterExcludingDot(following) {
				addressLen++
				continue
			}
		}

		// r is not part of the address, give it back to the input.
		if err := l.back(); err != nil {
			if r == '.' {
				return nil, errors.New("where did the dot go?")
			}
			return nil, errors.New("we just went forward but we can't go back?")
		}
		return finish(stateText)
	}
}

// stateNostrLinkProtocol consumes the "nostr:" prefix that the previous state
// saw and hands over to stateNostrLinkType. It is an error for the prefix to
// be missing.
func stateNostrLinkProtocol(l *Lexer) (stateFn, error) {
	if l.tryOrBack(nostrColon) {
		return stateNostrLinkType, nil
	}
	return nil, errors.New("where did 'nostr:' go?")
}

// stateNostrLinkType consumes the "nevent" or "npub" entity prefix. If one of
// them is present lexing continues with stateNostrLinkData, otherwise
// whatever was consumed so far is treated as text again.
func stateNostrLinkType(l *Lexer) (stateFn, error) {
	if l.tryOrBack(nevent) || l.tryOrBack(npub) {
		return stateNostrLinkData, nil
	}
	return stateText, nil
}

// stateNostrLinkData consumes the runes that follow the entity prefix for as
// long as isBech32 accepts them. The first rejected rune is left in the input.
//
// If at least one rune was consumed, a leading "nostr:" is stripped from the
// pending output and the rest is emitted as a nostr link. Otherwise the
// pending output is emitted as text. Lexing always continues with stateText.
func stateNostrLinkData(l *Lexer) (stateFn, error) {
	for consumed := 0; ; consumed++ {
		r, ok := l.next()
		if ok && isBech32(r) {
			continue
		}

		// The rune that ended the data does not belong to the link.
		if ok {
			if err := l.back(); err != nil {
				return nil, errors.Wrap(err, "we just consumed a rune?")
			}
		}

		if consumed > 0 {
			l.destroyPrefix(nostrColon)
			l.emit(TokenTypeNostrLink)
		} else {
			l.emit(TokenTypeText)
		}

		return stateText, nil
	}
}

// isValidLinkCharacterExcludingDot reports whether r may appear in the address
// part of a link: any Unicode letter, any Unicode number, or '%'. The dot is
// deliberately left out because the caller handles it separately.
func isValidLinkCharacterExcludingDot(r rune) bool {
	switch {
	case r == '%':
		return true
	case unicode.IsLetter(r), unicode.IsNumber(r):
		return true
	default:
		return false
	}
}

// isBech32 reports whether r may appear in the body of a nostr link: an ASCII
// letter of either case other than 'b', 'i' and 'o', or an ASCII digit.
//
// Note that '1' is accepted as well. NOTE(review): presumably so that the
// separator between the human-readable part and the data passes - confirm.
func isBech32(r rune) bool {
	// Reject non-ASCII runes before folding the case: some of them fold to
	// ASCII letters (U+212A KELVIN SIGN lowercases to 'k') and would
	// otherwise be accepted.
	if r > unicode.MaxASCII {
		return false
	}

	switch r = unicode.ToLower(r); r {
	case 'b', 'i', 'o':
		return false
	}
	return (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
}
Loading

0 comments on commit 306bd12

Please sign in to comment.