Skip to content

Commit

Permalink
bluesky.from_as1: guess missing indices in facets based on text
Browse files Browse the repository at this point in the history
...even if it was converted from HTML content. for #675
  • Loading branch information
snarfed committed Apr 1, 2024
1 parent f9c7554 commit 42a4153
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 87 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ Changelog
* Add hashtag facet support.
* `from_as1`:
* Add hashtag support.
* Guess missing indices in facets based on content text.
* Populate `reply.root` properly in reply posts ([snarfed/bridgy#1696](https://github.com/snarfed/bridgy/issues/1696)).
* Add `value` boolean kwarg to `from_as1_to_strong_ref`.
* `microformats2`:
Expand Down
154 changes: 75 additions & 79 deletions granary/bluesky.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,73 +504,72 @@ def from_as1(obj, out_type=None, blobs=None, client=None):
}

elif verb == 'post' and type in POST_TYPES:
# convert text to HTML and truncate
# convert text from HTML and truncate
src = Bluesky('unused')
content = obj.get('content')
text = obj.get('summary') or content or obj.get('name') or ''
text = src.truncate(html_to_text(text), None, OMIT_LINK)

facets = []
if text == content:
# convert index-based tags to facets
for tag in util.get_list(obj, 'tags'):
type = tag.get('objectType')
url = tag.get('url')
if not url and type != 'hashtag':
continue

facet = {
'$type': 'app.bsky.richtext.facet',
# convert index-based tags to facets
for tag in util.get_list(obj, 'tags'):
type = tag.get('objectType')
url = tag.get('url')
if not url and type != 'hashtag':
continue

facet = {
'$type': 'app.bsky.richtext.facet',
}
try:
start = int(tag['startIndex'])
if start and obj.get('content_is_html'):
raise NotImplementedError('HTML content is not supported with index tags')
end = start + int(tag['length'])

facet['index'] = {
# convert indices from Unicode chars to UTF-8 encoded bytes
# https://github.com/snarfed/atproto/blob/5b0c2d7dd533711c17202cd61c0e101ef3a81971/lexicons/app/bsky/richtext/facet.json#L34
'byteStart': len(content[:start].encode()),
'byteEnd': len(content[:end].encode()),
}
try:
start = int(tag['startIndex'])
if start and obj.get('content_is_html'):
raise NotImplementedError('HTML content is not supported with index tags')
end = start + int(tag['length'])

facet['index'] = {
# convert indices from Unicode chars to UTF-8 encoded bytes
# https://github.com/snarfed/atproto/blob/5b0c2d7dd533711c17202cd61c0e101ef3a81971/lexicons/app/bsky/richtext/facet.json#L34
'byteStart': len(content[:start].encode()),
'byteEnd': len(content[:end].encode()),
}
except (KeyError, ValueError, IndexError, TypeError):
pass

if type == 'hashtag':
if name := tag.get('displayName'):
facet['features'] = [{
'$type': 'app.bsky.richtext.facet#tag',
'tag': name,
}]
if 'index' not in facet:
# find (first) location
# can't use \b for word boundaries here because that only includes
# alphanumerics, and Bluesky hashtags can include emoji
match = re.search(fr'[\s^](#{name})[\s$]', content)
if match:
start_bytes = len(content[:match.start(1)].encode())
facet['index'] = {
'byteStart': start_bytes + 1,
'byteEnd': start_bytes + 1 + len(name.encode()),
}

elif type == 'mention':
facet['features'] = [{
'$type': 'app.bsky.richtext.facet#mention',
# TODO: support bsky.app URLs with handles by resolving them?
'did': (url if url.startswith('did:')
else url.removeprefix(f'{Bluesky.BASE_URL}/profile/')
if url.startswith(f'{Bluesky.BASE_URL}/profile/did:')
else ''),
}]
else:
except (KeyError, ValueError, IndexError, TypeError):
pass

if type == 'hashtag':
if name := tag.get('displayName'):
facet['features'] = [{
'$type': 'app.bsky.richtext.facet#link',
'uri': url,
'$type': 'app.bsky.richtext.facet#tag',
'tag': name,
}]
if 'index' not in facet:
# find (first) location
# can't use \b for word boundaries here because that only includes
# alphanumerics, and Bluesky hashtags can include emoji
match = re.search(fr'[\s^](#{name})[\s$]', text)
if match:
start_bytes = len(content[:match.start(1)].encode())
facet['index'] = {
'byteStart': start_bytes + 1,
'byteEnd': start_bytes + 1 + len(name.encode()),
}

elif type == 'mention':
facet['features'] = [{
'$type': 'app.bsky.richtext.facet#mention',
# TODO: support bsky.app URLs with handles by resolving them?
'did': (url if url.startswith('did:')
else url.removeprefix(f'{Bluesky.BASE_URL}/profile/')
if url.startswith(f'{Bluesky.BASE_URL}/profile/did:')
else ''),
}]
else:
facet['features'] = [{
'$type': 'app.bsky.richtext.facet#link',
'uri': url,
}]

facets.append(facet)
facets.append(facet)

# images
images_embed = images_record_embed = None
Expand Down Expand Up @@ -1502,24 +1501,6 @@ def _create(self, obj, preview=None, include_link=OMIT_LINK,
url = obj.get('url')
content = self.truncate(content, url, include_link, type)

# facet for link to original post, if any
url_facets = []
if url:
url_index = content.rfind(url)
if url_index != -1:
byte_start = len(content[:url_index].encode())
url_facets = [{
'$type': 'app.bsky.richtext.facet',
'features': [{
'$type': 'app.bsky.richtext.facet#link',
'uri': url,
}],
'index': {
'byteStart': byte_start,
'byteEnd': byte_start + len(url.encode()),
},
}]

# TODO linkify mentions and hashtags
preview_content = util.linkify(content, pretty=True, skip_bare_cc_tlds=True)

Expand Down Expand Up @@ -1603,10 +1584,25 @@ def _create(self, obj, preview=None, include_link=OMIT_LINK,
else:
blobs = self.upload_media(images)
post_atp = from_as1(obj, blobs=blobs, client=self)
post_atp.update({
'text': content,
'facets': url_facets,
})
post_atp['text'] = content

# facet for link to original post, if any
if url:
url_index = content.rfind(url)
if url_index != -1:
byte_start = len(content[:url_index].encode())
post_atp.setdefault('facets', []).append({
'$type': 'app.bsky.richtext.facet',
'features': [{
'$type': 'app.bsky.richtext.facet#link',
'uri': url,
}],
'index': {
'byteStart': byte_start,
'byteEnd': byte_start + len(url.encode()),
},
})

result = self.client.com.atproto.repo.createRecord({
'repo': self.did,
'collection': post_atp['$type'],
Expand Down
16 changes: 9 additions & 7 deletions granary/tests/test_bluesky.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,14 @@ def test_from_as1_tag_hashtag_guess_index(self):

self.assert_equals(POST_BSKY_FACET_HASHTAG, from_as1(note))

def test_from_as1_tag_hashtag_html_content_guess_index(self):
    """Hashtag tag without indices, on a note whose content is HTML.

    Replaces the note's content with an HTML snippet wrapping the hashtag,
    removes ``startIndex`` and ``length`` from the first tag, and checks
    that ``from_as1`` still produces ``POST_BSKY_FACET_HASHTAG``.
    """
    html_note = copy.deepcopy(NOTE_AS_TAG_HASHTAG)
    html_note['content'] = '<p>foo <a class="p-category">#hache-☕</a> bar</p>'

    # strip the explicit position so the converter has to locate the hashtag
    hashtag = html_note['tags'][0]
    for index_key in ('startIndex', 'length'):
        del hashtag[index_key]

    converted = from_as1(html_note)
    self.assert_equals(POST_BSKY_FACET_HASHTAG, converted)

def test_from_as1_post_with_image(self):
expected = copy.deepcopy(POST_BSKY_IMAGES)
del expected['embed']['images'][0]['image']
Expand Down Expand Up @@ -1264,7 +1272,6 @@ def test_to_as1_embed_block(self):
}))

def test_to_as1_facet_link_and_embed(self):

self.assert_equals(trim_nulls({
**POST_AS_EMBED,
'id': None,
Expand Down Expand Up @@ -1660,7 +1667,6 @@ def test_create_reply(self, mock_get, mock_post):
reply_bsky = copy.deepcopy(REPLY_BSKY)
reply_bsky['reply']['root']['cid'] = \
reply_bsky['reply']['parent']['cid'] = 'my-syd'
reply_bsky['facets'] = []
self.assert_call(mock_post, 'com.atproto.repo.createRecord', json={
'repo': self.bs.did,
'collection': 'app.bsky.feed.post',
Expand Down Expand Up @@ -1799,10 +1805,7 @@ def test_create_with_media(self, mock_get, mock_post):
self.assert_call(mock_post, 'com.atproto.repo.createRecord', json={
'repo': self.bs.did,
'collection': 'app.bsky.feed.post',
'record': {
**POST_BSKY_IMAGES,
'facets': [],
},
'record': POST_BSKY_IMAGES,
})

# @patch('requests.post')
Expand Down Expand Up @@ -1865,7 +1868,6 @@ def test_create_bookmark(self, mock_post):
'$type': 'app.bsky.feed.post',
'text': 'foo ☕ bar', # TODO \n\nhttps://example.com/foo',
'createdAt': '2022-01-02T03:04:05.000Z',
'facets': [],
},
# TODO
# 'facets': [{
Expand Down
13 changes: 12 additions & 1 deletion granary/tests/testdata/note.bsky-from-as.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,16 @@
"$type": "app.bsky.embed.images#image",
"alt": ""
}]
}
},
"facets": [{
"$type": "app.bsky.richtext.facet",
"features": [{
"$type": "app.bsky.richtext.facet#link",
"uri": "http://my/link"
}],
"index": {
"byteEnd": 12,
"byteStart": 8
}
}]
}

0 comments on commit 42a4153

Please sign in to comment.