From f0f8941993b9d952377f8612bb16879d89249027 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Thu, 21 May 2020 21:35:08 +0800
Subject: [PATCH] Fix parsing logic for url in weibo text
---
hoshino/modules/weibo/__init__.py | 9 ++++++
hoshino/modules/weibo/weibo.py | 50 ++++++++++++++++++++++---------
2 files changed, 45 insertions(+), 14 deletions(-)
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
index b75083417..1b781b26e 100644
--- a/hoshino/modules/weibo/__init__.py
+++ b/hoshino/modules/weibo/__init__.py
@@ -18,6 +18,9 @@
}]
'''
+
+lmt = util.FreqLimiter(5)
+
def _load_config(services_config):
for sv_config in services_config:
sv.logger.debug(sv_config)
@@ -70,11 +73,17 @@ def wb_to_message(wb):
videos = wb["video_url"]
res_videos = ';'.join(videos)
msg = f'{msg}\n视频链接:{res_videos}'
+
return msg
# @bot 看微博 alias
@sv.on_command('看微博', only_to_me=True)
async def get_last_5_weibo(session):
+ uid = session.ctx['user_id']
+ if not lmt.check(uid):
+ session.finish('您查询得过于频繁,请稍等片刻', at_sender=True)
+ lmt.start_cd(uid)
+
alias = session.current_arg_text
if alias not in alias_dic:
await session.finish(f"未找到微博: {alias}")
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index a91a54280..65c8319f5 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -25,7 +25,10 @@ def __init__(self, config):
asyncio.get_event_loop().run_until_complete(self._async_init())
async def _async_init(self):
+ self.__init = True
self.user = await self.get_user_info(self.user_id)
+ await self.get_latest_weibos()
+ self.__init = False
async def get_json(self, params):
"""获取网页中json数据"""
@@ -158,16 +161,6 @@ def get_location(self, selector):
break
return location
- def get_article_url(self, selector):
- """获取微博中头条文章的url"""
- article_url = ''
- text = selector.xpath('string(.)')
- if text.startswith(u'发布了头条文章'):
- url = selector.xpath('//a/@data-url')
- if url and url[0].startswith('http://t.cn'):
- article_url = url[0]
- return article_url
-
def get_topics(self, selector):
"""获取参与的微博话题"""
span_list = selector.xpath("//span[@class='surl-text']")
@@ -193,6 +186,27 @@ def get_at_users(self, selector):
at_users = ','.join(at_list)
return at_users
+    def get_text(self, text_body):
+        """Return the weibo text with each short link's url kept inline.
+
+        A link is expected as <a data-url="..."> wrapping a
+        <span class="surl-text"> that holds the link title. The url is
+        appended to that title before the markup is flattened, e.g.
+        "title" becomes "title(http://t.cn/A622uDbW)".
+        """
+        selector = etree.HTML(text_body)
+        # Look the span up inside its own <a> so an anchor without a
+        # surl-text span is skipped instead of raising IndexError or
+        # shifting every following url onto the wrong title.
+        for anchor in selector.xpath('//a[@data-url]'):
+            span = anchor.find('span[@class="surl-text"]')
+            if span is None:
+                continue
+            # span.text is None for an empty title; avoid "None(url)"
+            title = span.text or ''
+            span.text = f'{title}({anchor.get("data-url")})'
+        return selector.xpath('string(.)')
+
def string_to_int(self, string):
"""字符串转换为整数"""
if isinstance(string, int):
@@ -217,11 +231,17 @@ def standardize_date(self, created_at):
hour = created_at[:created_at.find(u"小时")]
hour = timedelta(hours=int(hour))
created_at = (datetime.now() - hour).strftime("%Y-%m-%d")
- self.__recent = False
+ if self.__init:
+ self.__recent = True
+ else:
+ self.__recent = False
elif u"昨天" in created_at:
day = timedelta(days=1)
created_at = (datetime.now() - day).strftime("%Y-%m-%d")
- self.__recent = False
+ if self.__init:
+ self.__recent = True
+ else:
+ self.__recent = False
elif created_at.count('-') == 1:
year = datetime.now().strftime("%Y")
created_at = year + "-" + created_at
@@ -250,8 +270,10 @@ def parse_weibo(self, weibo_info):
weibo['bid'] = weibo_info['bid']
text_body = weibo_info['text']
selector = etree.HTML(text_body)
- weibo['text'] = etree.HTML(text_body).xpath('string(.)')
- weibo['article_url'] = self.get_article_url(selector)
+
+
+ weibo['text'] = self.get_text(text_body)
+
weibo['pics'] = self.get_pics(weibo_info)
weibo['video_url'] = self.get_video_url(weibo_info)
weibo['location'] = self.get_location(selector)