From f0f8941993b9d952377f8612bb16879d89249027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Thu, 21 May 2020 21:35:08 +0800 Subject: [PATCH] Fix parsing logic for url in weibo text --- hoshino/modules/weibo/__init__.py | 9 ++++++ hoshino/modules/weibo/weibo.py | 50 ++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py index b75083417..1b781b26e 100644 --- a/hoshino/modules/weibo/__init__.py +++ b/hoshino/modules/weibo/__init__.py @@ -18,6 +18,9 @@ }] ''' + +lmt = util.FreqLimiter(5) + def _load_config(services_config): for sv_config in services_config: sv.logger.debug(sv_config) @@ -70,11 +73,17 @@ def wb_to_message(wb): videos = wb["video_url"] res_videos = ';'.join(videos) msg = f'{msg}\n视频链接:{res_videos}' + return msg # @bot 看微博 alias @sv.on_command('看微博', only_to_me=True) async def get_last_5_weibo(session): + uid = session.ctx['user_id'] + if not lmt.check(uid): + session.finish('您查询得过于频繁,请稍等片刻', at_sender=True) + lmt.start_cd(uid) + alias = session.current_arg_text if alias not in alias_dic: await session.finish(f"未找到微博: {alias}") diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index a91a54280..65c8319f5 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -25,7 +25,10 @@ def __init__(self, config): asyncio.get_event_loop().run_until_complete(self._async_init()) async def _async_init(self): + self.__init = True self.user = await self.get_user_info(self.user_id) + await self.get_latest_weibos() + self.__init = False async def get_json(self, params): """获取网页中json数据""" @@ -158,16 +161,6 @@ def get_location(self, selector): break return location - def get_article_url(self, selector): - """获取微博中头条文章的url""" - article_url = '' - text = selector.xpath('string(.)') - if text.startswith(u'发布了头条文章'): - url = 
selector.xpath('//a/@data-url') - if url and url[0].startswith('http://t.cn'): - article_url = url[0] - return article_url - def get_topics(self, selector): """获取参与的微博话题""" span_list = selector.xpath("//span[@class='surl-text']") @@ -193,6 +186,27 @@ def get_at_users(self, selector): at_users = ','.join(at_list) return at_users + def get_text(self, text_body): + selector = etree.HTML(text_body) + url_lists = selector.xpath('//a[@data-url]/@data-url') + url_elems = selector.xpath('//a[@data-url]/span[@class="surl-text"]') + + ''' + Add the url of <a> to the text of <span> + For example: + + <a data-url="http://t.cn/A622uDbW"> + <span class="surl-text"> + 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻 + </span> + </a> + replace 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻 + with 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻(http://t.cn/A622uDbW) + ''' + for i in range(0, len(url_lists)): + url_elems[i].text = f'{url_elems[i].text}({url_lists[i]})' + return selector.xpath('string(.)') + def string_to_int(self, string): """字符串转换为整数""" if isinstance(string, int): @@ -217,11 +231,17 @@ def standardize_date(self, created_at): hour = created_at[:created_at.find(u"小时")] hour = timedelta(hours=int(hour)) created_at = (datetime.now() - hour).strftime("%Y-%m-%d") - self.__recent = False + if self.__init: + self.__recent = True + else: + self.__recent = False elif u"昨天" in created_at: day = timedelta(days=1) created_at = (datetime.now() - day).strftime("%Y-%m-%d") - self.__recent = False + if self.__init: + self.__recent = True + else: + self.__recent = False elif created_at.count('-') == 1: year = datetime.now().strftime("%Y") created_at = year + "-" + created_at @@ -250,8 +270,10 @@ def parse_weibo(self, weibo_info): weibo['bid'] = weibo_info['bid'] text_body = weibo_info['text'] selector = etree.HTML(text_body) - weibo['text'] = etree.HTML(text_body).xpath('string(.)') - weibo['article_url'] = self.get_article_url(selector) + + + weibo['text'] = self.get_text(text_body) + weibo['pics'] = self.get_pics(weibo_info) weibo['video_url'] = self.get_video_url(weibo_info) weibo['location'] = self.get_location(selector)