From 60e297321edf87a5a2a0670ce09d11effa5e8432 Mon Sep 17 00:00:00 2001 From: shuaikangzhou <863909694@qq.com> Date: Fri, 12 Jan 2024 00:11:21 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=B8=8D=E5=8F=AF=E6=89=93?= =?UTF-8?q?=E5=8D=B0=E5=AD=97=E7=AC=A6=E5=AF=BC=E8=87=B4=E7=9A=84docx?= =?UTF-8?q?=E5=AF=BC=E5=87=BA=E5=A4=B1=E8=B4=A5#297?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/DataBase/exporter_docx.py | 39 ++++++++++++++++++++++++++++++----- app/DataBase/msg.py | 3 +++ app/DataBase/output.py | 9 ++++---- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/app/DataBase/exporter_docx.py b/app/DataBase/exporter_docx.py index 6c51bfe..b1e5272 100644 --- a/app/DataBase/exporter_docx.py +++ b/app/DataBase/exporter_docx.py @@ -4,6 +4,7 @@ import time from re import findall import docx +import unicodedata from docx import shared from docx.enum.table import WD_ALIGN_VERTICAL from docx.enum.text import WD_COLOR_INDEX, WD_PARAGRAPH_ALIGNMENT @@ -12,10 +13,33 @@ from docxcompose.composer import Composer from app.DataBase import msg_db, hard_link_db from app.DataBase.output import ExporterBase, escape_js_and_html +from app.log import logger from app.person import Me from app.util.compress_content import parser_reply, share_card, music_share from app.util.image import get_image_abs_path from app.util.music import get_music_path +import string + + +def filter_control_characters(input_string): + """ + 过滤掉不可打印字符 + @param input_string: + @return: + """ + # 创建一个包含所有可打印字符的字符串 + printable_chars = set(string.printable) + + # 过滤掉非可打印字符 + filtered_string = ''.join(char for char in input_string if char in printable_chars) + + return filtered_string + +def is_control_char(ch): + '''Whether a control character. + https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python + ''' + return unicodedata.category(ch)[0] == 'C' class DocxExporter(ExporterBase): @@ -30,7 +54,12 @@ class DocxExporter(ExporterBase): display_name = self.get_display_name(is_send, message) avatar = self.get_avatar_path(is_send, message, True) content_cell = self.create_table(doc, is_send, avatar) - content_cell.paragraphs[0].add_run(str_content) + try: + content_cell.paragraphs[0].add_run(str_content) + except ValueError: + logger.error(f'非法字符:{str_content}') + str_content = filter_control_characters(str_content) + content_cell.paragraphs[0].add_run(str_content) content_cell.paragraphs[0].font_size = shared.Inches(0.5) if is_send: p = content_cell.paragraphs[0] @@ -77,7 +106,7 @@ class DocxExporter(ExporterBase): display_name = self.get_display_name(is_send, message) avatar = self.get_avatar_path(is_send, message, True) content_cell = self.create_table(doc, is_send, avatar) - content_cell.paragraphs[0].add_run('【表情包】') + content_cell.paragraphs[0].add_run('【语音】') content_cell.paragraphs[0].font_size = shared.Inches(0.5) if is_send: p = content_cell.paragraphs[0] @@ -233,7 +262,6 @@ class DocxExporter(ExporterBase): avatar = self.get_avatar_path(is_send, message) display_name = self.get_display_name(is_send, message) - def share_card(self, doc, message): origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}" is_send = message[4] @@ -260,6 +288,7 @@ class DocxExporter(ExporterBase): app_logo = './image/' + os.path.basename(app_logo) else: app_logo = '' + def merge_docx(self, conRemark, n): origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}" all_file_path = [] @@ -343,7 +372,7 @@ class DocxExporter(ExporterBase): if index % 25 == 0: print(f"【导出 DOCX {self.contact.remark}】{index}/{len(messages)}") if index % 25: - print(f"【导出 DOCX {self.contact.remark}】{index+1}/{len(messages)}") + print(f"【导出 DOCX {self.contact.remark}】{index + 1}/{len(messages)}") filename = os.path.join(origin_docx_path, f"{self.contact.remark}.docx") doc = docx.Document() doc.styles["Normal"].font.name = "Cambria" @@ -355,7 +384,7 @@ class DocxExporter(ExporterBase): # print(f"【MERGE Export DOCX {self.contact.remark}】{len(docs)}") doc = Composer(doc) # 针对11188条消息(56组)所测,反排比正排更快,正排65s,反排54s for index, dx in enumerate(docs[::-1]): - print(f"【合并 DOCX {self.contact.remark}】{index+1}/{len(docs)}") + print(f"【合并 DOCX {self.contact.remark}】{index + 1}/{len(docs)}") doc.insert(0, dx) try: doc.save(filename) diff --git a/app/DataBase/msg.py b/app/DataBase/msg.py index c41dc49..8a0f205 100644 --- a/app/DataBase/msg.py +++ b/app/DataBase/msg.py @@ -59,6 +59,9 @@ def parser_chatroom_message(messages): message.append(ContactDefault(wxid)) updated_messages.append(tuple(message)) continue + # todo 解析还是有问题,会出现这种带:的东西 + if ':' in wxid: # wxid_ewi8gfgpp0eu22:25319:1 + wxid = wxid.split(':')[0] contact_info_list = micro_msg_db.get_contact_by_username(wxid) if contact_info_list is None: # 群聊中已退群的联系人不会保存在数据库里 message.append(ContactDefault(wxid)) diff --git a/app/DataBase/output.py b/app/DataBase/output.py index 4f00d22..6aa91c7 100644 --- a/app/DataBase/output.py +++ b/app/DataBase/output.py @@ -116,15 +116,16 @@ class ExporterBase(QThread): return False def get_avatar_path(self, is_send, message, is_absolute_path=False) -> str: - if self.contact.is_chatroom: - avatar = message[12].smallHeadImgUrl - else: - avatar = Me().smallHeadImgUrl if is_send else self.contact.smallHeadImgUrl if is_absolute_path: if self.contact.is_chatroom: avatar = message[12].avatar_path else: avatar = Me().avatar_path if is_send else self.contact.avatar_path + else: + if self.contact.is_chatroom: + avatar = message[12].smallHeadImgUrl + else: + avatar = Me().smallHeadImgUrl if is_send else self.contact.smallHeadImgUrl return avatar def get_display_name(self, is_send, message) -> str: