diff --git a/app/DataBase/exporter_docx.py b/app/DataBase/exporter_docx.py index 48ebfdb..a718188 100644 --- a/app/DataBase/exporter_docx.py +++ b/app/DataBase/exporter_docx.py @@ -40,84 +40,6 @@ def filter_control_characters(input_string): class DocxExporter(ExporterBase): - def merge_docx(self, n): - self.process_num += 1 - conRemark = self.contact.remark - origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}" - filename = f"{origin_docx_path}/{conRemark}_{n}.docx" - # print(all_file_path) - doc = docx.Document(filename) - if self.merged_doc_index == [-1, -1]: - self.document.append(doc) - self.merged_doc_index = [n, n] - else: - if n == self.merged_doc_index[0] - 1: - self.document.insert(0, doc) - self.merged_doc_index[0] -= 1 - elif n == self.merged_doc_index[1] + 1: - self.document.append(doc) - self.merged_doc_index[1] += 1 - else: - self.docs.append([doc, n]) - self.docs_set.add(n) - new_docx = [] - new_set = set() - # print(self.docs) - while new_set!=self.docs_set: - self.docs.sort(key=lambda x: x[1]) - for doc_, index in self.docs: - if index == self.merged_doc_index[0] - 1: - self.document.insert(0, doc_) - self.merged_doc_index[0] -= 1 - elif index == self.merged_doc_index[1] + 1: - self.document.append(doc_) - self.merged_doc_index[1] += 1 - else: - new_docx.append([doc_, index]) - new_set.add(index) - self.docs = new_docx - self.docs_set = new_set - os.remove(filename) - if self.process_num == self.child_thread_num: - # self.document.append(self.document) - file = os.path.join(origin_docx_path, f'{conRemark}.docx') - try: - self.document.save(file) - except PermissionError: - file = file[:-5] + f'{time.time()}' + '.docx' - self.document.save(file) - self.okSignal.emit(1) - - def export(self): - self.child_threads = [] - messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range) - # 计算每个子列表的长度 - num = 1 - # num = len(messages) // 500 +1 - sublist_length = len(messages) // num - - # 使用列表切片将列表分成n个子列表 - divided_list = [messages[i:i + sublist_length] for i in range(0, len(messages), sublist_length)] - self.child_thread_num = len(divided_list) - self.process_num = 0 - doc = docx.Document() - doc.styles["Normal"].font.name = "Cambria" - doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") - self.document = Composer(doc) - self.merged_doc_index = [-1, -1] - self.docs = [] - self.docs_set = set() - # self.document.append(self.document) - for i in range(self.child_thread_num): - child_thread = DocxExporterChildThread(self.contact, type_=self.DOCX, message_types=self.message_types, - time_range=self.time_range, messages=divided_list[i], index=i) - self.child_threads.append(child_thread) - child_thread.okSignal.connect(self.merge_docx) - child_thread.progressSignal.connect(self.progressSignal) - child_thread.start() - - -class DocxExporterChildThread(ExporterBase): def text(self, doc, message): type_ = message[2] str_content = message[7] @@ -139,11 +61,11 @@ class DocxExporterChildThread(ExporterBase): logger.error(f'非法字符:{str_content}') content_cell.paragraphs[0].add_run('非法字符') content_cell.paragraphs[0].font_size = shared.Inches(0.5) + # doc.add_picture(avatar) if is_send: p = content_cell.paragraphs[0] p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT doc.add_paragraph() - def image(self, doc, message): origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}" type_ = message[2] @@ -392,7 +314,8 @@ class DocxExporterChildThread(ExporterBase): def export(self): print(f"【开始导出 DOCX {self.contact.remark}】") origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}" - messages = self.messages + # messages = self.messages + messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range) Me().save_avatar(os.path.join(f"{origin_docx_path}/avatar/{Me().wxid}.png")) if self.contact.is_chatroom: for message in messages: @@ -407,29 +330,21 @@ class DocxExporterChildThread(ExporterBase): else: self.contact.save_avatar(os.path.join(f"{origin_docx_path}/avatar/{self.contact.wxid}.png")) self.rangeSignal.emit(len(messages)) - - index = 0 - def newdoc(): nonlocal n, doc doc = docx.Document() doc.styles["Normal"].font.name = "Cambria" doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") - docs.append(doc) n += 1 - doc = None - docs = [] n = 0 index = 0 newdoc() - # document = docx.Document() - # doc = document.add_paragraph() for index, message in enumerate(messages): if index % 200 == 0 and index: - # doc = document.add_paragraph() - # filename = os.path.join(origin_docx_path, f"{self.contact.remark}{n}.docx") - # doc.save(filename) + filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx") + doc.save(filename) + self.okSignal.emit(n) newdoc() type_ = message[2] @@ -459,20 +374,7 @@ class DocxExporterChildThread(ExporterBase): print(f"【导出 DOCX {self.contact.remark}】{index}/{len(messages)}") if index % 25: print(f"【导出 DOCX {self.contact.remark}】{index + 1}/{len(messages)}") - filename = os.path.join(origin_docx_path, f"{self.contact.remark}.docx") - doc = docx.Document() - doc.styles["Normal"].font.name = "Cambria" - doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") - # doc = Composer(doc) - # for index, dx in enumerate(docs): - # print(f"【MERGE Export DOCX {self.contact.remark}】{index}/{len(docs)}") - # doc.append(dx) - # print(f"【MERGE Export DOCX {self.contact.remark}】{len(docs)}") - doc = Composer(doc) # 针对11188条消息(56组)所测,反排比正排更快,正排65s,反排54s - for index, dx in enumerate(docs[::-1]): - print(f"【合并 DOCX {self.contact.remark}】{index + 1}/{len(docs)}") - doc.insert(0, dx) - filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{self.index}.docx") + filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx") try: # document.save(filename) doc.save(filename) @@ -480,5 +382,6 @@ class DocxExporterChildThread(ExporterBase): filename = filename[:-5] + f'{time.time()}' + '.docx' # document.save(filename) doc.save(filename) + self.okSignal.emit(n) print(f"【完成导出 DOCX {self.contact.remark}】") - self.okSignal.emit(self.index) + self.okSignal.emit(10086) diff --git a/app/DataBase/output.py b/app/DataBase/output.py index 359ee4a..763ff0f 100644 --- a/app/DataBase/output.py +++ b/app/DataBase/output.py @@ -61,6 +61,8 @@ def makedirs(path): def escape_js_and_html(input_str): + if not input_str: + return '' # 转义HTML特殊字符 html_escaped = html.escape(input_str, quote=False) diff --git a/app/DataBase/output_pc.py b/app/DataBase/output_pc.py index 6905b45..540e7c4 100644 --- a/app/DataBase/output_pc.py +++ b/app/DataBase/output_pc.py @@ -1,10 +1,14 @@ import csv import os +import time import traceback from typing import List +import docx from PyQt5.QtCore import pyqtSignal, QThread, QObject from PyQt5.QtWidgets import QFileDialog +from docx.oxml.ns import qn +from docxcompose.composer import Composer from app.DataBase.exporter_csv import CSVExporter from app.DataBase.exporter_docx import DocxExporter @@ -20,7 +24,7 @@ from ..util.image import get_image os.makedirs('./data/聊天记录', exist_ok=True) -class Output(QObject): +class Output(QThread): """ 发送信息线程 """ @@ -39,7 +43,7 @@ class Output(QObject): TXT = 5 Batch = 10086 - def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None,parent=None): + def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None): super().__init__(parent) self.children = [] self.last_timestamp = 0 @@ -138,7 +142,7 @@ class Output(QObject): print(self.sub_type, self.message_types) print(len(self.contact)) print([contact.remark for contact in self.contact]) - self.batch_num_total = len(self.contact)*len(self.sub_type) + self.batch_num_total = len(self.contact) * len(self.sub_type) self.batch_num = 0 self.rangeSignal.emit(self.batch_num_total) for contact in self.contact: @@ -146,32 +150,66 @@ class Output(QObject): for type_ in self.sub_type: # print('导出类型', type_) if type_ == self.DOCX: - self.to_docx(contact, self.message_types,True) + self.to_docx(contact, self.message_types, True) elif type_ == self.TXT: # print('批量导出txt') - self.to_txt(contact, self.message_types,True) + self.to_txt(contact, self.message_types, True) elif type_ == self.CSV: - self.to_csv(contact, self.message_types,True) + self.to_csv(contact, self.message_types, True) elif type_ == self.HTML: - self.to_html(contact, self.message_types,True) + self.to_html(contact, self.message_types, True) def batch_finish_one(self, num): - self.nowContact.emit(self.contact[self.batch_num//len(self.sub_type)].remark) + self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark) self.batch_num += 1 if self.batch_num == self.batch_num_total: self.okSignal.emit(1) + def merge_docx(self, n): + conRemark = self.contact.remark + origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}" + filename = f"{origin_docx_path}/{conRemark}_{n}.docx" + if n == 10086: + # self.document.append(self.document) + file = os.path.join(origin_docx_path, f'{conRemark}.docx') + try: + self.document.save(file) + except PermissionError: + file = file[:-5] + f'{time.time()}' + '.docx' + self.document.save(file) + self.okSignal.emit(1) + return + doc = docx.Document(filename) + self.document.append(doc) + os.remove(filename) + if n % 50 == 0: + # self.document.append(self.document) + file = os.path.join(origin_docx_path, f'{conRemark}-{n//50}.docx') + try: + self.document.save(file) + except PermissionError: + file = file[:-5] + f'{time.time()}' + '.docx' + self.document.save(file) + doc = docx.Document() + doc.styles["Normal"].font.name = "Cambria" + doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") + self.document = Composer(doc) + def to_docx(self, contact, message_types, is_batch=False): - Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types,time_range=self.time_range) + doc = docx.Document() + doc.styles["Normal"].font.name = "Cambria" + doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") + self.document = Composer(doc) + Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types, time_range=self.time_range) self.children.append(Child) Child.progressSignal.connect(self.progress) if not is_batch: Child.rangeSignal.connect(self.rangeSignal) - Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one) + Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one) Child.start() def to_txt(self, contact, message_types, is_batch=False): - Child = TxtExporter(contact, type_=self.TXT, message_types=message_types,time_range=self.time_range) + Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range) self.children.append(Child) Child.progressSignal.connect(self.progress) if not is_batch: @@ -180,7 +218,7 @@ class Output(QObject): Child.start() def to_html(self, contact, message_types, is_batch=False): - Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types,time_range=self.time_range) + Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types, time_range=self.time_range) self.children.append(Child) Child.progressSignal.connect(self.progress) if not is_batch: @@ -191,7 +229,7 @@ class Output(QObject): if message_types.get(34): # 语音消息单独的线程 self.total_num += 1 - output_media = OutputMedia(contact,time_range=self.time_range) + output_media = OutputMedia(contact, time_range=self.time_range) self.children.append(output_media) output_media.okSingal.connect(self.count_finish_num) output_media.progressSignal.connect(self.progressSignal) @@ -199,7 +237,7 @@ class Output(QObject): if message_types.get(47): # emoji消息单独的线程 self.total_num += 1 - output_emoji = OutputEmoji(contact,time_range=self.time_range) + output_emoji = OutputEmoji(contact, time_range=self.time_range) self.children.append(output_emoji) output_emoji.okSingal.connect(self.count_finish_num) output_emoji.progressSignal.connect(self.progressSignal) @@ -207,14 +245,14 @@ class Output(QObject): if message_types.get(3): # 图片消息单独的线程 self.total_num += 1 - output_image = OutputImage(contact,time_range=self.time_range) + output_image = OutputImage(contact, time_range=self.time_range) self.children.append(output_image) output_image.okSingal.connect(self.count_finish_num) output_image.progressSignal.connect(self.progressSignal) output_image.start() def to_csv(self, contact, message_types, is_batch=False): - Child = CSVExporter(contact, type_=self.CSV, message_types=message_types,time_range=self.time_range) + Child = CSVExporter(contact, type_=self.CSV, message_types=message_types, time_range=self.time_range) self.children.append(Child) Child.progressSignal.connect(self.progress) if not is_batch: @@ -222,7 +260,7 @@ class Output(QObject): Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one) Child.start() - def start(self): + def run(self): if self.output_type == self.DOCX: self.to_docx(self.contact, self.message_types) elif self.output_type == self.CSV_ALL: @@ -264,14 +302,14 @@ class OutputMedia(QThread): okSingal = pyqtSignal(int) progressSignal = pyqtSignal(int) - def __init__(self, contact,time_range=None): + def __init__(self, contact, time_range=None): super().__init__() self.contact = contact self.time_range = time_range def run(self): origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}" - messages = msg_db.get_messages_by_type(self.contact.wxid, 34,time_range=self.time_range) + messages = msg_db.get_messages_by_type(self.contact.wxid, 34, time_range=self.time_range) for message in messages: is_send = message[4] msgSvrId = message[9] @@ -291,14 +329,14 @@ class OutputEmoji(QThread): okSingal = pyqtSignal(int) progressSignal = pyqtSignal(int) - def __init__(self, contact,time_range=None): + def __init__(self, contact, time_range=None): super().__init__() self.contact = contact self.time_range = time_range def run(self): origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}" - messages = msg_db.get_messages_by_type(self.contact.wxid, 47,time_range=self.time_range) + messages = msg_db.get_messages_by_type(self.contact.wxid, 47, time_range=self.time_range) for message in messages: str_content = message[7] try: @@ -318,11 +356,11 @@ class OutputImage(QThread): okSingal = pyqtSignal(int) progressSignal = pyqtSignal(int) - def __init__(self, contact,time_range): + def __init__(self, contact, time_range): super().__init__() self.contact = contact self.child_thread_num = 2 - self.time_range =time_range + self.time_range = time_range self.child_threads = [0] * (self.child_thread_num + 1) self.num = 0 @@ -335,7 +373,7 @@ class OutputImage(QThread): def run(self): origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}" - messages = msg_db.get_messages_by_type(self.contact.wxid, 3,time_range=self.time_range) + messages = msg_db.get_messages_by_type(self.contact.wxid, 3, time_range=self.time_range) for message in messages: str_content = message[7] BytesExtra = message[10] @@ -363,7 +401,7 @@ class OutputImageChild(QThread): okSingal = pyqtSignal(int) progressSignal = pyqtSignal(int) - def __init__(self, contact, messages,time_range): + def __init__(self, contact, messages, time_range): super().__init__() self.contact = contact self.messages = messages