更改docx导出逻辑，加快速度减少内存

2025-05-20 22:58:39 +08:00 · 2024-01-19 21:44:11 +08:00 · 2024-01-19 21:44:11 +08:00 · 9a37fc4aa2
commit 9a37fc4aa2
parent 8a0933cfb6
3 changed files with 74 additions and 131 deletions
--- a/app/DataBase/exporter_docx.py
+++ b/app/DataBase/exporter_docx.py
@@ -40,84 +40,6 @@ def filter_control_characters(input_string):


 class DocxExporter(ExporterBase):
-    def merge_docx(self, n):
-        self.process_num += 1
-        conRemark = self.contact.remark
-        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}"
-        filename = f"{origin_docx_path}/{conRemark}_{n}.docx"
-        # print(all_file_path)
-        doc = docx.Document(filename)
-        if self.merged_doc_index == [-1, -1]:
-            self.document.append(doc)
-            self.merged_doc_index = [n, n]
-        else:
-            if n == self.merged_doc_index[0] - 1:
-                self.document.insert(0, doc)
-                self.merged_doc_index[0] -= 1
-            elif n == self.merged_doc_index[1] + 1:
-                self.document.append(doc)
-                self.merged_doc_index[1] += 1
-            else:
-                self.docs.append([doc, n])
-                self.docs_set.add(n)
-            new_docx = []
-            new_set = set()
-            # print(self.docs)
-            while new_set!=self.docs_set:
-                self.docs.sort(key=lambda x: x[1])
-                for doc_, index in self.docs:
-                    if index == self.merged_doc_index[0] - 1:
-                        self.document.insert(0, doc_)
-                        self.merged_doc_index[0] -= 1
-                    elif index == self.merged_doc_index[1] + 1:
-                        self.document.append(doc_)
-                        self.merged_doc_index[1] += 1
-                    else:
-                        new_docx.append([doc_, index])
-                        new_set.add(index)
-            self.docs = new_docx
-            self.docs_set = new_set
-        os.remove(filename)
-        if self.process_num == self.child_thread_num:
-            # self.document.append(self.document)
-            file = os.path.join(origin_docx_path, f'{conRemark}.docx')
-            try:
-                self.document.save(file)
-            except PermissionError:
-                file = file[:-5] + f'{time.time()}' + '.docx'
-                self.document.save(file)
-            self.okSignal.emit(1)
-
-    def export(self):
-        self.child_threads = []
-        messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
-        # 计算每个子列表的长度
-        num = 1
-        # num = len(messages) // 500 +1
-        sublist_length = len(messages) // num
-
-        # 使用列表切片将列表分成n个子列表
-        divided_list = [messages[i:i + sublist_length] for i in range(0, len(messages), sublist_length)]
-        self.child_thread_num = len(divided_list)
-        self.process_num = 0
-        doc = docx.Document()
-        doc.styles["Normal"].font.name = "Cambria"
-        doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
-        self.document = Composer(doc)
-        self.merged_doc_index = [-1, -1]
-        self.docs = []
-        self.docs_set = set()
-        # self.document.append(self.document)
-        for i in range(self.child_thread_num):
-            child_thread = DocxExporterChildThread(self.contact, type_=self.DOCX, message_types=self.message_types,
-                                                   time_range=self.time_range, messages=divided_list[i], index=i)
-            self.child_threads.append(child_thread)
-            child_thread.okSignal.connect(self.merge_docx)
-            child_thread.progressSignal.connect(self.progressSignal)
-            child_thread.start()
-
-
-class DocxExporterChildThread(ExporterBase):
    def text(self, doc, message):
        type_ = message[2]
        str_content = message[7]
@@ -139,11 +61,11 @@ class DocxExporterChildThread(ExporterBase):
                logger.error(f'非法字符:{str_content}')
                content_cell.paragraphs[0].add_run('非法字符')
        content_cell.paragraphs[0].font_size = shared.Inches(0.5)
+        # doc.add_picture(avatar)
        if is_send:
            p = content_cell.paragraphs[0]
            p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
        doc.add_paragraph()
-
    def image(self, doc, message):
        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
        type_ = message[2]
@@ -392,7 +314,8 @@ class DocxExporterChildThread(ExporterBase):
    def export(self):
        print(f"【开始导出 DOCX {self.contact.remark}】")
        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
-        messages = self.messages
+        # messages = self.messages
+        messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
        Me().save_avatar(os.path.join(f"{origin_docx_path}/avatar/{Me().wxid}.png"))
        if self.contact.is_chatroom:
            for message in messages:
@@ -407,29 +330,21 @@ class DocxExporterChildThread(ExporterBase):
        else:
            self.contact.save_avatar(os.path.join(f"{origin_docx_path}/avatar/{self.contact.wxid}.png"))
        self.rangeSignal.emit(len(messages))
-
-        index = 0
-
        def newdoc():
            nonlocal n, doc
            doc = docx.Document()
            doc.styles["Normal"].font.name = "Cambria"
            doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
-            docs.append(doc)
            n += 1
-
        doc = None
-        docs = []
        n = 0
        index = 0
        newdoc()
-        # document = docx.Document()
-        # doc = document.add_paragraph()
        for index, message in enumerate(messages):
            if index % 200 == 0 and index:
-                # doc = document.add_paragraph()
-                # filename = os.path.join(origin_docx_path, f"{self.contact.remark}{n}.docx")
-                # doc.save(filename)
+                filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx")
+                doc.save(filename)
+                self.okSignal.emit(n)
                newdoc()

            type_ = message[2]
@@ -459,20 +374,7 @@ class DocxExporterChildThread(ExporterBase):
                print(f"【导出 DOCX {self.contact.remark}】{index}/{len(messages)}")
        if index % 25:
            print(f"【导出 DOCX {self.contact.remark}】{index + 1}/{len(messages)}")
-        filename = os.path.join(origin_docx_path, f"{self.contact.remark}.docx")
-        doc = docx.Document()
-        doc.styles["Normal"].font.name = "Cambria"
-        doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
-        # doc = Composer(doc)
-        # for index, dx in enumerate(docs):
-        #     print(f"【MERGE Export DOCX {self.contact.remark}】{index}/{len(docs)}")
-        #     doc.append(dx)
-        # print(f"【MERGE Export DOCX {self.contact.remark}】{len(docs)}")
-        doc = Composer(doc)  # 针对11188条消息（56组）所测，反排比正排更快，正排65s，反排54s
-        for index, dx in enumerate(docs[::-1]):
-            print(f"【合并 DOCX {self.contact.remark}】{index + 1}/{len(docs)}")
-            doc.insert(0, dx)
-        filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{self.index}.docx")
+        filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx")
        try:
            # document.save(filename)
            doc.save(filename)
@@ -480,5 +382,6 @@ class DocxExporterChildThread(ExporterBase):
            filename = filename[:-5] + f'{time.time()}' + '.docx'
            # document.save(filename)
            doc.save(filename)
+        self.okSignal.emit(n)
        print(f"【完成导出 DOCX {self.contact.remark}】")
-        self.okSignal.emit(self.index)
+        self.okSignal.emit(10086)
--- a/app/DataBase/output.py
+++ b/app/DataBase/output.py
@@ -61,6 +61,8 @@ def makedirs(path):


 def escape_js_and_html(input_str):
+    if not input_str:
+        return ''
    # 转义HTML特殊字符
    html_escaped = html.escape(input_str, quote=False)

--- a/app/DataBase/output_pc.py
+++ b/app/DataBase/output_pc.py
@@ -1,10 +1,14 @@
 import csv
 import os
+import time
 import traceback
 from typing import List

+import docx
 from PyQt5.QtCore import pyqtSignal, QThread, QObject
 from PyQt5.QtWidgets import QFileDialog
+from docx.oxml.ns import qn
+from docxcompose.composer import Composer

 from app.DataBase.exporter_csv import CSVExporter
 from app.DataBase.exporter_docx import DocxExporter
@@ -20,7 +24,7 @@ from ..util.image import get_image
 os.makedirs('./data/聊天记录', exist_ok=True)


-class Output(QObject):
+class Output(QThread):
    """
    发送信息线程
    """
@@ -39,7 +43,7 @@ class Output(QObject):
    TXT = 5
    Batch = 10086

-    def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None,parent=None):
+    def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None):
        super().__init__(parent)
        self.children = []
        self.last_timestamp = 0
@@ -138,7 +142,7 @@ class Output(QObject):
        print(self.sub_type, self.message_types)
        print(len(self.contact))
        print([contact.remark for contact in self.contact])
-        self.batch_num_total = len(self.contact)*len(self.sub_type)
+        self.batch_num_total = len(self.contact) * len(self.sub_type)
        self.batch_num = 0
        self.rangeSignal.emit(self.batch_num_total)
        for contact in self.contact:
@@ -146,32 +150,66 @@ class Output(QObject):
            for type_ in self.sub_type:
                # print('导出类型', type_)
                if type_ == self.DOCX:
-                    self.to_docx(contact, self.message_types,True)
+                    self.to_docx(contact, self.message_types, True)
                elif type_ == self.TXT:
                    # print('批量导出txt')
-                    self.to_txt(contact, self.message_types,True)
+                    self.to_txt(contact, self.message_types, True)
                elif type_ == self.CSV:
-                    self.to_csv(contact, self.message_types,True)
+                    self.to_csv(contact, self.message_types, True)
                elif type_ == self.HTML:
-                    self.to_html(contact, self.message_types,True)
+                    self.to_html(contact, self.message_types, True)

    def batch_finish_one(self, num):
-        self.nowContact.emit(self.contact[self.batch_num//len(self.sub_type)].remark)
+        self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark)
        self.batch_num += 1
        if self.batch_num == self.batch_num_total:
            self.okSignal.emit(1)

+    def merge_docx(self, n):
+        conRemark = self.contact.remark
+        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}"
+        filename = f"{origin_docx_path}/{conRemark}_{n}.docx"
+        if n == 10086:
+            # self.document.append(self.document)
+            file = os.path.join(origin_docx_path, f'{conRemark}.docx')
+            try:
+                self.document.save(file)
+            except PermissionError:
+                file = file[:-5] + f'{time.time()}' + '.docx'
+                self.document.save(file)
+            self.okSignal.emit(1)
+            return
+        doc = docx.Document(filename)
+        self.document.append(doc)
+        os.remove(filename)
+        if n % 50 == 0:
+            # self.document.append(self.document)
+            file = os.path.join(origin_docx_path, f'{conRemark}-{n//50}.docx')
+            try:
+                self.document.save(file)
+            except PermissionError:
+                file = file[:-5] + f'{time.time()}' + '.docx'
+                self.document.save(file)
+            doc = docx.Document()
+            doc.styles["Normal"].font.name = "Cambria"
+            doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
+            self.document = Composer(doc)
+
    def to_docx(self, contact, message_types, is_batch=False):
-        Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types,time_range=self.time_range)
+        doc = docx.Document()
+        doc.styles["Normal"].font.name = "Cambria"
+        doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
+        self.document = Composer(doc)
+        Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types, time_range=self.time_range)
        self.children.append(Child)
        Child.progressSignal.connect(self.progress)
        if not is_batch:
            Child.rangeSignal.connect(self.rangeSignal)
-        Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
+        Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one)
        Child.start()

    def to_txt(self, contact, message_types, is_batch=False):
-        Child = TxtExporter(contact, type_=self.TXT, message_types=message_types,time_range=self.time_range)
+        Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range)
        self.children.append(Child)
        Child.progressSignal.connect(self.progress)
        if not is_batch:
@@ -180,7 +218,7 @@ class Output(QObject):
        Child.start()

    def to_html(self, contact, message_types, is_batch=False):
-        Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types,time_range=self.time_range)
+        Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types, time_range=self.time_range)
        self.children.append(Child)
        Child.progressSignal.connect(self.progress)
        if not is_batch:
@@ -191,7 +229,7 @@ class Output(QObject):
        if message_types.get(34):
            # 语音消息单独的线程
            self.total_num += 1
-            output_media = OutputMedia(contact,time_range=self.time_range)
+            output_media = OutputMedia(contact, time_range=self.time_range)
            self.children.append(output_media)
            output_media.okSingal.connect(self.count_finish_num)
            output_media.progressSignal.connect(self.progressSignal)
@@ -199,7 +237,7 @@ class Output(QObject):
        if message_types.get(47):
            # emoji消息单独的线程
            self.total_num += 1
-            output_emoji = OutputEmoji(contact,time_range=self.time_range)
+            output_emoji = OutputEmoji(contact, time_range=self.time_range)
            self.children.append(output_emoji)
            output_emoji.okSingal.connect(self.count_finish_num)
            output_emoji.progressSignal.connect(self.progressSignal)
@@ -207,14 +245,14 @@ class Output(QObject):
        if message_types.get(3):
            # 图片消息单独的线程
            self.total_num += 1
-            output_image = OutputImage(contact,time_range=self.time_range)
+            output_image = OutputImage(contact, time_range=self.time_range)
            self.children.append(output_image)
            output_image.okSingal.connect(self.count_finish_num)
            output_image.progressSignal.connect(self.progressSignal)
            output_image.start()

    def to_csv(self, contact, message_types, is_batch=False):
-        Child = CSVExporter(contact, type_=self.CSV, message_types=message_types,time_range=self.time_range)
+        Child = CSVExporter(contact, type_=self.CSV, message_types=message_types, time_range=self.time_range)
        self.children.append(Child)
        Child.progressSignal.connect(self.progress)
        if not is_batch:
@@ -222,7 +260,7 @@ class Output(QObject):
        Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
        Child.start()

-    def start(self):
+    def run(self):
        if self.output_type == self.DOCX:
            self.to_docx(self.contact, self.message_types)
        elif self.output_type == self.CSV_ALL:
@@ -264,14 +302,14 @@ class OutputMedia(QThread):
    okSingal = pyqtSignal(int)
    progressSignal = pyqtSignal(int)

-    def __init__(self, contact,time_range=None):
+    def __init__(self, contact, time_range=None):
        super().__init__()
        self.contact = contact
        self.time_range = time_range

    def run(self):
        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
-        messages = msg_db.get_messages_by_type(self.contact.wxid, 34,time_range=self.time_range)
+        messages = msg_db.get_messages_by_type(self.contact.wxid, 34, time_range=self.time_range)
        for message in messages:
            is_send = message[4]
            msgSvrId = message[9]
@@ -291,14 +329,14 @@ class OutputEmoji(QThread):
    okSingal = pyqtSignal(int)
    progressSignal = pyqtSignal(int)

-    def __init__(self, contact,time_range=None):
+    def __init__(self, contact, time_range=None):
        super().__init__()
        self.contact = contact
        self.time_range = time_range

    def run(self):
        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
-        messages = msg_db.get_messages_by_type(self.contact.wxid, 47,time_range=self.time_range)
+        messages = msg_db.get_messages_by_type(self.contact.wxid, 47, time_range=self.time_range)
        for message in messages:
            str_content = message[7]
            try:
@@ -318,11 +356,11 @@ class OutputImage(QThread):
    okSingal = pyqtSignal(int)
    progressSignal = pyqtSignal(int)

-    def __init__(self, contact,time_range):
+    def __init__(self, contact, time_range):
        super().__init__()
        self.contact = contact
        self.child_thread_num = 2
-        self.time_range =time_range
+        self.time_range = time_range
        self.child_threads = [0] * (self.child_thread_num + 1)
        self.num = 0

@@ -335,7 +373,7 @@ class OutputImage(QThread):

    def run(self):
        origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
-        messages = msg_db.get_messages_by_type(self.contact.wxid, 3,time_range=self.time_range)
+        messages = msg_db.get_messages_by_type(self.contact.wxid, 3, time_range=self.time_range)
        for message in messages:
            str_content = message[7]
            BytesExtra = message[10]
@@ -363,7 +401,7 @@ class OutputImageChild(QThread):
    okSingal = pyqtSignal(int)
    progressSignal = pyqtSignal(int)

-    def __init__(self, contact, messages,time_range):
+    def __init__(self, contact, messages, time_range):
        super().__init__()
        self.contact = contact
        self.messages = messages