更改docx导出逻辑,加快速度减少内存

This commit is contained in:
shuaikangzhou 2024-01-19 21:44:11 +08:00
parent 8a0933cfb6
commit 9a37fc4aa2
3 changed files with 74 additions and 131 deletions

View File

@ -40,84 +40,6 @@ def filter_control_characters(input_string):
class DocxExporter(ExporterBase):
# Slot for a child thread's okSignal: folds chunk file number ``n`` into the
# composed document, keeping chunks in index order, and writes the final
# file once every child has reported.
# NOTE(review): indentation is missing in this view, so the exact nesting of
# the ``while`` loop and of the statements after it cannot be confirmed here.
def merge_docx(self, n):
# Count this report; compared against child_thread_num further down.
self.process_num += 1
conRemark = self.contact.remark
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}"
# Chunk file this call consumes: "<remark>_<n>.docx" in the contact's folder.
filename = f"{origin_docx_path}/{conRemark}_{n}.docx"
# print(all_file_path)
doc = docx.Document(filename)
# merged_doc_index is [lowest, highest] chunk index merged so far;
# [-1, -1] means nothing has been merged yet.
if self.merged_doc_index == [-1, -1]:
self.document.append(doc)
self.merged_doc_index = [n, n]
else:
# Chunk directly below the merged range: put it at the front.
if n == self.merged_doc_index[0] - 1:
self.document.insert(0, doc)
self.merged_doc_index[0] -= 1
# Chunk directly above the merged range: put it at the end.
elif n == self.merged_doc_index[1] + 1:
self.document.append(doc)
self.merged_doc_index[1] += 1
else:
# Not adjacent to the merged range yet: park it in self.docs / self.docs_set.
self.docs.append([doc, n])
self.docs_set.add(n)
new_docx = []
new_set = set()
# print(self.docs)
# Re-scan the parked chunks in index order, merging any that have become
# adjacent; the rest are collected into new_docx / new_set, which then
# replace self.docs / self.docs_set.
while new_set!=self.docs_set:
self.docs.sort(key=lambda x: x[1])
for doc_, index in self.docs:
if index == self.merged_doc_index[0] - 1:
self.document.insert(0, doc_)
self.merged_doc_index[0] -= 1
elif index == self.merged_doc_index[1] + 1:
self.document.append(doc_)
self.merged_doc_index[1] += 1
else:
new_docx.append([doc_, index])
new_set.add(index)
self.docs = new_docx
self.docs_set = new_set
# The chunk file has been loaded into memory, so delete it from disk.
os.remove(filename)
# All children have reported: write "<remark>.docx" and signal completion.
if self.process_num == self.child_thread_num:
# self.document.append(self.document)
file = os.path.join(origin_docx_path, f'{conRemark}.docx')
try:
self.document.save(file)
except PermissionError:
# Target could not be written: retry under a name with the current
# time.time() value inserted before the ".docx" suffix.
file = file[:-5] + f'{time.time()}' + '.docx'
self.document.save(file)
self.okSignal.emit(1)
def export(self):
    """Fan the contact's messages out to DOCX child threads, one per chunk.

    Fetches the messages of ``self.contact`` within ``self.time_range``,
    slices them into ``num`` chunks, resets the merge state that
    ``merge_docx`` reads (``document``, ``merged_doc_index``, ``docs``,
    ``docs_set``, ``process_num``) and starts one
    ``DocxExporterChildThread`` per chunk.  Each child's ``okSignal`` is
    connected to ``merge_docx`` and its ``progressSignal`` is forwarded to
    this exporter's ``progressSignal``.

    If there are no messages, no child is started and ``okSignal`` is
    emitted with 1 immediately, so a listener is not left waiting for a
    completion that would never arrive.
    """
    self.child_threads = []
    messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
    # Number of chunks to split the messages into.
    num = 1
    # Clamp to 1: an empty message list would otherwise give a step of 0 and
    # range() raises ValueError for a zero step.
    sublist_length = max(1, len(messages) // num)
    # Slice the message list into consecutive chunks of sublist_length items.
    divided_list = [
        messages[start:start + sublist_length]
        for start in range(0, len(messages), sublist_length)
    ]
    self.child_thread_num = len(divided_list)
    self.process_num = 0

    # Base document that the chunk documents are composed into.
    doc = docx.Document()
    doc.styles["Normal"].font.name = "Cambria"
    doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
    self.document = Composer(doc)
    # [-1, -1] marks "no chunk merged yet" for merge_docx.
    self.merged_doc_index = [-1, -1]
    self.docs = []
    self.docs_set = set()

    if not divided_list:
        # Nothing to export: report completion the same way merge_docx does.
        self.okSignal.emit(1)
        return

    for i, chunk in enumerate(divided_list):
        child_thread = DocxExporterChildThread(self.contact, type_=self.DOCX, message_types=self.message_types,
                                               time_range=self.time_range, messages=chunk, index=i)
        self.child_threads.append(child_thread)
        child_thread.okSignal.connect(self.merge_docx)
        child_thread.progressSignal.connect(self.progressSignal)
        child_thread.start()
class DocxExporterChildThread(ExporterBase):
def text(self, doc, message):
type_ = message[2]
str_content = message[7]
@ -139,11 +61,11 @@ class DocxExporterChildThread(ExporterBase):
logger.error(f'非法字符:{str_content}')
content_cell.paragraphs[0].add_run('非法字符')
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
# doc.add_picture(avatar)
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def image(self, doc, message):
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
type_ = message[2]
@ -392,7 +314,8 @@ class DocxExporterChildThread(ExporterBase):
def export(self):
print(f"【开始导出 DOCX {self.contact.remark}")
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
messages = self.messages
# messages = self.messages
messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
Me().save_avatar(os.path.join(f"{origin_docx_path}/avatar/{Me().wxid}.png"))
if self.contact.is_chatroom:
for message in messages:
@ -407,29 +330,21 @@ class DocxExporterChildThread(ExporterBase):
else:
self.contact.save_avatar(os.path.join(f"{origin_docx_path}/avatar/{self.contact.wxid}.png"))
self.rangeSignal.emit(len(messages))
index = 0
def newdoc():
nonlocal n, doc
doc = docx.Document()
doc.styles["Normal"].font.name = "Cambria"
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
docs.append(doc)
n += 1
doc = None
docs = []
n = 0
index = 0
newdoc()
# document = docx.Document()
# doc = document.add_paragraph()
for index, message in enumerate(messages):
if index % 200 == 0 and index:
# doc = document.add_paragraph()
# filename = os.path.join(origin_docx_path, f"{self.contact.remark}{n}.docx")
# doc.save(filename)
filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx")
doc.save(filename)
self.okSignal.emit(n)
newdoc()
type_ = message[2]
@ -459,20 +374,7 @@ class DocxExporterChildThread(ExporterBase):
print(f"【导出 DOCX {self.contact.remark}{index}/{len(messages)}")
if index % 25:
print(f"【导出 DOCX {self.contact.remark}{index + 1}/{len(messages)}")
filename = os.path.join(origin_docx_path, f"{self.contact.remark}.docx")
doc = docx.Document()
doc.styles["Normal"].font.name = "Cambria"
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
# doc = Composer(doc)
# for index, dx in enumerate(docs):
# print(f"【MERGE Export DOCX {self.contact.remark}】{index}/{len(docs)}")
# doc.append(dx)
# print(f"【MERGE Export DOCX {self.contact.remark}】{len(docs)}")
doc = Composer(doc) # 针对11188条消息56组所测反排比正排更快正排65s反排54s
for index, dx in enumerate(docs[::-1]):
print(f"【合并 DOCX {self.contact.remark}{index + 1}/{len(docs)}")
doc.insert(0, dx)
filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{self.index}.docx")
filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx")
try:
# document.save(filename)
doc.save(filename)
@ -480,5 +382,6 @@ class DocxExporterChildThread(ExporterBase):
filename = filename[:-5] + f'{time.time()}' + '.docx'
# document.save(filename)
doc.save(filename)
self.okSignal.emit(n)
print(f"【完成导出 DOCX {self.contact.remark}")
self.okSignal.emit(self.index)
self.okSignal.emit(10086)

View File

@ -61,6 +61,8 @@ def makedirs(path):
def escape_js_and_html(input_str):
if not input_str:
return ''
# 转义HTML特殊字符
html_escaped = html.escape(input_str, quote=False)

View File

@ -1,10 +1,14 @@
import csv
import os
import time
import traceback
from typing import List
import docx
from PyQt5.QtCore import pyqtSignal, QThread, QObject
from PyQt5.QtWidgets import QFileDialog
from docx.oxml.ns import qn
from docxcompose.composer import Composer
from app.DataBase.exporter_csv import CSVExporter
from app.DataBase.exporter_docx import DocxExporter
@ -20,7 +24,7 @@ from ..util.image import get_image
os.makedirs('./data/聊天记录', exist_ok=True)
class Output(QObject):
class Output(QThread):
"""
发送信息线程
"""
@ -39,7 +43,7 @@ class Output(QObject):
TXT = 5
Batch = 10086
def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None,parent=None):
def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None):
super().__init__(parent)
self.children = []
self.last_timestamp = 0
@ -138,7 +142,7 @@ class Output(QObject):
print(self.sub_type, self.message_types)
print(len(self.contact))
print([contact.remark for contact in self.contact])
self.batch_num_total = len(self.contact)*len(self.sub_type)
self.batch_num_total = len(self.contact) * len(self.sub_type)
self.batch_num = 0
self.rangeSignal.emit(self.batch_num_total)
for contact in self.contact:
@ -146,32 +150,66 @@ class Output(QObject):
for type_ in self.sub_type:
# print('导出类型', type_)
if type_ == self.DOCX:
self.to_docx(contact, self.message_types,True)
self.to_docx(contact, self.message_types, True)
elif type_ == self.TXT:
# print('批量导出txt')
self.to_txt(contact, self.message_types,True)
self.to_txt(contact, self.message_types, True)
elif type_ == self.CSV:
self.to_csv(contact, self.message_types,True)
self.to_csv(contact, self.message_types, True)
elif type_ == self.HTML:
self.to_html(contact, self.message_types,True)
self.to_html(contact, self.message_types, True)
# Slot called when one batch job reports completion: publishes the current
# contact's remark, advances the finished-job counter and emits okSignal(1)
# once the counter reaches batch_num_total. The ``num`` argument is not used.
def batch_finish_one(self, num):
# NOTE(review): the next two lines are the same statement before and after
# reformatting (they differ only in the spaces around ``//``) - this view is
# a diff with its +/- markers stripped, so only one belongs in the real file.
# The index batch_num // len(sub_type) selects the contact for this job.
self.nowContact.emit(self.contact[self.batch_num//len(self.sub_type)].remark)
self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark)
self.batch_num += 1
# Every job of the batch has reported: signal overall completion.
if self.batch_num == self.batch_num_total:
self.okSignal.emit(1)
def merge_docx(self, n):
    """Fold chunk file ``n`` into the composed document, or finish the export.

    ``n == 10086`` is the "export finished" marker: the composed document
    is saved as ``<remark>.docx`` and ``okSignal`` is emitted with 1.
    Any other ``n`` names the chunk file ``<remark>_<n>.docx``, which is
    appended to the composed document and then deleted.  Whenever ``n`` is
    a multiple of 50 the composed document is saved as
    ``<remark>-<n // 50>.docx`` and replaced by a fresh empty one.
    """
    remark = self.contact.remark
    out_dir = f"{os.path.abspath('.')}/data/聊天记录/{remark}"

    def save_composed(target):
        # Save the composed document; if the target cannot be written,
        # retry under a name with time.time() inserted before ".docx".
        try:
            self.document.save(target)
        except PermissionError:
            target = target[:-5] + f'{time.time()}' + '.docx'
            self.document.save(target)

    if n == 10086:
        save_composed(os.path.join(out_dir, f'{remark}.docx'))
        self.okSignal.emit(1)
        return

    chunk_path = f"{out_dir}/{remark}_{n}.docx"
    self.document.append(docx.Document(chunk_path))
    # The chunk has been loaded and appended, so its file can go.
    os.remove(chunk_path)

    if n % 50 != 0:
        return

    # Every 50th chunk: write out what has been composed so far and start
    # over with an empty document carrying the same default fonts.
    save_composed(os.path.join(out_dir, f'{remark}-{n//50}.docx'))
    fresh = docx.Document()
    fresh.styles["Normal"].font.name = "Cambria"
    fresh.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
    self.document = Composer(fresh)
# Starts a DOCX export for ``contact``: creates a DocxExporter, wires its
# signals to this object and starts it. ``is_batch`` selects which slot
# receives the exporter's okSignal.
# NOTE(review): this view is a diff with its +/- markers stripped, so some
# statements below appear twice (before and after the change); only one of
# each pair belongs in the real file.
def to_docx(self, contact, message_types, is_batch=False):
# Pre-change form of the exporter construction (no space after the comma).
Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types,time_range=self.time_range)
# Fresh base document (Normal style: Cambria, east-Asian font 宋体) wrapped
# in a Composer and stored on self.document for merge_docx to append to.
doc = docx.Document()
doc.styles["Normal"].font.name = "Cambria"
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
self.document = Composer(doc)
# Post-change form of the exporter construction.
Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types, time_range=self.time_range)
# Keep the exporter in self.children.
self.children.append(Child)
Child.progressSignal.connect(self.progress)
# The exporter's rangeSignal is forwarded only for a single (non-batch) export.
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
# Pre-change wiring: okSignal is forwarded straight to self.okSignal for a
# single export.
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
# Post-change wiring: a single export routes okSignal to merge_docx instead.
# NOTE(review): in batch mode every okSignal emission still goes to
# batch_finish_one and merge_docx is not connected - confirm that batch
# exports do not rely on merge_docx.
Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one)
Child.start()
def to_txt(self, contact, message_types, is_batch=False):
Child = TxtExporter(contact, type_=self.TXT, message_types=message_types,time_range=self.time_range)
Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
@ -180,7 +218,7 @@ class Output(QObject):
Child.start()
def to_html(self, contact, message_types, is_batch=False):
Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types,time_range=self.time_range)
Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
@ -191,7 +229,7 @@ class Output(QObject):
if message_types.get(34):
# 语音消息单独的线程
self.total_num += 1
output_media = OutputMedia(contact,time_range=self.time_range)
output_media = OutputMedia(contact, time_range=self.time_range)
self.children.append(output_media)
output_media.okSingal.connect(self.count_finish_num)
output_media.progressSignal.connect(self.progressSignal)
@ -199,7 +237,7 @@ class Output(QObject):
if message_types.get(47):
# emoji消息单独的线程
self.total_num += 1
output_emoji = OutputEmoji(contact,time_range=self.time_range)
output_emoji = OutputEmoji(contact, time_range=self.time_range)
self.children.append(output_emoji)
output_emoji.okSingal.connect(self.count_finish_num)
output_emoji.progressSignal.connect(self.progressSignal)
@ -207,14 +245,14 @@ class Output(QObject):
if message_types.get(3):
# 图片消息单独的线程
self.total_num += 1
output_image = OutputImage(contact,time_range=self.time_range)
output_image = OutputImage(contact, time_range=self.time_range)
self.children.append(output_image)
output_image.okSingal.connect(self.count_finish_num)
output_image.progressSignal.connect(self.progressSignal)
output_image.start()
def to_csv(self, contact, message_types, is_batch=False):
Child = CSVExporter(contact, type_=self.CSV, message_types=message_types,time_range=self.time_range)
Child = CSVExporter(contact, type_=self.CSV, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
@ -222,7 +260,7 @@ class Output(QObject):
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
Child.start()
def start(self):
def run(self):
if self.output_type == self.DOCX:
self.to_docx(self.contact, self.message_types)
elif self.output_type == self.CSV_ALL:
@ -264,14 +302,14 @@ class OutputMedia(QThread):
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact,time_range=None):
def __init__(self, contact, time_range=None):
super().__init__()
self.contact = contact
self.time_range = time_range
def run(self):
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
messages = msg_db.get_messages_by_type(self.contact.wxid, 34,time_range=self.time_range)
messages = msg_db.get_messages_by_type(self.contact.wxid, 34, time_range=self.time_range)
for message in messages:
is_send = message[4]
msgSvrId = message[9]
@ -291,14 +329,14 @@ class OutputEmoji(QThread):
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact,time_range=None):
def __init__(self, contact, time_range=None):
super().__init__()
self.contact = contact
self.time_range = time_range
def run(self):
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
messages = msg_db.get_messages_by_type(self.contact.wxid, 47,time_range=self.time_range)
messages = msg_db.get_messages_by_type(self.contact.wxid, 47, time_range=self.time_range)
for message in messages:
str_content = message[7]
try:
@ -318,11 +356,11 @@ class OutputImage(QThread):
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact,time_range):
def __init__(self, contact, time_range):
super().__init__()
self.contact = contact
self.child_thread_num = 2
self.time_range =time_range
self.time_range = time_range
self.child_threads = [0] * (self.child_thread_num + 1)
self.num = 0
@ -335,7 +373,7 @@ class OutputImage(QThread):
def run(self):
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
messages = msg_db.get_messages_by_type(self.contact.wxid, 3,time_range=self.time_range)
messages = msg_db.get_messages_by_type(self.contact.wxid, 3, time_range=self.time_range)
for message in messages:
str_content = message[7]
BytesExtra = message[10]
@ -363,7 +401,7 @@ class OutputImageChild(QThread):
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact, messages,time_range):
def __init__(self, contact, messages, time_range):
super().__init__()
self.contact = contact
self.messages = messages