mirror of
https://github.com/LC044/WeChatMsg
synced 2025-02-23 03:22:17 +08:00
更改docx导出逻辑,加快速度减少内存
This commit is contained in:
parent
8a0933cfb6
commit
9a37fc4aa2
@ -40,84 +40,6 @@ def filter_control_characters(input_string):
|
||||
|
||||
|
||||
class DocxExporter(ExporterBase):
|
||||
def merge_docx(self, n):
    """Merge chunk file ``n`` into the combined document, keeping chunks in order.

    Loads ``<remark>_<n>.docx`` from the contact's export directory and
    splices it into ``self.document``. ``self.merged_doc_index`` holds the
    ``[lowest, highest]`` chunk index merged so far (``[-1, -1]`` while
    nothing has been merged). A chunk that does not border that range is
    parked in ``self.docs`` / ``self.docs_set`` until its neighbours arrive.

    Once ``self.process_num`` reaches ``self.child_thread_num`` the combined
    document is saved as ``<remark>.docx`` and ``okSignal`` is emitted with 1.

    :param n: index of the chunk file that has just been written.
    """
    self.process_num += 1
    conRemark = self.contact.remark
    origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{conRemark}"
    filename = f"{origin_docx_path}/{conRemark}_{n}.docx"
    doc = docx.Document(filename)

    if self.merged_doc_index == [-1, -1]:
        # The first chunk to arrive seeds the merged range.
        self.document.append(doc)
        self.merged_doc_index = [n, n]
    else:
        # Park the chunk; the flush loop below merges it straight away when
        # it borders the merged range.
        self.docs.append([doc, n])
        self.docs_set.add(n)

    # Flush parked chunks until a full pass merges nothing. A single pass is
    # not enough: in ascending order, chunk lo-2 is visited before lo-1, so
    # it only becomes mergeable on the next pass. Stopping early would leave
    # it parked and missing from the final save.
    merged_any = True
    while merged_any and self.docs:
        merged_any = False
        still_pending = []
        for pending_doc, index in sorted(self.docs, key=lambda item: item[1]):
            if index == self.merged_doc_index[0] - 1:
                self.document.insert(0, pending_doc)
                self.merged_doc_index[0] -= 1
                merged_any = True
            elif index == self.merged_doc_index[1] + 1:
                self.document.append(pending_doc)
                self.merged_doc_index[1] += 1
                merged_any = True
            else:
                still_pending.append([pending_doc, index])
        self.docs = still_pending
    self.docs_set = {index for _, index in self.docs}

    # The chunk is now held in memory, so the intermediate file can go.
    os.remove(filename)

    if self.process_num == self.child_thread_num:
        file = os.path.join(origin_docx_path, f'{conRemark}.docx')
        try:
            self.document.save(file)
        except PermissionError:
            # The target cannot be written (e.g. it is open elsewhere);
            # fall back to a timestamp-suffixed file name.
            file = file[:-5] + f'{time.time()}' + '.docx'
            self.document.save(file)
        self.okSignal.emit(1)
|
||||
|
||||
def export(self):
    """Split the contact's messages into chunks and export each on a child thread.

    Fetches the messages for ``self.contact`` within ``self.time_range``,
    resets the merge bookkeeping (``self.document``, ``self.merged_doc_index``,
    ``self.docs``, ``self.docs_set``, ``self.process_num``) and starts one
    ``DocxExporterChildThread`` per chunk. Each child's ``okSignal`` is
    connected to ``self.merge_docx`` and its ``progressSignal`` is forwarded
    to ``self.progressSignal``.
    """
    self.child_threads = []
    messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)

    # Number of chunks to split into; currently a single chunk.
    num = 1
    # Length of each chunk. Clamped to at least 1 because range() rejects a
    # zero step, which is what an empty message list would otherwise produce.
    sublist_length = max(1, len(messages) // num)

    # Slice the message list into consecutive chunks.
    divided_list = [messages[i:i + sublist_length] for i in range(0, len(messages), sublist_length)]
    self.child_thread_num = len(divided_list)
    self.process_num = 0
    # NOTE(review): with no messages, no child thread is started, so
    # merge_docx never runs and okSignal is not emitted - confirm callers
    # can cope with an export that reports nothing.

    # Base document: "Cambria" as the Normal font, "宋体" for East Asian text.
    doc = docx.Document()
    doc.styles["Normal"].font.name = "Cambria"
    doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
    self.document = Composer(doc)

    # [-1, -1] marks "nothing merged yet" for merge_docx.
    self.merged_doc_index = [-1, -1]
    self.docs = []
    self.docs_set = set()

    for i, chunk in enumerate(divided_list):
        child_thread = DocxExporterChildThread(self.contact, type_=self.DOCX, message_types=self.message_types,
                                               time_range=self.time_range, messages=chunk, index=i)
        self.child_threads.append(child_thread)
        child_thread.okSignal.connect(self.merge_docx)
        child_thread.progressSignal.connect(self.progressSignal)
        child_thread.start()
|
||||
|
||||
|
||||
class DocxExporterChildThread(ExporterBase):
|
||||
def text(self, doc, message):
|
||||
type_ = message[2]
|
||||
str_content = message[7]
|
||||
@ -139,11 +61,11 @@ class DocxExporterChildThread(ExporterBase):
|
||||
logger.error(f'非法字符:{str_content}')
|
||||
content_cell.paragraphs[0].add_run('非法字符')
|
||||
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
|
||||
# doc.add_picture(avatar)
|
||||
if is_send:
|
||||
p = content_cell.paragraphs[0]
|
||||
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
|
||||
doc.add_paragraph()
|
||||
|
||||
def image(self, doc, message):
|
||||
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
|
||||
type_ = message[2]
|
||||
@ -392,7 +314,8 @@ class DocxExporterChildThread(ExporterBase):
|
||||
def export(self):
|
||||
print(f"【开始导出 DOCX {self.contact.remark}】")
|
||||
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
|
||||
messages = self.messages
|
||||
# messages = self.messages
|
||||
messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
|
||||
Me().save_avatar(os.path.join(f"{origin_docx_path}/avatar/{Me().wxid}.png"))
|
||||
if self.contact.is_chatroom:
|
||||
for message in messages:
|
||||
@ -407,29 +330,21 @@ class DocxExporterChildThread(ExporterBase):
|
||||
else:
|
||||
self.contact.save_avatar(os.path.join(f"{origin_docx_path}/avatar/{self.contact.wxid}.png"))
|
||||
self.rangeSignal.emit(len(messages))
|
||||
|
||||
index = 0
|
||||
|
||||
def newdoc():
|
||||
nonlocal n, doc
|
||||
doc = docx.Document()
|
||||
doc.styles["Normal"].font.name = "Cambria"
|
||||
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
|
||||
docs.append(doc)
|
||||
n += 1
|
||||
|
||||
doc = None
|
||||
docs = []
|
||||
n = 0
|
||||
index = 0
|
||||
newdoc()
|
||||
# document = docx.Document()
|
||||
# doc = document.add_paragraph()
|
||||
for index, message in enumerate(messages):
|
||||
if index % 200 == 0 and index:
|
||||
# doc = document.add_paragraph()
|
||||
# filename = os.path.join(origin_docx_path, f"{self.contact.remark}{n}.docx")
|
||||
# doc.save(filename)
|
||||
filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx")
|
||||
doc.save(filename)
|
||||
self.okSignal.emit(n)
|
||||
newdoc()
|
||||
|
||||
type_ = message[2]
|
||||
@ -459,20 +374,7 @@ class DocxExporterChildThread(ExporterBase):
|
||||
print(f"【导出 DOCX {self.contact.remark}】{index}/{len(messages)}")
|
||||
if index % 25:
|
||||
print(f"【导出 DOCX {self.contact.remark}】{index + 1}/{len(messages)}")
|
||||
filename = os.path.join(origin_docx_path, f"{self.contact.remark}.docx")
|
||||
doc = docx.Document()
|
||||
doc.styles["Normal"].font.name = "Cambria"
|
||||
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
|
||||
# doc = Composer(doc)
|
||||
# for index, dx in enumerate(docs):
|
||||
# print(f"【MERGE Export DOCX {self.contact.remark}】{index}/{len(docs)}")
|
||||
# doc.append(dx)
|
||||
# print(f"【MERGE Export DOCX {self.contact.remark}】{len(docs)}")
|
||||
doc = Composer(doc) # 针对11188条消息(56组)所测,反排比正排更快,正排65s,反排54s
|
||||
for index, dx in enumerate(docs[::-1]):
|
||||
print(f"【合并 DOCX {self.contact.remark}】{index + 1}/{len(docs)}")
|
||||
doc.insert(0, dx)
|
||||
filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{self.index}.docx")
|
||||
filename = os.path.join(origin_docx_path, f"{self.contact.remark}_{n}.docx")
|
||||
try:
|
||||
# document.save(filename)
|
||||
doc.save(filename)
|
||||
@ -480,5 +382,6 @@ class DocxExporterChildThread(ExporterBase):
|
||||
filename = filename[:-5] + f'{time.time()}' + '.docx'
|
||||
# document.save(filename)
|
||||
doc.save(filename)
|
||||
self.okSignal.emit(n)
|
||||
print(f"【完成导出 DOCX {self.contact.remark}】")
|
||||
self.okSignal.emit(self.index)
|
||||
self.okSignal.emit(10086)
|
||||
|
@ -61,6 +61,8 @@ def makedirs(path):
|
||||
|
||||
|
||||
def escape_js_and_html(input_str):
|
||||
if not input_str:
|
||||
return ''
|
||||
# 转义HTML特殊字符
|
||||
html_escaped = html.escape(input_str, quote=False)
|
||||
|
||||
|
@ -1,10 +1,14 @@
|
||||
import csv
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
import docx
|
||||
from PyQt5.QtCore import pyqtSignal, QThread, QObject
|
||||
from PyQt5.QtWidgets import QFileDialog
|
||||
from docx.oxml.ns import qn
|
||||
from docxcompose.composer import Composer
|
||||
|
||||
from app.DataBase.exporter_csv import CSVExporter
|
||||
from app.DataBase.exporter_docx import DocxExporter
|
||||
@ -20,7 +24,7 @@ from ..util.image import get_image
|
||||
os.makedirs('./data/聊天记录', exist_ok=True)
|
||||
|
||||
|
||||
class Output(QObject):
|
||||
class Output(QThread):
|
||||
"""
|
||||
发送信息线程
|
||||
"""
|
||||
@ -39,7 +43,7 @@ class Output(QObject):
|
||||
TXT = 5
|
||||
Batch = 10086
|
||||
|
||||
def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None,parent=None):
|
||||
def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None):
|
||||
super().__init__(parent)
|
||||
self.children = []
|
||||
self.last_timestamp = 0
|
||||
@ -138,7 +142,7 @@ class Output(QObject):
|
||||
print(self.sub_type, self.message_types)
|
||||
print(len(self.contact))
|
||||
print([contact.remark for contact in self.contact])
|
||||
self.batch_num_total = len(self.contact)*len(self.sub_type)
|
||||
self.batch_num_total = len(self.contact) * len(self.sub_type)
|
||||
self.batch_num = 0
|
||||
self.rangeSignal.emit(self.batch_num_total)
|
||||
for contact in self.contact:
|
||||
@ -146,32 +150,66 @@ class Output(QObject):
|
||||
for type_ in self.sub_type:
|
||||
# print('导出类型', type_)
|
||||
if type_ == self.DOCX:
|
||||
self.to_docx(contact, self.message_types,True)
|
||||
self.to_docx(contact, self.message_types, True)
|
||||
elif type_ == self.TXT:
|
||||
# print('批量导出txt')
|
||||
self.to_txt(contact, self.message_types,True)
|
||||
self.to_txt(contact, self.message_types, True)
|
||||
elif type_ == self.CSV:
|
||||
self.to_csv(contact, self.message_types,True)
|
||||
self.to_csv(contact, self.message_types, True)
|
||||
elif type_ == self.HTML:
|
||||
self.to_html(contact, self.message_types,True)
|
||||
self.to_html(contact, self.message_types, True)
|
||||
|
||||
def batch_finish_one(self, num):
|
||||
self.nowContact.emit(self.contact[self.batch_num//len(self.sub_type)].remark)
|
||||
self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark)
|
||||
self.batch_num += 1
|
||||
if self.batch_num == self.batch_num_total:
|
||||
self.okSignal.emit(1)
|
||||
|
||||
def merge_docx(self, n):
    """Append chunk file ``n`` to the running document, rolling over every 50 chunks.

    ``n == 10086`` is the end-of-export marker: the running document is saved
    as ``<remark>.docx``, ``okSignal`` is emitted with 1, and nothing else
    happens. For any other ``n`` the chunk ``<remark>_<n>.docx`` is appended
    to ``self.document`` and then deleted. When ``n`` is a multiple of 50 the
    running document is saved as ``<remark>-<n // 50>.docx`` and replaced by
    a fresh, empty one.

    :param n: chunk index, or 10086 to finish the export.
    """
    remark = self.contact.remark
    export_dir = f"{os.path.abspath('.')}/data/聊天记录/{remark}"

    def save_document(target):
        # Save the running document; if the target cannot be written, retry
        # under a timestamp-suffixed name.
        try:
            self.document.save(target)
        except PermissionError:
            target = target[:-5] + f'{time.time()}' + '.docx'
            self.document.save(target)

    if n == 10086:
        save_document(os.path.join(export_dir, f'{remark}.docx'))
        self.okSignal.emit(1)
        return

    chunk_path = f"{export_dir}/{remark}_{n}.docx"
    self.document.append(docx.Document(chunk_path))
    os.remove(chunk_path)

    if n % 50 != 0:
        return

    # Roll over: write out what has accumulated and start a new document.
    save_document(os.path.join(export_dir, f'{remark}-{n//50}.docx'))
    fresh = docx.Document()
    fresh.styles["Normal"].font.name = "Cambria"
    fresh.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
    self.document = Composer(fresh)
|
||||
|
||||
def to_docx(self, contact, message_types, is_batch=False):
|
||||
Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types,time_range=self.time_range)
|
||||
doc = docx.Document()
|
||||
doc.styles["Normal"].font.name = "Cambria"
|
||||
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
|
||||
self.document = Composer(doc)
|
||||
Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types, time_range=self.time_range)
|
||||
self.children.append(Child)
|
||||
Child.progressSignal.connect(self.progress)
|
||||
if not is_batch:
|
||||
Child.rangeSignal.connect(self.rangeSignal)
|
||||
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
|
||||
Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one)
|
||||
Child.start()
|
||||
|
||||
def to_txt(self, contact, message_types, is_batch=False):
|
||||
Child = TxtExporter(contact, type_=self.TXT, message_types=message_types,time_range=self.time_range)
|
||||
Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range)
|
||||
self.children.append(Child)
|
||||
Child.progressSignal.connect(self.progress)
|
||||
if not is_batch:
|
||||
@ -180,7 +218,7 @@ class Output(QObject):
|
||||
Child.start()
|
||||
|
||||
def to_html(self, contact, message_types, is_batch=False):
|
||||
Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types,time_range=self.time_range)
|
||||
Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types, time_range=self.time_range)
|
||||
self.children.append(Child)
|
||||
Child.progressSignal.connect(self.progress)
|
||||
if not is_batch:
|
||||
@ -191,7 +229,7 @@ class Output(QObject):
|
||||
if message_types.get(34):
|
||||
# 语音消息单独的线程
|
||||
self.total_num += 1
|
||||
output_media = OutputMedia(contact,time_range=self.time_range)
|
||||
output_media = OutputMedia(contact, time_range=self.time_range)
|
||||
self.children.append(output_media)
|
||||
output_media.okSingal.connect(self.count_finish_num)
|
||||
output_media.progressSignal.connect(self.progressSignal)
|
||||
@ -199,7 +237,7 @@ class Output(QObject):
|
||||
if message_types.get(47):
|
||||
# emoji消息单独的线程
|
||||
self.total_num += 1
|
||||
output_emoji = OutputEmoji(contact,time_range=self.time_range)
|
||||
output_emoji = OutputEmoji(contact, time_range=self.time_range)
|
||||
self.children.append(output_emoji)
|
||||
output_emoji.okSingal.connect(self.count_finish_num)
|
||||
output_emoji.progressSignal.connect(self.progressSignal)
|
||||
@ -207,14 +245,14 @@ class Output(QObject):
|
||||
if message_types.get(3):
|
||||
# 图片消息单独的线程
|
||||
self.total_num += 1
|
||||
output_image = OutputImage(contact,time_range=self.time_range)
|
||||
output_image = OutputImage(contact, time_range=self.time_range)
|
||||
self.children.append(output_image)
|
||||
output_image.okSingal.connect(self.count_finish_num)
|
||||
output_image.progressSignal.connect(self.progressSignal)
|
||||
output_image.start()
|
||||
|
||||
def to_csv(self, contact, message_types, is_batch=False):
|
||||
Child = CSVExporter(contact, type_=self.CSV, message_types=message_types,time_range=self.time_range)
|
||||
Child = CSVExporter(contact, type_=self.CSV, message_types=message_types, time_range=self.time_range)
|
||||
self.children.append(Child)
|
||||
Child.progressSignal.connect(self.progress)
|
||||
if not is_batch:
|
||||
@ -222,7 +260,7 @@ class Output(QObject):
|
||||
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
|
||||
Child.start()
|
||||
|
||||
def start(self):
|
||||
def run(self):
|
||||
if self.output_type == self.DOCX:
|
||||
self.to_docx(self.contact, self.message_types)
|
||||
elif self.output_type == self.CSV_ALL:
|
||||
@ -264,14 +302,14 @@ class OutputMedia(QThread):
|
||||
okSingal = pyqtSignal(int)
|
||||
progressSignal = pyqtSignal(int)
|
||||
|
||||
def __init__(self, contact,time_range=None):
|
||||
def __init__(self, contact, time_range=None):
|
||||
super().__init__()
|
||||
self.contact = contact
|
||||
self.time_range = time_range
|
||||
|
||||
def run(self):
|
||||
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
|
||||
messages = msg_db.get_messages_by_type(self.contact.wxid, 34,time_range=self.time_range)
|
||||
messages = msg_db.get_messages_by_type(self.contact.wxid, 34, time_range=self.time_range)
|
||||
for message in messages:
|
||||
is_send = message[4]
|
||||
msgSvrId = message[9]
|
||||
@ -291,14 +329,14 @@ class OutputEmoji(QThread):
|
||||
okSingal = pyqtSignal(int)
|
||||
progressSignal = pyqtSignal(int)
|
||||
|
||||
def __init__(self, contact,time_range=None):
|
||||
def __init__(self, contact, time_range=None):
|
||||
super().__init__()
|
||||
self.contact = contact
|
||||
self.time_range = time_range
|
||||
|
||||
def run(self):
|
||||
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
|
||||
messages = msg_db.get_messages_by_type(self.contact.wxid, 47,time_range=self.time_range)
|
||||
messages = msg_db.get_messages_by_type(self.contact.wxid, 47, time_range=self.time_range)
|
||||
for message in messages:
|
||||
str_content = message[7]
|
||||
try:
|
||||
@ -318,11 +356,11 @@ class OutputImage(QThread):
|
||||
okSingal = pyqtSignal(int)
|
||||
progressSignal = pyqtSignal(int)
|
||||
|
||||
def __init__(self, contact,time_range):
|
||||
def __init__(self, contact, time_range):
|
||||
super().__init__()
|
||||
self.contact = contact
|
||||
self.child_thread_num = 2
|
||||
self.time_range =time_range
|
||||
self.time_range = time_range
|
||||
self.child_threads = [0] * (self.child_thread_num + 1)
|
||||
self.num = 0
|
||||
|
||||
@ -335,7 +373,7 @@ class OutputImage(QThread):
|
||||
|
||||
def run(self):
|
||||
origin_docx_path = f"{os.path.abspath('.')}/data/聊天记录/{self.contact.remark}"
|
||||
messages = msg_db.get_messages_by_type(self.contact.wxid, 3,time_range=self.time_range)
|
||||
messages = msg_db.get_messages_by_type(self.contact.wxid, 3, time_range=self.time_range)
|
||||
for message in messages:
|
||||
str_content = message[7]
|
||||
BytesExtra = message[10]
|
||||
@ -363,7 +401,7 @@ class OutputImageChild(QThread):
|
||||
okSingal = pyqtSignal(int)
|
||||
progressSignal = pyqtSignal(int)
|
||||
|
||||
def __init__(self, contact, messages,time_range):
|
||||
def __init__(self, contact, messages, time_range):
|
||||
super().__init__()
|
||||
self.contact = contact
|
||||
self.messages = messages
|
||||
|
Loading…
Reference in New Issue
Block a user