# WeChatMsg/exporter/exporter.py
# Snapshot: 2025-03-28 21:43:32 +08:00 (650 lines, 26 KiB, Python)
#
# NOTE: the hosting page flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional, the warning can safely be ignored.

import csv
import html
import io
import os
import re
import shutil
import subprocess
import sys
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple
import pysilk
from wxManager import MessageType, DataBaseInterface
from wxManager.model import Contact, Me, Message
from wxManager.log import logger
from exporter.config import FileType
def makedirs(path):
    """Create the export root ``path`` and its fixed set of media sub-folders.

    Directories that already exist are left untouched.

    @param path: root folder of one export
    """
    media_folders = ('image', 'emoji', 'video', 'voice', 'file', 'avatar', 'music', 'icon')
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
    for folder in media_folders:
        os.makedirs(os.path.join(path, folder), exist_ok=True)
def escape_js_and_html(input_str):
    """Escape ``input_str`` so it can be embedded in HTML inside a JavaScript string.

    @param input_str: raw text; any falsy value yields an empty string
    @return: the escaped text
    """
    if not input_str:
        return ''
    # The order is significant: literal backslash sequences (a backslash followed by
    # r/n/t) become markup first, then the remaining backslashes are doubled, and only
    # afterwards are quotes and real control characters turned into JS escapes.
    substitutions = (
        ('\\r\\n', '<br>'),
        ('\\n', '<br>'),
        ('\\t', '&emsp;'),
        ('\\', '\\\\'),
        ("'", "\\'"),
        ('"', '\\"'),
        ('\n', '\\n'),
        ('\r', '\\r'),
        ('\t', '\\t'),
    )
    # Escape the HTML special characters (&, <, >) but leave quotes for the JS step
    escaped = html.escape(input_str, quote=False)
    for needle, replacement in substitutions:
        escaped = escaped.replace(needle, replacement)
    return escaped
class ExporterBaseBase:
    """Root of all exporters: hands out a sequential id and tracks run/pause flags."""

    # Number of exporters created so far; also the id of the most recent one
    exporter_id = 0

    def __init__(self):
        issued_id = ExporterBaseBase.exporter_id + 1
        ExporterBaseBase.exporter_id = issued_id
        self.id = issued_id
        self._is_paused = False
        self._is_running = True

    def cancel(self):
        """Only announces the request on stdout; no state is changed."""
        print('cancel')

    def pause(self):
        """Raise the pause flag."""
        self._is_paused = True

    def resume(self):
        """Clear the pause flag."""
        self._is_paused = False

    def stop(self):
        """Clear the running flag and, through ``resume``, the pause flag as well."""
        self._is_running = False
        # A stopped exporter must not be left in the paused state
        self.resume()
class ExporterBase(ExporterBaseBase):
    """Shared state and helpers for exporters that dump one contact's chat history.

    Concrete exporters are expected to override ``export``; ``start`` and ``run``
    simply delegate to it.
    """
    i = 1

    def __init__(
            self,
            database: DataBaseInterface,
            contact: Contact,
            output_dir,
            type_=FileType.TXT,  # output file type
            message_types: set[MessageType] = None,  # message types to export
            time_range=None,  # date range to export
            group_members: set[str] = None,  # in a group chat, only export messages of these members
            progress_callback=None,  # progress callback, func(progress: float)
            finish_callback=None  # callback invoked when the export has finished
    ):
        """
        @param database: data source used to read members, contacts and avatars
        @param contact: the contact whose chat is exported
        @param output_dir: output folder
        @param type_: output file type
        @param message_types: message types to export; a falsy value selects every type
        @param time_range: date range to export
        @param group_members: group members used to filter a group chat
        @param progress_callback: progress callback; defaults to ``print_progress``
        @param finish_callback: completion callback; defaults to ``finish``
        """
        super().__init__()
        self.update_progress_callback = progress_callback or self.print_progress
        self.finish_callback = finish_callback or self.finish
        self.database = database
        self.avatar_urls_dict = {}  # contact wxid -> index into avatar_urls
        self.avatar_urls = []  # avatar URLs written into the HTML
        self.avatar_paths_dict = {}  # contact wxid -> index into avatar_paths
        self.avatar_paths = []  # local avatar paths written into the HTML
        self.message_types = message_types  # message types to export
        self.contact: Contact = contact  # the exported contact
        self.output_type = type_  # output file type
        self.total_num = 1  # total number of messages
        self.num = 0  # number of messages processed so far
        self.last_timestamp = 0
        self.time_range = time_range
        self.group_contacts = {}  # every contact taking part in the chat
        self.group_members = group_members  # group members to export (used to filter group messages)
        self.group_members_set = group_members
        # NOTE(review): remark and wxid are used verbatim as a folder name; characters that
        # are illegal in file names are not filtered here - confirm callers sanitise them.
        self.origin_path = os.path.join(output_dir, '聊天记录', f'{self.contact.remark}({self.contact.wxid})')
        makedirs(self.origin_path)

    def print_progress(self, progress):
        """Default progress callback: log ``progress`` (a fraction) as a percentage."""
        logger.info(f'导出进度:{progress * 100:.2f}%')

    def finish(self, success):
        """Default completion callback: log whether the export succeeded."""
        if success:
            logger.info(f'导出完成\n{"-" * 20}')
        else:
            logger.info(f'导出失败\n{"-" * 20}')

    def set_update_callback(self, callback):
        """Replace the progress callback."""
        self.update_progress_callback = callback

    def _is_select_by_type(self, message):
        """Return True when ``message`` has one of the requested types (or no filter is set)."""
        if not self.message_types:
            return True
        return message.type in self.message_types

    def _is_select_by_contact(self, message):
        """Return True unless this is a group chat with a member filter that excludes the sender."""
        if self.contact.is_chatroom() and self.group_members_set:
            return message.sender_id in self.group_members_set
        return True

    def is_selected(self, message):
        """Tell whether ``message`` should be exported (type filter and member filter)."""
        return self._is_select_by_type(message) and self._is_select_by_contact(message)

    def run(self):
        self.export()

    def export(self):
        """Perform the export; the base implementation does nothing and returns True."""
        return True

    def start(self):
        self.run()

    def is_5_min(self, timestamp) -> bool:
        """Return True, and remember ``timestamp``, when it is more than 300 s away from the last remembered one."""
        if abs(timestamp - self.last_timestamp) > 300:
            self.last_timestamp = timestamp
            return True
        return False

    def save_avatars(self):
        """Collect every participant of the chat in ``group_contacts`` and store their avatars."""
        if self.contact.is_chatroom():
            self.group_contacts = self.database.get_chatroom_members(self.contact.wxid)
            self.group_contacts[Me().wxid] = Me()
        else:
            self.group_contacts = {
                Me().wxid: Me(),
                self.contact.wxid: self.contact
            }
        for contact in self.group_contacts.values():
            self.save_avatar(contact)

    def save_avatar(self, contact):
        """Write the avatar of ``contact`` to ``<origin_path>/avatar/<wxid>.png``.

        When the database has no avatar for the contact, the bundled default avatar
        is used. A failed write is logged and does not abort the export.

        @param contact: contact whose avatar is stored; its ``avatar_path`` is updated
        @return: the path of the avatar file, whether or not the write succeeded
        """
        avatar_buffer = self.database.get_avatar_buffer(contact.wxid)
        avatar_path = os.path.join(self.origin_path, 'avatar', f'{contact.wxid}.png')
        contact.avatar_path = avatar_path
        if not avatar_buffer:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            # Fall back to the default avatar shipped next to this module
            file_path = os.path.join(current_dir, 'resources', 'default_avatar.png')
            with open(file_path, 'rb') as f:
                avatar_buffer = f.read()
        try:
            with open(avatar_path, 'wb') as f:
                f.write(avatar_buffer)
        except Exception:
            # Best effort only. Unlike the former bare ``except`` combined with a
            # ``return`` inside ``finally``, this no longer swallows
            # KeyboardInterrupt / SystemExit.
            logger.error(traceback.format_exc())
        return avatar_path

    def get_avatar_path(self, message: Message, is_absolute_path=False) -> str | int:
        """
        Get the avatar of the sender of a message.
        @param message: the message
        @param is_absolute_path: whether a local absolute path is wanted
        @return: with ``is_absolute_path`` the local avatar path; otherwise an index -
                 in a group chat the sender's entry in ``avatar_urls_dict``, in a private
                 chat 0 for messages sent by the user and 1 for the other side
        """
        is_send = message.is_sender
        if is_absolute_path:
            # Return the local path of the avatar
            if message.sender_id in self.group_contacts:
                avatar = self.group_contacts[message.sender_id].avatar_path
            else:
                # Senders missing from group_contacts (e.g. people who left the group)
                # are looked up individually and their avatar is saved on demand
                contact = self.database.get_contact_by_username(message.sender_id)
                avatar = self.save_avatar(contact)
                self.group_contacts[contact.wxid] = contact
        else:
            if self.contact.is_chatroom():
                avatar = self.avatar_urls_dict[message.sender_id]
            else:
                avatar = 0 if is_send else 1
        return avatar

    def get_avatar_urls(self):
        """Fill ``avatar_urls`` / ``avatar_paths`` and return both lists.

        NOTE(review): ``msg_db`` and ``QMe`` are neither defined nor imported in the
        visible part of this file, so both branches would raise NameError as written -
        confirm whether this method is still in use.
        """
        index = 0
        if self.contact.is_chatroom():
            messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
            for message in messages:
                contact = message[13]
                if contact.wxid not in self.avatar_urls_dict:
                    avatar_path = os.path.join(self.origin_path, 'avatar', f'{contact.wxid}.png')
                    contact.save_avatar(avatar_path)
                    self.avatar_urls.append(contact.small_head_img_url)
                    self.avatar_paths.append(f'./avatar/{contact.wxid}.png')
                    self.avatar_urls_dict[contact.wxid] = index
                    index += 1
        else:
            self.avatar_urls = [Me().small_head_img_url, self.contact.small_head_img_url]
            avatar_path = os.path.join(self.origin_path, 'avatar', f'{Me().wxid}.png')
            QMe().save_avatar(avatar_path)
            avatar_path1 = os.path.join(self.origin_path, 'avatar', f'{self.contact.wxid}.png')
            # self.contact.save_avatar(avatar_path1)
            self.avatar_paths = [f'./avatar/{Me().wxid}.png', f'./avatar/{self.contact.wxid}.png']
        return self.avatar_urls, self.avatar_paths

    def get_avatar_paths(self):
        """
        Collect the avatar of every participant.

        NOTE(review): ``msg_db`` is neither defined nor imported in the visible part of
        this file, so the group-chat branch would raise NameError as written.
        @return: ``avatar_paths`` (filled with the contacts' ``small_head_img_url`` values)
        """
        index = 0
        if self.contact.is_chatroom():
            messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
            for message in messages:
                contact = message[13]
                if contact.wxid not in self.avatar_paths_dict:
                    self.avatar_paths.append(contact.small_head_img_url)
                    self.avatar_paths_dict[contact.wxid] = index
                    index += 1
        else:
            self.avatar_paths = [Me().small_head_img_url, self.contact.small_head_img_url]
        return self.avatar_paths
class ImageExporter(ExporterBaseBase):
    """Export the picture of every type-3 message into ``<config.OUTPUT_DIR>/image``.

    NOTE(review): ``run`` relies on names that are neither defined nor imported in the
    visible part of this file (``database``, ``config``, ``hard_link_db``, ``get_image``,
    ``globalSignals``) and on ``self.startSignal`` / ``self.okSignal``, which
    ``ExporterBaseBase`` does not create - confirm where they are expected to come from.
    """

    def __init__(self, parent=None):
        """
        @param parent: accepted for backward compatibility only; it is not used
        """
        # ExporterBaseBase.__init__ takes no arguments, so forwarding ``parent``
        # raised TypeError on every construction.
        super().__init__()

    def run(self):
        """Walk all messages and export each picture under a timestamp-based name."""
        self.startSignal.emit(1)
        messages = database.get_messages_all()
        num = len(messages)
        os.makedirs(os.path.join(config.OUTPUT_DIR, 'image'), exist_ok=True)
        for index, message in enumerate(messages):
            type_ = message[2]
            timestamp = message[5]
            # Turn the timestamp into a formatted time string used as file-name prefix
            time_struct = time.localtime(timestamp)
            str_time = time.strftime("%Y%m%d_%H%M%S_", time_struct)
            MsgSvrID = str(message[9])
            if type_ == 3:
                base_path = os.path.join(config.OUTPUT_DIR, 'image')
                str_content = message[7]
                BytesExtra = message[10]
                str_content = escape_js_and_html(str_content)
                image_path = hard_link_db.get_image(str_content, BytesExtra, up_dir=Me().wx_dir, thumb=False)
                image_path = get_image(image_path, base_path=base_path, dst_name=str_time + MsgSvrID[:6])
                globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{image_path}', 1))
        self.okSignal.emit(self.id)
class VideoExporter(ExporterBaseBase):
    """Copy the video of every type-43 message into ``<config.OUTPUT_DIR>/video``.

    NOTE(review): ``run`` relies on names that are neither defined nor imported in the
    visible part of this file (``database``, ``config``, ``hard_link_db``,
    ``globalSignals``) and on ``self.startSignal`` / ``self.okSignal``, which
    ``ExporterBaseBase`` does not create - confirm where they are expected to come from.
    """

    def run(self):
        """Walk all messages and copy each video that exists on disk to a ``.mp4`` file
        named after the message time and the first six characters of its server id."""
        self.startSignal.emit(1)
        messages = database.get_messages_all()
        num = len(messages)
        os.makedirs(os.path.join(config.OUTPUT_DIR, 'video'), exist_ok=True)
        for index, message in enumerate(messages):
            type_ = message[2]
            timestamp = message[5]
            # Turn the timestamp into a formatted time string
            time_struct = time.localtime(timestamp)  # first convert the timestamp into a struct_time
            str_time = time.strftime("%Y%m%d_%H%M%S_", time_struct)  # then format the struct_time
            MsgSvrID = str(message[9])
            if type_ == 43:
                str_content = message[7]
                BytesExtra = message[10]
                video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False)
                image_path = hard_link_db.get_video(str_content, BytesExtra, thumb=True)
                if video_path:
                    video_path = f'{Me().wx_dir}/{video_path}'
                    if os.path.exists(video_path):
                        new_path = os.path.join(config.OUTPUT_DIR, 'video', str_time + MsgSvrID[:6] + '.mp4')
                        # Never overwrite a video that was exported before
                        if not os.path.exists(new_path):
                            shutil.copy(video_path, new_path)
                # NOTE(review): the nesting level of this progress emit was inferred (it
                # needs ``image_path``, which only exists for type-43 messages) - confirm.
                globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{image_path}', 1))
        self.okSignal.emit(self.id)
class FileExporter(ExporterBaseBase):
    """Export the attachment of every type-49 / sub-type-6 message into ``<config.OUTPUT_DIR>/files``.

    NOTE(review): ``run`` relies on names that are neither defined nor imported in the
    visible part of this file (``database``, ``config``, ``file``, ``globalSignals``) and
    on ``self.startSignal`` / ``self.okSignal``, which ``ExporterBaseBase`` does not
    create - confirm where they are expected to come from.
    """

    def run(self):
        """Walk all messages and hand each file message to ``file(...)`` for extraction."""
        self.startSignal.emit(1)
        messages = database.get_messages_all()
        num = len(messages)
        origin_path = os.path.join(config.OUTPUT_DIR, 'files')
        os.makedirs(os.path.join(config.OUTPUT_DIR, 'files'), exist_ok=True)
        for index, message in enumerate(messages):
            type_ = message[2]
            sub_type = message[3]
            timestamp = message[5]
            # Turn the timestamp into a formatted time string
            time_struct = time.localtime(timestamp)  # first convert the timestamp into a struct_time
            str_time = time.strftime("%Y%m%d%H%M%S", time_struct)  # then format it (the result is not used below)
            if type_ == 49 and sub_type == 6:
                bytesExtra = message[10]
                compress_content = message[13]
                file_info = file(bytesExtra, compress_content, output_path=origin_path)
                if not file_info.get('is_error'):
                    file_path = file_info.get('file_path')
                    # NOTE(review): the nesting level of this progress emit was inferred
                    # (it needs ``file_path``) - confirm.
                    globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{file_path}', 1))
        self.okSignal.emit(self.id)
class AudioExporter(ExporterBaseBase):
    """Despite its name, copies the video of every type-43 message into ``<config.OUTPUT_DIR>/video``.

    NOTE(review): the body filters on type 43, calls ``hard_link_db.get_video`` and writes
    ``.mp4`` files - it looks like a copy of the video exporter rather than an audio
    export; confirm the intent. It also relies on names that are neither defined nor
    imported in the visible part of this file (``msg_db``, ``config``, ``hard_link_db``,
    ``globalSignals``) and on ``self.startSignal`` / ``self.okSignal``, which
    ``ExporterBaseBase`` does not create.
    """

    def run(self):
        """Walk all messages and copy each video that exists on disk to ``<time>.mp4``.

        The target ``video`` folder is not created here.
        """
        self.startSignal.emit(1)
        messages = msg_db.get_messages_all()
        num = len(messages)
        for index, message in enumerate(messages):
            type_ = message[2]
            timestamp = message[5]
            # Turn the timestamp into a formatted time string
            time_struct = time.localtime(timestamp)  # first convert the timestamp into a struct_time
            str_time = time.strftime("%Y%m%d%H%M%S", time_struct)  # then format the struct_time
            MsgSvrID = str(message[9])  # not used below
            if type_ == 43:
                str_content = message[7]
                BytesExtra = message[10]
                video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False)
                image_path = hard_link_db.get_video(str_content, BytesExtra, thumb=True)
                if video_path:
                    video_path = f'{Me().wx_dir}/{video_path}'
                    if os.path.exists(video_path):
                        new_path = os.path.join(config.OUTPUT_DIR, 'video', str_time + '.mp4')
                        # Never overwrite a file that was exported before
                        if not os.path.exists(new_path):
                            shutil.copy(video_path, new_path)
                # NOTE(review): the nesting level of this progress emit was inferred (it
                # needs ``image_path``, which only exists for type-43 messages) - confirm.
                globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{image_path}', 1))
        self.okSignal.emit(self.id)
class ContactExporter(ExporterBaseBase):
    """Dump every contact returned by the database into one CSV file."""

    def __init__(self, database, output_path):
        """
        @param database: object providing ``get_contacts()``
        @param output_path: path of the CSV file to write
        """
        super().__init__()
        self.okSignal = None
        self.database = database
        self.output_path = output_path

    def start(self):
        self.run()

    def run(self):
        """Write a header row followed by one row per contact (UTF-8 with BOM).

        A PermissionError while opening or writing the file is reported on stdout
        instead of being raised.
        """
        header = ['UserName', 'Alias', 'Type', 'Remark', 'NickName', 'smallHeadImgUrl',
                  'bigHeadImgUrl', 'label', 'gender', 'signature', 'country/region', 'province', 'city']
        everyone = self.database.get_contacts()
        try:
            with open(self.output_path, mode='w', newline='', encoding='utf-8-sig') as csv_file:
                sheet = csv.writer(csv_file)
                sheet.writerow(header)
                # One row per contact; the region tuple fills the trailing columns
                sheet.writerows(
                    [
                        person.wxid, person.alias, person.flag, person.remark, person.nickname,
                        person.small_head_img_url, person.big_head_img_url, person.label_name(), person.gender,
                        person.signature, *person.region
                    ]
                    for person in everyone
                )
        except PermissionError:
            print('另一个程序正在使用此文件,无法访问。')
class GroupContactExporter(ExporterBaseBase):
    """Dump the members of one group chat into ``contacts.csv`` inside the chat's export folder."""

    def __init__(self, database, output_dir, contact):
        """
        @param database: object providing ``get_chatroom_members(wxid)``
        @param output_dir: export root folder
        @param contact: the group chat; the export folder is only prepared when this
                        is truthy and not a list
        """
        super().__init__()
        self.contact = contact
        self.database = database
        if self.contact and not isinstance(self.contact, list):
            chat_folder = f'{self.contact.remark}({self.contact.wxid})'
            self.origin_path = os.path.join(output_dir, '聊天记录', chat_folder)
            os.makedirs(self.origin_path, exist_ok=True)

    def start(self):
        self.run()

    def run(self):
        """Write a header row followed by one row per group member (UTF-8 with BOM).

        An existing ``contacts.csv`` is not overwritten; a numbered name is chosen
        instead. A PermissionError is reported on stdout instead of being raised.
        """
        target = get_new_filename(os.path.join(self.origin_path, 'contacts.csv'))
        header = ['UserName', '微信号', '类型', '群昵称', '昵称', '头像地址',
                  '头像原图', '标签', '性别', '个性签名', '国家(地区)', '省份', '城市']
        members = self.database.get_chatroom_members(self.contact.wxid)
        try:
            with open(target, mode='w', newline='', encoding='utf-8-sig') as csv_file:
                sheet = csv.writer(csv_file)
                sheet.writerow(header)
                # Only the member objects are needed, not the wxid keys
                for member in members.values():
                    sheet.writerow(
                        [
                            member.wxid, member.alias, member.flag, member.remark, member.nickname,
                            member.small_head_img_url, member.big_head_img_url, member.label_name(),
                            member.gender, member.signature, *member.region
                        ]
                    )
        except PermissionError:
            print('另一个程序正在使用此文件,无法访问。')
class CsvAllExporter(ExporterBaseBase):
    """Ask the user for a target file and write all packaged messages into it as CSV.

    NOTE(review): ``run`` relies on names that are neither defined nor imported in the
    visible part of this file (``QFileDialog``, ``PackageMsg``, ``globalSignals``) and on
    ``self.startSignal`` / ``self.okSignal``, which ``ExporterBaseBase`` does not create -
    confirm where they are expected to come from.
    """

    def run(self):
        """Prompt for a file name, then write a header row and every message row.

        Returns early, without emitting any signal, when the dialog yields no file name.
        """
        filename = QFileDialog.getSaveFileName(None, "save file", os.path.join(os.getcwd(), 'messages.csv'),
                                               "csv files (*.csv);;all files(*.*)")
        if not filename[0]:
            return
        self.startSignal.emit(1)
        filename = filename[0]
        columns = ['localId', 'TalkerId', 'Type', 'SubType',
                   'IsSender', 'CreateTime', 'Status', 'StrContent',
                   'StrTime', 'Remark', 'NickName', 'Sender']
        packagemsg = PackageMsg()
        messages = packagemsg.get_package_message_all()
        try:
            # Write the CSV file (UTF-8 with BOM)
            with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
                writer = csv.writer(file)
                writer.writerow(columns)
                # Write the data rows
                writer.writerows(messages)
        except PermissionError:
            globalSignals.information.emit('另一个程序正在使用此文件,无法访问。')
        # Emitted even when the write failed with PermissionError
        self.okSignal.emit(self.id)
def copy_file(source_file, destination_file):
    """Copy ``source_file`` to ``destination_file`` without overwriting anything.

    The copy is best effort: a failure is logged and reported through the return
    value instead of being raised.

    @param source_file: path of the file to copy; anything that is not a regular file is skipped
    @param destination_file: target path; skipped when it already exists
    @return: True when the file was copied, False when it was skipped or the copy failed
    """
    if not os.path.isfile(source_file) or os.path.exists(destination_file):
        return False
    try:
        shutil.copy(source_file, destination_file)
    except Exception:
        # Formerly a bare ``except: pass`` that hid every failure (and also swallowed
        # KeyboardInterrupt); keep going, but leave a trace of what went wrong.
        logger.error(traceback.format_exc())
        return False
    print(f'复制:{destination_file}')
    return True
def copy_files(file_tasks: List[Tuple[str, str, str]]):
    """
    Copy a batch of files concurrently and wait until every copy has finished.

    Destinations that already exist are skipped; missing destination folders are created.

    :param file_tasks: List[
        (source file path,
        output folder,
        output file name without extension; when empty the source file name is kept
        )]
    :return: None
    """
    if not file_tasks:
        return
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for source_file, output_dir, dst_name in file_tasks:
            source_name = os.path.basename(source_file)
            if dst_name:
                # splitext yields '' for an extension-less source; splitting on '.'
                # used to turn the whole name into a bogus extension (README -> dst.README)
                destination_name = f'{dst_name}{os.path.splitext(source_name)[1]}'
            else:
                destination_name = source_name
            destination_file = os.path.join(output_dir, destination_name)
            if os.path.exists(destination_file):
                continue
            destination_dir = os.path.dirname(destination_file)
            if destination_dir:
                os.makedirs(destination_dir, exist_ok=True)
            futures.append(executor.submit(copy_file, source_file, destination_file))
        # Wait for every task to finish
        for future in futures:
            future.result()
def get_ffmpeg_path():
    """Return the expected location of the bundled ``ffmpeg.exe``.

    The lookup root is ``sys._MEIPASS`` when that attribute exists, otherwise the
    folder of this module. ``<root>/ffmpeg.exe`` is preferred; when it is missing
    ``<root>/resources/ffmpeg.exe`` is returned without checking that it exists.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    root = getattr(sys, '_MEIPASS', module_dir)
    direct_hit = os.path.join(root, 'ffmpeg.exe')
    if os.path.exists(direct_hit):
        return direct_hit
    return os.path.join(root, 'resources', 'ffmpeg.exe')
def _pcm_to_mp3(ffmpeg_path, pcm_path, mp3_path):
    """Run ffmpeg to encode the raw signed 16-bit PCM file ``pcm_path`` into ``mp3_path``."""
    command = [
        ffmpeg_path, '-loglevel', 'quiet', '-y',
        '-f', 's16le', '-i', pcm_path,
        '-ar', '44100', '-ac', '1', mp3_path,
    ]
    # An argument list without a shell keeps quotes or shell metacharacters in the
    # paths from being interpreted.
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)


def decode_audio_to_mp3(media_buffer, output_dir, filename):
    """Decode a SILK voice buffer and store it as ``<output_dir>/<filename>.mp3``.

    The buffer is first saved as ``.silk``, decoded to ``.pcm`` and then encoded with
    ffmpeg. Both intermediate files are left in ``output_dir``. Decoding or encoding
    failures are logged and do not raise.

    @param media_buffer: SILK encoded audio bytes
    @param output_dir: folder receiving the files; created when missing
    @param filename: file name without extension
    @return: ``''`` when ``media_buffer`` is empty, otherwise the mp3 path. The path is
             returned even when the conversion failed, so the file may not exist.
    """
    silk_path = f"{output_dir}/{filename}.silk"
    pcm_path = f"{output_dir}/{filename}.pcm"
    mp3_path = f"{output_dir}/{filename}.mp3"
    if os.path.exists(mp3_path):
        return mp3_path
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    if not media_buffer:
        return ''
    with open(silk_path, "wb") as f:
        f.write(media_buffer)
    try:
        pcm_buf = pysilk.decode(media_buffer, to_wav=False, sample_rate=44100)
        with open(pcm_path, 'wb') as f:
            f.write(pcm_buf)
        ffmpeg_path = get_ffmpeg_path()
        if not os.path.exists(ffmpeg_path):
            # Location used when running from source
            ffmpeg_path = os.path.join(os.getcwd(), 'app', 'resources', 'data', 'ffmpeg.exe')
        _pcm_to_mp3(ffmpeg_path, pcm_path, mp3_path)
    except Exception:
        # Best effort: one broken voice message must not abort the export. The former
        # ``return`` inside ``finally`` also swallowed KeyboardInterrupt, and the handler
        # re-ran ffmpeg on a PCM file that may never have been written.
        logger.error(f'语音错误\n{traceback.format_exc()}')
    return mp3_path
def decode_audios(file_tasks: List[Tuple[str, str, str]]):
    """
    Convert a batch of voice buffers to mp3 concurrently and wait for all of them.

    :param file_tasks: List[
        (media buffer handed to decode_audio_to_mp3,
        output folder,
        output file name
        )]
    :return: None
    """
    if len(file_tasks) == 0:
        return
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [
            pool.submit(decode_audio_to_mp3, buffer, folder, stem)
            for buffer, folder, stem in file_tasks
        ]
        # Block until every conversion is done
        for job in pending:
            job.result()
# Applied in this order; each match is replaced by ``[<label> xxx]``.
_PRIVACY_PATTERNS = (
    # Mobile number with optional +86 / 86 prefix. Look-arounds replace ``\b`` because
    # ``\b`` cannot match in front of '+', which used to leave the '+' behind.
    ('phone', re.compile(r'(?<!\w)(?:\+?86[-\s]?)?1[3-9]\d{9}(?!\w)')),
    # E-mail address (the TLD class no longer contains a literal '|')
    ('email', re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')),
    # ID card number: 18 characters (last one may be X) or 15 digits. The alternatives
    # are grouped so that both boundaries apply to each of them and the longer form is
    # tried first; ungrouped, an 18-digit number was only partly masked.
    ('id_card', re.compile(r'\b(?:\d{17}[\dXx]|\d{15})\b')),
    # Heuristics: a keyword followed by the next run of non-space characters
    ('password', re.compile(r'\b(?:password|pwd|pass|psw)[\s=:]*\S+\b')),
    ('account', re.compile(r'\b(?:account|username|user|acct)[\s=:]*\S+\b')),
)


def remove_privacy_info(text):
    """Mask phone numbers, e-mail addresses, ID card numbers, passwords and account names.

    @param text: the text to scrub
    @return: ``text`` with every match replaced by a ``[<kind> xxx]`` placeholder
    """
    for label, pattern in _PRIVACY_PATTERNS:
        text = pattern.sub(f'[{label} xxx]', text)
    return text
def get_new_filename(filename):
    """
    Return a path that does not exist yet, derived from ``filename``.

    When ``filename`` is free it is returned unchanged. Otherwise a counter in
    parentheses is inserted in front of the extension (``a.csv`` -> ``a(1).csv``,
    ``README`` -> ``README(1)``) and the first free candidate is returned.
    @param filename: the preferred path
    @return: a free path, or ``filename`` itself when all 10085 candidates are taken
    """
    if not os.path.exists(filename):
        return filename
    dir_name, basename = os.path.split(filename)
    # splitext keeps the dot with the extension and yields '' when there is none, so an
    # extension-less name is no longer mistaken for an extension ('(1).README')
    stem, ext = os.path.splitext(basename)
    for i in range(1, 10086):
        candidate = os.path.join(dir_name, f'{stem}({i}){ext}')
        if not os.path.exists(candidate):
            return candidate
    return filename