import csv import html import io import os import re import shutil import subprocess import sys import time import traceback from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Tuple import pysilk from wxManager import MessageType, DataBaseInterface from wxManager.model import Contact, Me, Message from wxManager.log import logger from exporter.config import FileType def makedirs(path): if not os.path.exists(path): os.makedirs(path, exist_ok=True) os.makedirs(os.path.join(path, 'image'), exist_ok=True) os.makedirs(os.path.join(path, 'emoji'), exist_ok=True) os.makedirs(os.path.join(path, 'video'), exist_ok=True) os.makedirs(os.path.join(path, 'voice'), exist_ok=True) os.makedirs(os.path.join(path, 'file'), exist_ok=True) os.makedirs(os.path.join(path, 'avatar'), exist_ok=True) os.makedirs(os.path.join(path, 'music'), exist_ok=True) os.makedirs(os.path.join(path, 'icon'), exist_ok=True) def escape_js_and_html(input_str): if not input_str: return '' # 转义HTML特殊字符 html_escaped = html.escape(input_str, quote=False) # 手动处理JavaScript转义字符 js_escaped = ( html_escaped .replace('\\r\\n', '
') .replace('\\n', '
') .replace('\\t', ' ') .replace("\\", "\\\\") .replace("'", r"\'") .replace('"', r'\"') .replace("\n", r'\n') .replace("\r", r'\r') .replace("\t", r'\t') ) return js_escaped class ExporterBaseBase: exporter_id = 0 def __init__(self): ExporterBaseBase.exporter_id += 1 self.id = ExporterBaseBase.exporter_id self._is_running = True self._is_paused = False def cancel(self): print('cancel') def pause(self): self._is_paused = True def resume(self): self._is_paused = False def stop(self): self._is_running = False self.resume() # 确保在停止时唤醒线程 class ExporterBase(ExporterBaseBase): i = 1 def __init__( self, database: DataBaseInterface, contact: Contact, output_dir, type_=FileType.TXT, # 导出文件类型 message_types: set[MessageType] = None, # 导出的消息类型 time_range=None, # 导出的日期范围 group_members: set[str] = None, # 群聊中只导出这些人的聊天记录 progress_callback=None, # 进度回调函数,func(progress:float) finish_callback=None # 导出完成回调函数 ): """ @param database: @param contact: 要导出的联系人 @param output_dir: 输出文件夹 @param type_: 导出文件类型 @param message_types: 导出的消息类型 @param time_range: 导出的日期范围 @param group_members: 群聊中筛选的群成员 @param progress_callback: 导出进度回调函数 """ super().__init__() if progress_callback: self.update_progress_callback = progress_callback else: self.update_progress_callback = self.print_progress if finish_callback: self.finish_callback = finish_callback else: self.finish_callback = self.finish self.database = database self.avatar_urls_dict = {} # 联系人头像地址的字典 self.avatar_urls = [] # 联系人的头像地址(写入HTML) self.avatar_paths_dict = {} # 联系人本地头像地址的字典 self.avatar_paths = [] # 联系人的本地头像地址(写入HTML) self.message_types = message_types # 导出的消息类型 self.contact: Contact = contact # 联系人 self.output_type = type_ # 导出文件类型 self.total_num = 1 # 总的消息数量 self.num = 0 # 当前处理的消息数量 self.last_timestamp = 0 self.time_range = time_range self.group_contacts = {} # 群聊里的所有联系人 self.group_members = group_members # 要导出的群聊成员(用于群消息筛选) self.group_members_set = group_members self.origin_path = os.path.join(output_dir, '聊天记录', f'{self.contact.remark}({self.contact.wxid})') makedirs(self.origin_path) def print_progress(self, progress): logger.info(f'导出进度:{progress * 100:.2f}%') # print() def finish(self, success): if success: logger.info(f'导出完成\n{"-" * 20}') else: logger.info(f'导出失败\n{"-" * 20}') def set_update_callback(self, callback): self.update_progress_callback = callback def _is_select_by_type(self, message): # 筛选特定的消息类型 if not self.message_types: return True else: return message.type in self.message_types def _is_select_by_contact(self, message): # 筛选群聊里的指定群成员 if self.contact.is_chatroom() and self.group_members_set: wxid = message.sender_id if wxid in self.group_members_set: return True else: return False else: return True def is_selected(self, message): # 判断该消息是否应该导出 return self._is_select_by_type(message) and self._is_select_by_contact(message) def run(self): self.export() def export(self): return True def start(self): self.run() def is_5_min(self, timestamp) -> bool: if abs(timestamp - self.last_timestamp) > 300: self.last_timestamp = timestamp return True return False def save_avatars(self): if self.contact.is_chatroom(): self.group_contacts = self.database.get_chatroom_members(self.contact.wxid) self.group_contacts[Me().wxid] = Me() else: self.group_contacts = { Me().wxid: Me(), self.contact.wxid: self.contact } for wxid, contact in self.group_contacts.items(): self.save_avatar(contact) def save_avatar(self, contact): avatar_buffer = self.database.get_avatar_buffer(contact.wxid) avatar_path = os.path.join(self.origin_path, 'avatar', f'{contact.wxid}.png') contact.avatar_path = avatar_path if not avatar_buffer: current_dir = os.path.dirname(os.path.abspath(__file__)) # 构建要读取的文件路径 file_path = os.path.join(current_dir, 'resources', 'default_avatar.png') with open(file_path, 'rb') as f: avatar_buffer = f.read() try: with open(avatar_path, 'wb') as f: f.write(avatar_buffer) except: logger.error(traceback.format_exc()) finally: return avatar_path def get_avatar_path(self, message: Message, is_absolute_path=False) -> str | int: """ 获取消息发送者的头像 @param message: 消息元组 @param is_absolute_path: 是否是绝对路径 @return: True 返回本地的绝对路径,False 返回联系人的索引下标 """ is_send = message.is_sender if is_absolute_path: # 返回头像的本地绝对路径 if message.sender_id in self.group_contacts: avatar = self.group_contacts[message.sender_id].avatar_path else: # 针对那些退群的人,就保存为默认头像 contact = self.database.get_contact_by_username(message.sender_id) avatar = self.save_avatar(contact) self.group_contacts[contact.wxid] = contact else: if self.contact.is_chatroom(): avatar = self.avatar_urls_dict[message.sender_id] else: avatar = 0 if is_send else 1 return avatar def get_avatar_urls(self): index = 0 if self.contact.is_chatroom(): messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range) for message in messages: contact = message[13] if contact.wxid not in self.avatar_urls_dict: avatar_path = os.path.join(self.origin_path, 'avatar', f'{contact.wxid}.png') contact.save_avatar(avatar_path) self.avatar_urls.append(contact.small_head_img_url) self.avatar_paths.append(f'./avatar/{contact.wxid}.png') self.avatar_urls_dict[contact.wxid] = index index += 1 else: self.avatar_urls = [Me().small_head_img_url, self.contact.small_head_img_url] avatar_path = os.path.join(self.origin_path, 'avatar', f'{Me().wxid}.png') QMe().save_avatar(avatar_path) avatar_path1 = os.path.join(self.origin_path, 'avatar', f'{self.contact.wxid}.png') # self.contact.save_avatar(avatar_path1) self.avatar_paths = [f'./avatar/{Me().wxid}.png', f'./avatar/{self.contact.wxid}.png'] return self.avatar_urls, self.avatar_paths def get_avatar_paths(self): """ 获取全部头像 @return: """ index = 0 if self.contact.is_chatroom(): messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range) for message in messages: contact = message[13] if contact.wxid not in self.avatar_paths_dict: self.avatar_paths.append(contact.small_head_img_url) self.avatar_paths_dict[contact.wxid] = index index += 1 else: self.avatar_paths = [Me().small_head_img_url, self.contact.small_head_img_url] return self.avatar_paths class ImageExporter(ExporterBaseBase): def __init__(self, parent=None): super().__init__(parent) def run(self): self.startSignal.emit(1) messages = database.get_messages_all() num = len(messages) os.makedirs(os.path.join(config.OUTPUT_DIR, 'image'), exist_ok=True) for index, message in enumerate(messages): type_ = message[2] timestamp = message[5] # 把时间戳转换为格式化时间 time_struct = time.localtime(timestamp) # 首先把时间戳转换为结构化时间 str_time = time.strftime("%Y%m%d_%H%M%S_", time_struct) # 把结构化时间转换为格式化时间 MsgSvrID = str(message[9]) if type_ == 3: base_path = os.path.join(config.OUTPUT_DIR, 'image') str_content = message[7] BytesExtra = message[10] str_content = escape_js_and_html(str_content) image_path = hard_link_db.get_image(str_content, BytesExtra, up_dir=Me().wx_dir, thumb=False) image_path = get_image(image_path, base_path=base_path, dst_name=str_time + MsgSvrID[:6]) globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{image_path}', 1)) self.okSignal.emit(self.id) class VideoExporter(ExporterBaseBase): def run(self): self.startSignal.emit(1) messages = database.get_messages_all() num = len(messages) os.makedirs(os.path.join(config.OUTPUT_DIR, 'video'), exist_ok=True) for index, message in enumerate(messages): type_ = message[2] timestamp = message[5] # 把时间戳转换为格式化时间 time_struct = time.localtime(timestamp) # 首先把时间戳转换为结构化时间 str_time = time.strftime("%Y%m%d_%H%M%S_", time_struct) # 把结构化时间转换为格式化时间 MsgSvrID = str(message[9]) if type_ == 43: str_content = message[7] BytesExtra = message[10] video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False) image_path = hard_link_db.get_video(str_content, BytesExtra, thumb=True) if video_path: video_path = f'{Me().wx_dir}/{video_path}' if os.path.exists(video_path): new_path = os.path.join(config.OUTPUT_DIR, 'video', str_time + MsgSvrID[:6] + '.mp4') if not os.path.exists(new_path): shutil.copy(video_path, new_path) globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{image_path}', 1)) self.okSignal.emit(self.id) class FileExporter(ExporterBaseBase): def run(self): self.startSignal.emit(1) messages = database.get_messages_all() num = len(messages) origin_path = os.path.join(config.OUTPUT_DIR, 'files') os.makedirs(os.path.join(config.OUTPUT_DIR, 'files'), exist_ok=True) for index, message in enumerate(messages): type_ = message[2] sub_type = message[3] timestamp = message[5] # 把时间戳转换为格式化时间 time_struct = time.localtime(timestamp) # 首先把时间戳转换为结构化时间 str_time = time.strftime("%Y%m%d%H%M%S", time_struct) # 把结构化时间转换为格式化时间 if type_ == 49 and sub_type == 6: bytesExtra = message[10] compress_content = message[13] file_info = file(bytesExtra, compress_content, output_path=origin_path) if not file_info.get('is_error'): file_path = file_info.get('file_path') globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{file_path}', 1)) self.okSignal.emit(self.id) class AudioExporter(ExporterBaseBase): def run(self): self.startSignal.emit(1) messages = msg_db.get_messages_all() num = len(messages) for index, message in enumerate(messages): type_ = message[2] timestamp = message[5] # 把时间戳转换为格式化时间 time_struct = time.localtime(timestamp) # 首先把时间戳转换为结构化时间 str_time = time.strftime("%Y%m%d%H%M%S", time_struct) # 把结构化时间转换为格式化时间 MsgSvrID = str(message[9]) if type_ == 43: str_content = message[7] BytesExtra = message[10] video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False) image_path = hard_link_db.get_video(str_content, BytesExtra, thumb=True) if video_path: video_path = f'{Me().wx_dir}/{video_path}' if os.path.exists(video_path): new_path = os.path.join(config.OUTPUT_DIR, 'video', str_time + '.mp4') if not os.path.exists(new_path): shutil.copy(video_path, new_path) globalSignals.status_bar_message.emit((f'导出进度:{index + 1}/{num}——{image_path}', 1)) self.okSignal.emit(self.id) class ContactExporter(ExporterBaseBase): def __init__(self, database, output_path): super().__init__() self.okSignal = None self.database = database self.output_path = output_path def start(self): self.run() def run(self): # columns = ["用户名", "消息内容", "发送时间", "发送状态", "消息类型", "isSend", "msgId"] columns = ['UserName', 'Alias', 'Type', 'Remark', 'NickName', 'smallHeadImgUrl', 'bigHeadImgUrl', 'label', 'gender', 'signature', 'country/region', 'province', 'city'] contacts = self.database.get_contacts() try: # 写入CSV文件 with open(self.output_path, mode='w', newline='', encoding='utf-8-sig') as file: writer = csv.writer(file) writer.writerow(columns) # 写入数据 for contact in contacts: writer.writerow( [contact.wxid, contact.alias, contact.flag, contact.remark, contact.nickname, contact.small_head_img_url, contact.big_head_img_url, contact.label_name(), contact.gender, contact.signature, *contact.region ] ) except PermissionError: print('另一个程序正在使用此文件,无法访问。') class GroupContactExporter(ExporterBaseBase): def __init__(self, database, output_dir, contact): super().__init__() self.contact = contact self.database = database if self.contact: if not isinstance(self.contact, list): self.origin_path = os.path.join(output_dir, '聊天记录', f'{self.contact.remark}({self.contact.wxid})') os.makedirs(self.origin_path, exist_ok=True) def start(self): self.run() def run(self): filename = os.path.join(self.origin_path, 'contacts.csv') filename = get_new_filename(filename) # columns = ["用户名", "消息内容", "发送时间", "发送状态", "消息类型", "isSend", "msgId"] columns = ['UserName', '微信号', '类型', '群昵称', '昵称', '头像地址', '头像原图', '标签', '性别', '个性签名', '国家(地区)', '省份', '城市'] contacts = self.database.get_chatroom_members(self.contact.wxid) try: # 写入CSV文件 with open(filename, mode='w', newline='', encoding='utf-8-sig') as file: writer = csv.writer(file) writer.writerow(columns) # 写入数据 # writer.writerows(contacts) for wxid, contact in contacts.items(): writer.writerow( [ contact.wxid, contact.alias, contact.flag, contact.remark, contact.nickname, contact.small_head_img_url, contact.big_head_img_url, contact.label_name(), contact.gender, contact.signature, *contact.region ] ) except PermissionError: print('另一个程序正在使用此文件,无法访问。') class CsvAllExporter(ExporterBaseBase): def run(self): filename = QFileDialog.getSaveFileName(None, "save file", os.path.join(os.getcwd(), 'messages.csv'), "csv files (*.csv);;all files(*.*)") if not filename[0]: return self.startSignal.emit(1) filename = filename[0] # columns = ["用户名", "消息内容", "发送时间", "发送状态", "消息类型", "isSend", "msgId"] columns = ['localId', 'TalkerId', 'Type', 'SubType', 'IsSender', 'CreateTime', 'Status', 'StrContent', 'StrTime', 'Remark', 'NickName', 'Sender'] packagemsg = PackageMsg() messages = packagemsg.get_package_message_all() try: # 写入CSV文件 with open(filename, mode='w', newline='', encoding='utf-8-sig') as file: writer = csv.writer(file) writer.writerow(columns) # 写入数据 writer.writerows(messages) except PermissionError: globalSignals.information.emit('另一个程序正在使用此文件,无法访问。') self.okSignal.emit(self.id) def copy_file(source_file, destination_file): if os.path.isfile(source_file) and not os.path.exists(destination_file): try: # logger.info(f'开始复制:{destination_file}') shutil.copy(source_file, destination_file) except: pass # logger.error(traceback.format_exc()) finally: print(f'复制:{destination_file}') # logger.info(f'复制:{destination_file}') def copy_files(file_tasks: List[Tuple[str, str, str]]): """ :param file_tasks: List[ (原始文件路径, 输出文件夹, 输出文件名 )] :return: """ if len(file_tasks) < 1: return futures = [] with ThreadPoolExecutor(max_workers=10) as executor: for source_file, output_dir, dst_name in file_tasks: if dst_name: ext = os.path.basename(source_file).split('.')[-1] destination_file = os.path.join(output_dir, f'{dst_name}.{ext}') else: destination_file = os.path.join(output_dir, os.path.basename(source_file)) if os.path.exists(destination_file): continue if not os.path.exists(os.path.dirname(destination_file)): os.makedirs(os.path.dirname(destination_file), exist_ok=True) futures.append(executor.submit(copy_file, source_file, destination_file)) # 等待所有任务完成 for future in futures: future.result() def get_ffmpeg_path(): # 获取打包后的资源目录 resource_dir = getattr(sys, '_MEIPASS', os.path.abspath(os.path.dirname(__file__))) # 构建 FFmpeg 可执行文件的路径 ffmpeg_path = os.path.join(resource_dir, 'ffmpeg.exe') if not os.path.exists(ffmpeg_path): ffmpeg_path = os.path.join(resource_dir, 'resources', 'ffmpeg.exe') return ffmpeg_path def decode_audio_to_mp3(media_buffer, output_dir, filename): silk_path = f"{output_dir}/{filename}.silk" pcm_path = f"{output_dir}/{filename}.pcm" mp3_path = f"{output_dir}/{filename}.mp3" if os.path.exists(mp3_path): return mp3_path if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) buf = media_buffer if not buf: return '' with open(silk_path, "wb") as f: f.write(buf) # open(silk_path, "wb").write() try: pcm_buf = pysilk.decode(buf, to_wav=False, sample_rate=44100) with open(pcm_path, 'wb') as f: f.write(pcm_buf) # pysilk.decode_file(open("brainpower.pcm", "rb"), to_wav=False) # 调用系统上的 ffmpeg 可执行文件 # 获取 FFmpeg 可执行文件的路径 ffmpeg_path = get_ffmpeg_path() # print(ffmpeg_path) # # 调用 FFmpeg if os.path.exists(ffmpeg_path): cmd = f'''"{ffmpeg_path}" -loglevel quiet -y -f s16le -i "{pcm_path}" -ar 44100 -ac 1 "{mp3_path}"''' # system(cmd) # 使用subprocess.run()执行命令 subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: # 源码运行的时候下面的有效 # 这里不知道怎么捕捉异常 cmd = f'''"{os.path.join(os.getcwd(), 'app', 'resources', 'data', 'ffmpeg.exe')}" -loglevel quiet -y -f s16le -i "{pcm_path}" -ar 44100 -ac 1 "{mp3_path}"''' # system(cmd) # 使用subprocess.run()执行命令 subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # if os.path.exists(silk_path): # os.remove(silk_path) # if os.path.exists(pcm_path): # os.remove(pcm_path) except Exception as e: print(f"Error: {e}") logger.error(f'语音错误\n{traceback.format_exc()}') cmd = f'''"{os.path.join(os.getcwd(), 'app', 'resources', 'data', 'ffmpeg.exe')}" -loglevel quiet -y -f s16le -i "{pcm_path}" -ar 44100 -ac 1 "{mp3_path}"''' # system(cmd) # 使用subprocess.run()执行命令 subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) finally: return mp3_path def decode_audios(file_tasks: List[Tuple[str, str, str]]): """ :param database: :param file_tasks: List[ (原始文件路径, 输出文件夹, 输出文件名 )] :return: """ if len(file_tasks) < 1: return with ThreadPoolExecutor(max_workers=10) as executor: futures = [] for media_buffer, output_dir, dst_name in file_tasks: futures.append(executor.submit(decode_audio_to_mp3, media_buffer, output_dir, dst_name)) # 等待所有任务完成 for future in futures: future.result() def remove_privacy_info(text): # 正则表达式模式 patterns = { 'phone': r'\b(\+?86[-\s]?)?1[3-9]\d{9}\b', # 手机号 'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', # 邮箱 'id_card': r'\b\d{15}|\d{18}|\d{17}X\b', # 身份证号 'password': r'\b(?:password|pwd|pass|psw)[\s=:]*\S+\b', # 密码 'account': r'\b(?:account|username|user|acct)[\s=:]*\S+\b' # 账号 } for key, pattern in patterns.items(): text = re.sub(pattern, f'[{key} xxx]', text) return text def get_new_filename(filename): """ 检查给定的文件是否存在,如果存在就加个括号标个号,返回新的文件名 @param filename: @return: """ if not os.path.exists(filename): return filename else: for i in range(1, 10086): basename = os.path.basename(filename) tmp = basename.split('.') name = '.'.join(tmp[:-1]) ext = tmp[-1] dir_name = os.path.dirname(filename) new_filename = os.path.join(dir_name, f'{name}({i}).{ext}') if not os.path.exists(new_filename): return new_filename return filename