# WeChatMsg/exporter/exporter.py

import csv
import html
import io
import os
import re
import shutil
import subprocess
import sys
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

import pysilk

from wxManager import MessageType, DataBaseInterface
from wxManager.model import Contact, Me, Message

from wxManager.log import logger
from exporter.config import FileType


def makedirs(path):
    """Create the export root ``path`` together with its fixed media sub-folders.

    The root itself is only created when it does not exist yet; each
    sub-folder is created with ``exist_ok=True`` so calling this repeatedly
    is harmless.

    @param path: root directory of one exported conversation
    """
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
    # One folder per kind of exported resource, in the original creation order.
    for sub_dir in ('image', 'emoji', 'video', 'voice', 'file', 'avatar', 'music', 'icon'):
        os.makedirs(os.path.join(path, sub_dir), exist_ok=True)


def escape_js_and_html(input_str):
    """Escape text for use inside a JavaScript string literal on an HTML page.

    Steps, in order: HTML-escape ``&``, ``<`` and ``>`` (quotes are left
    alone), turn the *literal* two-character sequences ``\\r\\n`` / ``\\n``
    into ``<br>`` and ``\\t`` into ``&emsp;``, double every remaining
    backslash, backslash-escape both quote characters, and finally replace
    real newline, carriage-return and tab characters with their backslash
    escape sequences.

    @param input_str: text to escape; any falsy value yields ``''``
    @return: the escaped string
    """
    if not input_str:
        return ''

    # (needle, replacement) pairs; the order matters and mirrors the steps above.
    substitutions = (
        ('\\r\\n', '<br>'),
        ('\\n', '<br>'),
        ('\\t', '&emsp;'),
        ("\\", "\\\\"),
        ("'", r"\'"),
        ('"', r'\"'),
        ("\n", r'\n'),
        ("\r", r'\r'),
        ("\t", r'\t'),
    )

    escaped = html.escape(input_str, quote=False)
    for needle, replacement in substitutions:
        escaped = escaped.replace(needle, replacement)
    return escaped


class ExporterBaseBase:
    """Smallest common base of all exporters: a sequential id plus run/pause flags."""

    # Class-wide counter holding the most recently assigned exporter id.
    exporter_id = 0

    def __init__(self):
        # The counter is stored on ExporterBaseBase itself, so every subclass
        # that calls this constructor draws from the same sequence.
        assigned = ExporterBaseBase.exporter_id + 1
        ExporterBaseBase.exporter_id = assigned
        self.id = assigned
        self._is_paused = False
        self._is_running = True

    def cancel(self):
        """Only prints ``cancel``; no state is changed."""
        print('cancel')

    def pause(self):
        """Raise the paused flag."""
        self._is_paused = True

    def resume(self):
        """Clear the paused flag."""
        self._is_paused = False

    def stop(self):
        """Clear the running flag, then resume so a paused exporter is released."""
        self._is_running = False
        self.resume()


class ExporterBase(ExporterBaseBase):
    """Base class for exporting the chat history of one contact or group chat.

    Stores the export filters (message types, time range, group members), the
    progress / finish callbacks and the per-contact output folder, and offers
    avatar helpers. ``export`` is a no-op here; concrete exporters are
    presumably expected to override it.
    """
    i = 1

    def __init__(
            self,
            database: DataBaseInterface,
            contact: Contact,
            output_dir,
            type_=FileType.TXT,  # output file type
            message_types: set[MessageType] = None,  # message types to export
            time_range=None,  # date range to export
            group_members: set[str] = None,  # in a group chat, export only these members' messages
            progress_callback=None,  # progress callback, func(progress: float)
            finish_callback=None  # called when the export has finished
    ):
        """
        @param database: data source used for contacts, group members and avatars
        @param contact: the contact (or group chat) to export
        @param output_dir: output folder; the export goes to
            ``<output_dir>/聊天记录/<remark>(<wxid>)``
        @param type_: output file type
        @param message_types: message types to export; empty/None exports all
        @param time_range: date range to export
        @param group_members: group members to keep when exporting a group chat
        @param progress_callback: progress callback; defaults to ``print_progress``
        @param finish_callback: completion callback; defaults to ``finish``
        """
        super().__init__()
        self.update_progress_callback = progress_callback or self.print_progress
        self.finish_callback = finish_callback or self.finish
        self.database = database
        self.avatar_urls_dict = {}  # wxid -> index into avatar_urls
        self.avatar_urls = []  # avatar URLs (written into the HTML)
        self.avatar_paths_dict = {}  # wxid -> index into avatar_paths
        self.avatar_paths = []  # local avatar paths (written into the HTML)
        self.message_types = message_types  # message types to export
        self.contact: Contact = contact  # the contact being exported
        self.output_type = type_  # output file type
        self.total_num = 1  # total number of messages
        self.num = 0  # number of messages processed so far
        self.last_timestamp = 0
        self.time_range = time_range
        self.group_contacts = {}  # every contact of the conversation, keyed by wxid
        self.group_members = group_members  # members to export (group message filter)
        self.group_members_set = group_members
        self.origin_path = os.path.join(output_dir, '聊天记录', f'{self.contact.remark}({self.contact.wxid})')
        makedirs(self.origin_path)

    def print_progress(self, progress):
        """Default progress callback: log ``progress`` (a 0..1 fraction) as a percentage."""
        logger.info(f'导出进度：{progress * 100:.2f}%')

    def finish(self, success):
        """Default finish callback: log whether the export succeeded."""
        if success:
            logger.info(f'导出完成\n{"-" * 20}')
        else:
            logger.info(f'导出失败\n{"-" * 20}')

    def set_update_callback(self, callback):
        """Replace the progress callback."""
        self.update_progress_callback = callback

    def _is_select_by_type(self, message):
        """Return True when no type filter is set or the message type is in it."""
        if not self.message_types:
            return True
        return message.type in self.message_types

    def _is_select_by_contact(self, message):
        """Return True unless a group-member filter applies and excludes the sender."""
        # The member filter only applies to group chats with a non-empty member set.
        if not (self.contact.is_chatroom() and self.group_members_set):
            return True
        return message.sender_id in self.group_members_set

    def is_selected(self, message):
        """Return True when the message passes both the type and the sender filter."""
        return self._is_select_by_type(message) and self._is_select_by_contact(message)

    def run(self):
        """Run the export synchronously."""
        self.export()

    def export(self):
        """No-op hook that reports success."""
        return True

    def start(self):
        """Entry point; simply calls ``run``."""
        self.run()

    def is_5_min(self, timestamp) -> bool:
        """Return True (and remember ``timestamp``) when it is more than 300 s
        away from the last remembered timestamp."""
        if abs(timestamp - self.last_timestamp) > 300:
            self.last_timestamp = timestamp
            return True
        return False

    def save_avatars(self):
        """Collect the participants of the conversation and write all their avatars."""
        if self.contact.is_chatroom():
            self.group_contacts = self.database.get_chatroom_members(self.contact.wxid)
            self.group_contacts[Me().wxid] = Me()
        else:
            self.group_contacts = {
                Me().wxid: Me(),
                self.contact.wxid: self.contact
            }
        for contact in self.group_contacts.values():
            self.save_avatar(contact)

    def save_avatar(self, contact):
        """
        Write one contact's avatar to ``<origin_path>/avatar/<wxid>.png``.

        When the database has no avatar for the contact, the bundled
        ``resources/default_avatar.png`` next to this module is used instead.
        ``contact.avatar_path`` is set as a side effect.

        @param contact: contact whose avatar should be stored
        @return: the avatar path, whether or not the write succeeded
        """
        avatar_buffer = self.database.get_avatar_buffer(contact.wxid)
        avatar_path = os.path.join(self.origin_path, 'avatar', f'{contact.wxid}.png')
        contact.avatar_path = avatar_path
        if not avatar_buffer:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            # Path of the bundled default avatar.
            file_path = os.path.join(current_dir, 'resources', 'default_avatar.png')
            with open(file_path, 'rb') as f:
                avatar_buffer = f.read()
        try:
            with open(avatar_path, 'wb') as f:
                f.write(avatar_buffer)
        except Exception:
            # Best effort: a failed avatar write must not abort the export. The
            # previous bare ``except`` + ``return`` in ``finally`` also swallowed
            # KeyboardInterrupt / SystemExit, which now propagate.
            logger.error(traceback.format_exc())
        return avatar_path

    def get_avatar_path(self, message: Message, is_absolute_path=False) -> str | int:
        """
        Get the avatar of the sender of a message.

        @param message: the message
        @param is_absolute_path: whether a local path is wanted
        @return: the sender's local avatar path when ``is_absolute_path`` is
            True, otherwise the sender's index (for one-to-one chats 0 for
            messages sent by me and 1 for the other party)
        """
        is_send = message.is_sender
        if is_absolute_path:
            # Return the locally stored avatar path.
            if message.sender_id in self.group_contacts:
                avatar = self.group_contacts[message.sender_id].avatar_path
            else:
                # Senders missing from the member list (e.g. people who left the
                # group) are looked up individually and their avatar saved on demand.
                contact = self.database.get_contact_by_username(message.sender_id)
                avatar = self.save_avatar(contact)
                self.group_contacts[contact.wxid] = contact
        else:
            if self.contact.is_chatroom():
                avatar = self.avatar_urls_dict[message.sender_id]
            else:
                avatar = 0 if is_send else 1
        return avatar

    def get_avatar_urls(self):
        """
        Collect avatar URLs and relative avatar paths for the conversation.

        NOTE(review): this method references ``msg_db`` and ``QMe``, neither
        of which is defined or imported in the visible part of this module, so
        it looks like legacy code that raises NameError when called — confirm
        before relying on it.

        @return: ``(avatar_urls, avatar_paths)``
        """
        index = 0
        if self.contact.is_chatroom():
            messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
            for message in messages:
                contact = message[13]
                if contact.wxid not in self.avatar_urls_dict:
                    avatar_path = os.path.join(self.origin_path, 'avatar', f'{contact.wxid}.png')
                    contact.save_avatar(avatar_path)
                    self.avatar_urls.append(contact.small_head_img_url)
                    self.avatar_paths.append(f'./avatar/{contact.wxid}.png')
                    self.avatar_urls_dict[contact.wxid] = index
                    index += 1
        else:
            self.avatar_urls = [Me().small_head_img_url, self.contact.small_head_img_url]
            avatar_path = os.path.join(self.origin_path, 'avatar', f'{Me().wxid}.png')
            QMe().save_avatar(avatar_path)
            avatar_path1 = os.path.join(self.origin_path, 'avatar', f'{self.contact.wxid}.png')
            # self.contact.save_avatar(avatar_path1)
            self.avatar_paths = [f'./avatar/{Me().wxid}.png', f'./avatar/{self.contact.wxid}.png']
        return self.avatar_urls, self.avatar_paths

    def get_avatar_paths(self):
        """
        Collect the avatars of everyone in the conversation.

        NOTE(review): references ``msg_db``, which is not defined or imported
        in the visible part of this module — likely legacy code; confirm.

        @return: ``self.avatar_paths``
        """
        index = 0
        if self.contact.is_chatroom():
            messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
            for message in messages:
                contact = message[13]
                if contact.wxid not in self.avatar_paths_dict:
                    self.avatar_paths.append(contact.small_head_img_url)
                    self.avatar_paths_dict[contact.wxid] = index
                    index += 1
        else:
            self.avatar_paths = [Me().small_head_img_url, self.contact.small_head_img_url]
        return self.avatar_paths


class ImageExporter(ExporterBaseBase):
    """Export every image message (type 3) into ``<OUTPUT_DIR>/image``.

    NOTE(review): ``run`` relies on names that are not defined or imported in
    the visible part of this module (``database``, ``config``,
    ``hard_link_db``, ``get_image``, ``globalSignals``) and on
    ``startSignal`` / ``okSignal`` attributes this class never sets —
    presumably left over from an earlier GUI version; confirm before use.
    """

    def __init__(self, parent=None):
        """
        @param parent: accepted for backward compatibility and ignored
        """
        # ExporterBaseBase.__init__ takes no arguments; forwarding ``parent``
        # made every construction fail with TypeError.
        super().__init__()

    def run(self):
        """Walk all messages and export the images, reporting progress per image."""
        self.startSignal.emit(1)
        messages = database.get_messages_all()
        num = len(messages)
        os.makedirs(os.path.join(config.OUTPUT_DIR, 'image'), exist_ok=True)
        for index, message in enumerate(messages):
            type_ = message[2]
            timestamp = message[5]
            # Format the timestamp as a local-time prefix for the file name.
            time_struct = time.localtime(timestamp)
            str_time = time.strftime("%Y%m%d_%H%M%S_", time_struct)
            MsgSvrID = str(message[9])
            if type_ == 3:
                base_path = os.path.join(config.OUTPUT_DIR, 'image')
                str_content = message[7]
                BytesExtra = message[10]
                str_content = escape_js_and_html(str_content)
                image_path = hard_link_db.get_image(str_content, BytesExtra, up_dir=Me().wx_dir, thumb=False)
                image_path = get_image(image_path, base_path=base_path, dst_name=str_time + MsgSvrID[:6])
                globalSignals.status_bar_message.emit((f'导出进度：{index + 1}/{num}——{image_path}', 1))
        self.okSignal.emit(self.id)


class VideoExporter(ExporterBaseBase):
    """Copy the video file of every video message (type 43) into ``<OUTPUT_DIR>/video``.

    NOTE(review): ``database``, ``config``, ``hard_link_db``,
    ``globalSignals`` and the ``startSignal`` / ``okSignal`` attributes are
    not defined in the visible part of this module — confirm where they are
    expected to come from.
    """

    def run(self):
        """Walk all messages, copy each video once and report progress per video."""
        self.startSignal.emit(1)
        records = database.get_messages_all()
        total = len(records)
        os.makedirs(os.path.join(config.OUTPUT_DIR, 'video'), exist_ok=True)
        for position, record in enumerate(records, start=1):
            msg_type = record[2]
            created_at = record[5]
            # Local-time prefix used in the destination file name.
            stamp = time.strftime("%Y%m%d_%H%M%S_", time.localtime(created_at))
            server_id = str(record[9])
            if msg_type != 43:
                continue
            content = record[7]
            extra = record[10]
            source = hard_link_db.get_video(content, extra, thumb=False)
            thumbnail = hard_link_db.get_video(content, extra, thumb=True)
            if source:
                source = f'{Me().wx_dir}/{source}'
                if os.path.exists(source):
                    target = os.path.join(config.OUTPUT_DIR, 'video', stamp + server_id[:6] + '.mp4')
                    # Never overwrite a video that was exported before.
                    if not os.path.exists(target):
                        shutil.copy(source, target)
            globalSignals.status_bar_message.emit((f'导出进度：{position}/{total}——{thumbnail}', 1))
        self.okSignal.emit(self.id)


class FileExporter(ExporterBaseBase):
    """Export the attachment of every file message (type 49, sub-type 6) into ``<OUTPUT_DIR>/files``.

    NOTE(review): ``database``, ``config``, ``file``, ``globalSignals`` and
    the ``startSignal`` / ``okSignal`` attributes are not defined in the
    visible part of this module — confirm where they are expected to come from.
    """

    def run(self):
        """Walk all messages, export each attachment and report progress per success."""
        self.startSignal.emit(1)
        records = database.get_messages_all()
        total = len(records)
        origin_path = os.path.join(config.OUTPUT_DIR, 'files')
        os.makedirs(os.path.join(config.OUTPUT_DIR, 'files'), exist_ok=True)
        for position, record in enumerate(records, start=1):
            msg_type = record[2]
            msg_sub_type = record[3]
            created_at = record[5]
            # The formatted time is computed as before but not used afterwards.
            _stamp = time.strftime("%Y%m%d%H%M%S", time.localtime(created_at))
            if msg_type != 49 or msg_sub_type != 6:
                continue
            extra = record[10]
            compressed = record[13]
            file_info = file(extra, compressed, output_path=origin_path)
            if file_info.get('is_error'):
                continue
            file_path = file_info.get('file_path')
            globalSignals.status_bar_message.emit((f'导出进度：{position}/{total}——{file_path}', 1))
        self.okSignal.emit(self.id)


class AudioExporter(ExporterBaseBase):
    """Exporter that walks all messages and copies files for type-43 messages.

    NOTE(review): despite its name this class mirrors ``VideoExporter``: it
    filters on type 43, looks files up via ``hard_link_db.get_video`` and
    writes ``.mp4`` files into ``<OUTPUT_DIR>/video`` — it looks like an
    unfinished copy; confirm the intended behaviour.
    NOTE(review): ``msg_db``, ``hard_link_db``, ``config``, ``globalSignals``
    and the ``startSignal`` / ``okSignal`` attributes are not defined in the
    visible part of this module.
    """

    def run(self):
        self.startSignal.emit(1)
        messages = msg_db.get_messages_all()
        num = len(messages)
        for index, message in enumerate(messages):
            type_ = message[2]
            timestamp = message[5]
            # Convert the timestamp into a formatted time string
            time_struct = time.localtime(timestamp)  # timestamp -> struct_time (local time)
            str_time = time.strftime("%Y%m%d%H%M%S", time_struct)  # struct_time -> formatted string
            MsgSvrID = str(message[9])  # computed but not used below
            if type_ == 43:
                str_content = message[7]
                BytesExtra = message[10]
                video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False)
                image_path = hard_link_db.get_video(str_content, BytesExtra, thumb=True)
                if video_path:
                    video_path = f'{Me().wx_dir}/{video_path}'
                    if os.path.exists(video_path):
                        # NOTE(review): the name is the timestamp only, so a second file
                        # from the same second is skipped by the exists-check below; the
                        # 'video' folder is also not created by this method.
                        new_path = os.path.join(config.OUTPUT_DIR, 'video', str_time + '.mp4')
                        if not os.path.exists(new_path):
                            shutil.copy(video_path, new_path)
                globalSignals.status_bar_message.emit((f'导出进度：{index + 1}/{num}——{image_path}', 1))
        self.okSignal.emit(self.id)


class ContactExporter(ExporterBaseBase):
    """Write every contact returned by ``database.get_contacts()`` to one CSV file."""

    def __init__(self, database, output_path):
        """
        @param database: object providing ``get_contacts()``
        @param output_path: path of the CSV file to write
        """
        super().__init__()
        self.okSignal = None
        self.output_path = output_path
        self.database = database

    def start(self):
        """Entry point; simply calls ``run``."""
        self.run()

    def run(self):
        """Export the contacts; a locked output file is reported on stdout, not raised."""
        header = ['UserName', 'Alias', 'Type', 'Remark', 'NickName', 'smallHeadImgUrl',
                  'bigHeadImgUrl', 'label', 'gender', 'signature', 'country/region', 'province', 'city']

        people = self.database.get_contacts()
        try:
            # utf-8-sig writes a BOM at the start of the file.
            with open(self.output_path, mode='w', newline='', encoding='utf-8-sig') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow(header)
                for person in people:
                    row = [
                        person.wxid, person.alias, person.flag, person.remark, person.nickname,
                        person.small_head_img_url, person.big_head_img_url, person.label_name(),
                        person.gender, person.signature,
                    ]
                    # ``region`` supplies the trailing location columns.
                    row.extend(person.region)
                    writer.writerow(row)
        except PermissionError:
            print('另一个程序正在使用此文件，无法访问。')


class GroupContactExporter(ExporterBaseBase):
    """Write the members of one group chat to ``contacts.csv`` in that chat's export folder."""

    def __init__(self, database, output_dir, contact):
        """
        @param database: object providing ``get_chatroom_members(wxid)``
        @param output_dir: export root; the CSV goes to
            ``<output_dir>/聊天记录/<remark>(<wxid>)``
        @param contact: the group chat to export; when it is falsy or a list
            no output folder is prepared and ``run`` does nothing
        """
        super().__init__()
        self.contact = contact
        self.database = database
        # Always defined, so ``run`` can detect that there is nothing to export
        # instead of failing with AttributeError.
        self.origin_path = None
        if self.contact and not isinstance(self.contact, list):
            self.origin_path = os.path.join(output_dir, '聊天记录',
                                            f'{self.contact.remark}({self.contact.wxid})')
            os.makedirs(self.origin_path, exist_ok=True)

    def start(self):
        """Entry point; simply calls ``run``."""
        self.run()

    def run(self):
        """Export the group members; a locked output file is reported on stdout, not raised."""
        if self.origin_path is None:
            logger.error('导出群成员失败：未指定单个群聊联系人')
            return
        # Pick a free file name so an earlier export is never overwritten.
        filename = os.path.join(self.origin_path, 'contacts.csv')
        filename = get_new_filename(filename)
        columns = ['UserName', '微信号', '类型', '群昵称', '昵称', '头像地址',
                   '头像原图', '标签', '性别', '个性签名', '国家（地区）', '省份', '城市']
        contacts = self.database.get_chatroom_members(self.contact.wxid)
        try:
            # utf-8-sig writes a BOM at the start of the file.
            with open(filename, mode='w', newline='', encoding='utf-8-sig') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow(columns)
                for contact in contacts.values():
                    writer.writerow(
                        [
                            contact.wxid, contact.alias, contact.flag, contact.remark, contact.nickname,
                            contact.small_head_img_url, contact.big_head_img_url, contact.label_name(),
                            contact.gender, contact.signature, *contact.region
                        ]
                    )
        except PermissionError:
            print('另一个程序正在使用此文件，无法访问。')


class CsvAllExporter(ExporterBaseBase):
    """Ask for a target file and dump all packaged messages into it as CSV.

    NOTE(review): ``QFileDialog``, ``PackageMsg``, ``globalSignals`` and the
    ``startSignal`` / ``okSignal`` attributes are not defined in the visible
    part of this module — confirm where they are expected to come from.
    """

    def run(self):
        """Prompt for the file, write header plus rows, then signal completion."""
        dialog_result = QFileDialog.getSaveFileName(None, "save file", os.path.join(os.getcwd(), 'messages.csv'),
                                                    "csv files (*.csv);;all files(*.*)")
        target = dialog_result[0]
        if not target:
            # No file name came back from the dialog: nothing to do.
            return
        self.startSignal.emit(1)
        header = ['localId', 'TalkerId', 'Type', 'SubType',
                  'IsSender', 'CreateTime', 'Status', 'StrContent',
                  'StrTime', 'Remark', 'NickName', 'Sender']

        rows = PackageMsg().get_package_message_all()
        try:
            # utf-8-sig writes a BOM at the start of the file.
            with open(target, mode='w', newline='', encoding='utf-8-sig') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow(header)
                writer.writerows(rows)
        except PermissionError:
            globalSignals.information.emit('另一个程序正在使用此文件，无法访问。')
        self.okSignal.emit(self.id)


def copy_file(source_file, destination_file):
    """Copy ``source_file`` to ``destination_file`` unless the destination already exists.

    Nothing happens when the source is not a regular file or the destination
    is already present. A failed copy is reported on stdout together with the
    traceback and is not raised, so one bad file cannot abort a batch.

    :param source_file: path of the file to copy
    :param destination_file: path to copy to
    :return: None
    """
    if not os.path.isfile(source_file) or os.path.exists(destination_file):
        return
    try:
        shutil.copy(source_file, destination_file)
    except Exception:
        # Worker boundary: stay best-effort, but say what went wrong instead of
        # silently dropping the error and still announcing a copy.
        print(f'复制失败:{destination_file}\n{traceback.format_exc()}')
    else:
        print(f'复制:{destination_file}')


def copy_files(file_tasks: List[Tuple[str, str, str]]):
    """Copy a batch of files concurrently on a pool of 10 threads.

    Destinations that already exist are skipped; missing destination folders
    are created. The call returns once every submitted copy has finished.

    :param file_tasks: List[
        (source file path,
            output folder,
            output file name without extension; when empty the source's
            base name is kept
            )]
    :return: None
    """
    if not file_tasks:
        return
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for source_file, output_dir, dst_name in file_tasks:
            if dst_name:
                # Keep the source's extension (text after the last dot of its base name).
                ext = os.path.basename(source_file).split('.')[-1]
                destination_file = os.path.join(output_dir, f'{dst_name}.{ext}')
            else:
                destination_file = os.path.join(output_dir, os.path.basename(source_file))
            if os.path.exists(destination_file):
                continue
            destination_dir = os.path.dirname(destination_file)
            if destination_dir and not os.path.exists(destination_dir):
                os.makedirs(destination_dir, exist_ok=True)
            futures.append(executor.submit(copy_file, source_file, destination_file))

        # Wait only after everything has been submitted. Waiting inside the
        # submit loop (as before) blocked on each task right after queuing it,
        # which made the pool run the copies one at a time.
        for future in futures:
            future.result()


def get_ffmpeg_path():
    """Return the expected location of ``ffmpeg.exe``.

    The base folder is ``sys._MEIPASS`` when that attribute exists, otherwise
    the folder containing this module. ``<base>/ffmpeg.exe`` is returned when
    it exists; if not, ``<base>/resources/ffmpeg.exe`` is returned without
    checking that it exists.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    base_dir = getattr(sys, '_MEIPASS', module_dir)

    direct_candidate = os.path.join(base_dir, 'ffmpeg.exe')
    if os.path.exists(direct_candidate):
        return direct_candidate
    return os.path.join(base_dir, 'resources', 'ffmpeg.exe')


def _pcm_to_mp3(ffmpeg_exe, pcm_path, mp3_path):
    """Run one best-effort ffmpeg conversion of a raw s16le PCM file to mp3."""
    # An argument list (no shell) keeps paths with spaces or quotes intact and
    # avoids interpreting any part of a path as shell syntax.
    command = [
        ffmpeg_exe, '-loglevel', 'quiet', '-y',
        '-f', 's16le', '-i', pcm_path,
        '-ar', '44100', '-ac', '1', mp3_path,
    ]
    try:
        subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # Raised e.g. when the ffmpeg executable does not exist.
        logger.error(f'语音错误\n{traceback.format_exc()}')


def decode_audio_to_mp3(media_buffer, output_dir, filename):
    """Decode a silk voice buffer and convert it to ``<output_dir>/<filename>.mp3``.

    The raw buffer is written to ``<filename>.silk``, decoded with ``pysilk``
    into ``<filename>.pcm`` and converted with ffmpeg. The executable from
    ``get_ffmpeg_path()`` is used when it exists, otherwise
    ``<cwd>/app/resources/data/ffmpeg.exe``. The intermediate ``.silk`` and
    ``.pcm`` files are left in place.

    :param media_buffer: silk-encoded audio bytes
    :param output_dir: destination folder (created when missing)
    :param filename: base name of the output files, without extension
    :return: the mp3 path (returned even when decoding or conversion failed,
        in which case the file may not exist), or ``''`` when
        ``media_buffer`` is empty
    """
    silk_path = f"{output_dir}/{filename}.silk"
    pcm_path = f"{output_dir}/{filename}.pcm"
    mp3_path = f"{output_dir}/{filename}.mp3"
    if os.path.exists(mp3_path):
        # Already converted by an earlier run.
        return mp3_path
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    if not media_buffer:
        return ''
    with open(silk_path, "wb") as f:
        f.write(media_buffer)

    # Location used when running from source.
    ffmpeg_path = os.path.join(os.getcwd(), 'app', 'resources', 'data', 'ffmpeg.exe')
    try:
        pcm_buf = pysilk.decode(media_buffer, to_wav=False, sample_rate=44100)
        with open(pcm_path, 'wb') as f:
            f.write(pcm_buf)
        bundled_ffmpeg = get_ffmpeg_path()
        if os.path.exists(bundled_ffmpeg):
            ffmpeg_path = bundled_ffmpeg
    except Exception:
        # Best effort: log and fall through with the fallback executable, as
        # the previous implementation did after a failure.
        logger.error(f'语音错误\n{traceback.format_exc()}')

    # Without a PCM file there is nothing ffmpeg could convert.
    if os.path.exists(pcm_path):
        _pcm_to_mp3(ffmpeg_path, pcm_path, mp3_path)
    return mp3_path


def decode_audios(file_tasks: List[Tuple[str, str, str]]):
    """Convert a batch of voice buffers to mp3 on a pool of 10 threads.

    Each task is handed to ``decode_audio_to_mp3``; the call returns once
    every conversion has finished.

    :param file_tasks: List[
        (media buffer (forwarded as ``media_buffer``),
            output folder,
            output file name
            )]
    :return: None
    """
    if len(file_tasks) == 0:
        return
    with ThreadPoolExecutor(max_workers=10) as executor:
        pending = [
            executor.submit(decode_audio_to_mp3, media_buffer, output_dir, dst_name)
            for media_buffer, output_dir, dst_name in file_tasks
        ]

        # Block until every conversion is done; a task's exception surfaces here.
        for job in pending:
            job.result()


# (label, compiled pattern) pairs, applied in this order.
_PRIVACY_PATTERNS = (
    # mobile phone number, optionally prefixed with 86 / +86
    ('phone', re.compile(r'\b(\+?86[-\s]?)?1[3-9]\d{9}\b')),
    # e-mail address
    ('email', re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')),
    # ID card number: 18 characters (last one may be X) or 15 digits. The
    # alternatives are grouped so both word boundaries apply to each of them
    # and the 18-character form is tried first.
    ('id_card', re.compile(r'\b(?:\d{17}[\dXx]|\d{15})\b')),
    # password-like "key value" pairs
    ('password', re.compile(r'\b(?:password|pwd|pass|psw)[\s=:]*\S+\b')),
    # account-like "key value" pairs
    ('account', re.compile(r'\b(?:account|username|user|acct)[\s=:]*\S+\b')),
)


def remove_privacy_info(text):
    """Mask phone numbers, e-mail addresses, ID numbers, passwords and account names.

    Every match is replaced by ``[<kind> xxx]``, where ``<kind>`` is one of
    ``phone``, ``email``, ``id_card``, ``password`` or ``account``.

    :param text: text to sanitise; falsy values (``''``, ``None``) are
        returned unchanged
    :return: the sanitised text
    """
    if not text:
        return text
    for key, pattern in _PRIVACY_PATTERNS:
        text = pattern.sub(f'[{key} xxx]', text)
    return text


def get_new_filename(filename):
    """
    Return a path that does not exist yet, derived from ``filename``.

    When ``filename`` is free it is returned unchanged. Otherwise ``(1)``,
    ``(2)``, ... is inserted before the extension until a free name is found,
    e.g. ``contacts.csv`` -> ``contacts(1).csv`` and ``README`` -> ``README(1)``.

    @param filename: the desired path
    @return: the first free path; ``filename`` itself when none of the
        numbered candidates (1 .. 10085) is free
    """
    if not os.path.exists(filename):
        return filename
    # splitext handles names without an extension, which a manual
    # split('.') turned into '(1).<name>'.
    dir_name, basename = os.path.split(filename)
    stem, ext = os.path.splitext(basename)
    for i in range(1, 10086):
        new_filename = os.path.join(dir_name, f'{stem}({i}){ext}')
        if not os.path.exists(new_filename):
            return new_filename
    return filename