WeChatMsg/wxManager/manager_v3.py
2025-03-28 21:43:32 +08:00

701 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2024/12/11 20:43
@Author : SiYuan
@Email : 863909694@qq.com
@File : manager_v3.py
@Description : Unified access layer (DataBaseV3) over the WeChat v3 database files: messages, contacts, media and emoji.
"""
import concurrent
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from datetime import date
from typing import Tuple, List, Any
import xmltodict
from wxManager import MessageType
from wxManager.db_main import DataBaseInterface
from wxManager.db_v3.hard_link_file import HardLinkFile
from wxManager.db_v3.hard_link_image import HardLinkImage
from wxManager.db_v3.hard_link_video import HardLinkVideo
from wxManager.db_v3.misc import Misc
from wxManager.db_v3.msg import Msg
from wxManager.db_v3.media_msg import MediaMsg
from wxManager.db_v3.emotion import Emotion
from wxManager.db_v3.open_im_contact import OpenIMContactDB
from wxManager.db_v3.open_im_media import OpenIMMediaDB
from wxManager.db_v3.open_im_msg import OpenIMMsgDB
from wxManager.db_v3.public_msg import PublicMsg
from wxManager.db_v3.micro_msg import MicroMsg
from wxManager.db_v3.favorite import Favorite
from wxManager.log import logger
from wxManager.model.contact import Contact, Me, ContactType, Person
from wxManager.parser.file_parser import get_image_type
from wxManager.parser.util.protocbuf.roomdata_pb2 import ChatRoomData
from wxManager.parser.wechat_v3 import FACTORY_REGISTRY, parser_sub_type, Singleton
# Maps a raw (type, sub_type) pair taken from a message row to the key used to
# look up its parser in FACTORY_REGISTRY. Plain-string values are human-readable
# labels for kinds that have no MessageType member.
type_name_dict = {
    # --- basic message kinds -------------------------------------------------
    (1, 0): MessageType.Text,
    (3, 0): MessageType.Image,
    (34, 0): MessageType.Audio,
    (43, 0): MessageType.Video,
    (47, 0): MessageType.Emoji,
    (37, 0): "添加好友",
    (42, 0): MessageType.BusinessCard,
    (66, 0): MessageType.OpenIMBCard,
    (48, 0): MessageType.Position,
    # --- type 49 messages, distinguished by their sub type --------------------
    (49, 40): MessageType.FavNote,
    (49, 24): MessageType.FavNote,
    (49, 53): "接龙",
    (49, 0): MessageType.File,
    (49, 1): MessageType.Text2,
    (49, 3): MessageType.Music,
    (49, 76): MessageType.Music,
    (49, 5): MessageType.LinkMessage,
    (49, 6): MessageType.File,
    (49, 8): "用户上传的GIF表情",
    (49, 17): MessageType.System,  # location sharing was started
    (49, 19): MessageType.MergedMessages,
    (49, 33): MessageType.Applet,
    (49, 36): MessageType.Applet2,
    (49, 51): MessageType.WeChatVideo,
    (49, 57): MessageType.Quote,
    (49, 63): "视频号直播或直播回放等",
    (49, 87): "群公告",
    (49, 88): "视频号直播或直播回放等",
    (49, 2000): MessageType.Transfer,
    (49, 2003): "赠送红包封面",
    # --- calls and system notices ---------------------------------------------
    (50, 0): MessageType.Voip,
    (10000, 0): MessageType.System,
    (10000, 4): MessageType.Pat,
    (10000, 8000): MessageType.System,
}
def decodeExtraBuf(extra_buf_content: bytes):
    """Decode the tagged binary ``ExtraBuf`` blob of a contact row.

    The blob is scanned for known 4-byte tags. The byte following a tag
    selects the payload encoding:

    * ``0x04`` - a 4-byte little-endian integer;
    * ``0x18`` - a 4-byte little-endian byte length followed by that many
      bytes of UTF-16 text (trailing NULs are stripped).

    Fields whose tag is absent, or whose payload encoding is not one of the
    two above, keep their default value instead of invalidating the whole
    result.

    @param extra_buf_content: raw blob; ``None`` or empty yields the defaults.
    @return: dict with keys ``region`` (country, province, city), ``signature``,
        ``telephone`` and ``gender``. If decoding raises, the error is logged
        and the all-default dict is returned.
    """
    defaults = {
        "region": ('', '', ''),
        "signature": '',
        "telephone": '',
        "gender": 0,
    }
    if not extra_buf_content:
        return dict(defaults)

    # Tag -> internal field name. Other tags known from the format but not
    # extracted here: 81 AE 19 B4 (moments background url), F9 17 BC C0
    # (company name), 4E B9 6D 85 (WeCom attribute), 0E 71 9F 13 (remark image).
    field_tags = {
        b"\x46\xCF\x10\xC4": "signature",
        b"\xA4\xD9\x02\x4A": "country",
        b"\xE2\xEA\xA8\xD1": "province",
        b"\x1D\x02\x5B\xBF": "city",
        b"\x75\x93\x78\xAD": "telephone",
        b"\x74\x75\x2C\x06": "gender",
    }

    values = {}
    try:
        for tag, field in field_tags.items():
            pos = extra_buf_content.find(tag)
            if pos < 0:
                # Tag not present: skip it rather than parsing whatever
                # happens to sit at a stale offset.
                continue
            off = pos + len(tag)
            kind = extra_buf_content[off: off + 1]
            off += 1
            if kind == b"\x04":  # 4-byte little-endian integer
                values[field] = int.from_bytes(extra_buf_content[off: off + 4], "little")
            elif kind == b"\x18":  # length-prefixed UTF-16 string
                length = int.from_bytes(extra_buf_content[off: off + 4], "little")
                off += 4
                raw_text = extra_buf_content[off: off + length]
                values[field] = raw_text.decode("utf-16").rstrip("\x00")
    except Exception:
        # Malformed payload (e.g. invalid UTF-16): report it and fall back to
        # the defaults, as callers expect a dict in every case.
        logger.error(f'联系人解析错误:\n{traceback.format_exc()}')
        return dict(defaults)

    return {
        "region": (
            values.get("country", ''),
            values.get("province", ''),
            values.get("city", ''),
        ),
        "signature": values.get("signature", ''),
        "telephone": values.get("telephone", ''),
        "gender": values.get("gender", 0),
    }
def parser_messages(messages, username, db_dir=''):
    """Lazily turn raw message rows of one conversation into message objects.

    A fresh DataBaseV3 is created and initialised from ``db_dir`` when the
    generator is first advanced. The conversation's contacts are registered
    through ``Singleton.set_contacts`` before any row is parsed.

    @param messages: iterable of raw rows; index 2 holds the type, index 3 the
        sub type and index 7 the content handed to ``parser_sub_type``.
    @param username: conversation id (contact wxid, ``...@chatroom`` or ``...@openim``).
    @param db_dir: directory containing the database files.
    @return: generator of objects built by the matching FACTORY_REGISTRY entry
        (entry ``-1`` when the type is not registered).
    """
    context = DataBaseV3()
    context.init_database(db_dir)

    if username.endswith('@chatroom'):
        contacts = context.get_chatroom_members(username)
    else:
        # One-to-one chat: only the account owner and the peer take part.
        contacts = {}
        for wxid in (Me().wxid, username):
            contacts[wxid] = context.get_contact_by_username(wxid)
    Singleton.set_contacts(contacts)

    # For OpenIM conversations the sub type is derived from the content column.
    from_open_im = username.endswith('@openim')
    for row in messages:
        main_type = row[2]
        sub_type = parser_sub_type(row[7]) if from_open_im else row[3]
        factory_key = type_name_dict.get((main_type, sub_type))
        if factory_key not in FACTORY_REGISTRY:
            factory_key = -1
        yield FACTORY_REGISTRY[factory_key].create(row, username, context)
def _process_messages_batch(messages_batch, username, db_dir) -> List:
    """Parse one batch of raw rows eagerly and return the results as a list.

    Kept at module level so it can be submitted to a process pool.
    """
    return list(parser_messages(messages_batch, username, db_dir))
class DataBaseV3(DataBaseInterface):
    """Single entry point over the WeChat v3 database files.

    Wraps the per-file database helpers (messages, contacts, hard links,
    emoji, media, OpenIM) and exposes message, media and contact queries.
    """
    # TODO: fold the individual database helpers into this class so that only
    #  one interface is exposed to callers.

    # Message lists at least this long are parsed in worker processes.
    _PARALLEL_THRESHOLD = 20000
    # Approximate number of rows handed to each worker process.
    _BATCH_SIZE = 10000
    # Upper bound on the number of worker processes.
    _MAX_WORKERS = 16

    def __init__(self):
        super().__init__()
        self.db_dir = None
        # Cache: chatroom id -> {wxid: Contact}, filled by get_chatroom_members.
        self.chatroom_members_map = {}
        self.contacts_map = {}
        self.misc_db = Misc('Misc.db')
        self.msg_db = Msg('Multi/MSG0.db', is_series=True)
        self.public_msg_db = PublicMsg('PublicMsg.db')
        self.micro_msg_db = MicroMsg('MicroMsg.db')
        self.hard_link_image_db = HardLinkImage('HardLinkImage.db')
        self.hard_link_file_db = HardLinkFile('HardLinkFile.db')
        self.hard_link_video_db = HardLinkVideo('HardLinkVideo.db')
        self.emotion_db = Emotion('Emotion.db')
        self.media_msg_db = MediaMsg('Multi/MediaMSG0.db', is_series=True)
        self.open_contact_db = OpenIMContactDB('OpenIMContact.db')
        self.open_media_db = OpenIMMediaDB('OpenIMMedia.db')
        self.open_msg_db = OpenIMMsgDB('OpenIMMsg.db')
        # Optional stores that are not wired up in this class. They are kept
        # as explicit None so the methods using them can degrade gracefully
        # instead of raising AttributeError.
        self.audio_to_text = None
        self.favorite_db = None

    def _databases(self):
        """Return every managed database helper, in initialisation order."""
        return (
            self.misc_db,
            self.msg_db,
            self.public_msg_db,
            self.micro_msg_db,
            self.hard_link_image_db,
            self.hard_link_file_db,
            self.hard_link_video_db,
            self.emotion_db,
            self.media_msg_db,
            self.open_contact_db,
            self.open_media_db,
            self.open_msg_db,
        )

    def init_database(self, db_dir=''):
        """Load the owner's info and open every database under ``db_dir``.

        @param db_dir: directory holding ``info.json`` and the database files.
        @return: the results of all helpers' ``init_database`` combined with ``&``.
        """
        Me().load_from_json(os.path.join(db_dir, 'info.json'))  # load the account owner's info
        self.db_dir = db_dir
        flag = True
        # Every helper is initialised even after one has failed.
        for db in self._databases():
            flag &= db.init_database(db_dir)
        return flag

    def close(self):
        """Close every managed database helper."""
        for db in self._databases():
            db.close()

    def get_session(self):
        """
        Return the chat sessions shown in the conversation list.
        @return: whatever ``MicroMsg.get_session`` returns.
        """
        return self.micro_msg_db.get_session()

    def _message_db_for(self, username):
        """Pick the message database responsible for ``username``.

        ``gh_`` prefixed ids go to the public-account database, ``@openim``
        ids to the OpenIM database and everything else to the regular one.
        """
        if username.startswith('gh_'):
            return self.public_msg_db
        if username.endswith('@openim'):
            return self.open_msg_db
        return self.msg_db

    @staticmethod
    def _split_list(lst, n):
        """Split ``lst`` into ``n`` contiguous chunks whose sizes differ by at most one."""
        k, m = divmod(len(lst), n)
        return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

    def _parse_messages(self, messages, username_) -> List:
        """Parse raw rows, using a process pool once the list is large.

        @param messages: list of raw rows (``None`` is treated as empty).
        @param username_: conversation id the rows belong to.
        @return: unsorted list of parsed message objects.
        """
        if messages is None:
            messages = []
        if len(messages) < self._PARALLEL_THRESHOLD:
            return list(parser_messages(messages, username_, self.db_dir))

        batches = self._split_list(messages, len(messages) // self._BATCH_SIZE + 1)
        parsed = []
        with ProcessPoolExecutor(max_workers=min(len(batches), self._MAX_WORKERS)) as executor:
            futures = [
                executor.submit(_process_messages_batch, batch, username_, self.db_dir)
                for batch in batches
            ]
            # Collect in submission order; result() re-raises worker errors.
            for future in futures:
                parsed.extend(future.result())
        return parsed

    def get_messages(
            self,
            username_: str,
            time_range: Tuple[int | float | str | date, int | float | str | date] | None = None,
    ):
        """Return all parsed messages of a conversation, sorted ascending.

        @param username_: conversation id.
        @param time_range: optional (start, end) pair passed to the database layer.
        @return: sorted list of parsed message objects.
        """
        # TODO: stream results with yield instead of building the full list.
        import time
        started = time.time()
        # Timing diagnostics are informational, not errors.
        logger.info(f'开始获取聊天记录:{started}')
        messages = self._message_db_for(username_).get_messages_by_username(username_, time_range)
        res = self._parse_messages(messages, username_)
        finished = time.time()
        logger.info(f'获取聊天记录完成:{finished}')
        logger.info(f'获取聊天记录耗时:{finished - started:.2f}s/{len(res)}条消息')
        res.sort()
        return res

    def get_messages_by_num(self, username, start_sort_seq, msg_num=20):
        """
        Return up to ``msg_num`` messages older than ``start_sort_seq``.
        @param username: conversation id.
        @param start_sort_seq: sort sequence to page backwards from.
        @param msg_num: maximum number of messages to return.
        @return: (messages sorted newest first, sort_seq of the last returned
            message or 0 when there is none)
        """
        messages = self._message_db_for(username).get_messages_by_num(username, start_sort_seq, msg_num)
        result = []
        # The database layer hands back batches of rows here.
        for batch in messages:
            result.extend(parser_messages(batch, username, self.db_dir))
        result.sort(reverse=True)
        res = result[:msg_num]
        return res, res[-1].sort_seq if res else 0

    def get_message_by_server_id(self, username, server_id):
        """
        Look up a single message by its server id in the regular message database.
        @param username: conversation id.
        @param server_id: server-side message id.
        @return: the parsed message, or None when no row matches.
        """
        message = self.msg_db.get_message_by_server_id(username, server_id)
        if not message:
            return None
        return next(parser_messages([message], username, self.db_dir))

    def get_messages_all(self, time_range=None):
        """Return the raw rows of all regular messages, optionally limited to ``time_range``."""
        return self.msg_db.get_messages_all(time_range)

    def get_messages_calendar(self, username_):
        """Return the message calendar of a conversation as provided by the message database."""
        return self.msg_db.get_messages_calendar(username_)

    def get_messages_by_type(
            self,
            username_,
            type_: MessageType,
            time_range: Tuple[int | float | str | date, int | float | str | date] | None = None,
    ):
        """Return the parsed messages of one type in a conversation, sorted ascending.

        @param username_: conversation id.
        @param type_: message type to select.
        @param time_range: optional (start, end) pair passed to the database layer.
        @return: sorted list of parsed message objects.
        """
        messages = self._message_db_for(username_).get_messages_by_type(username_, type_, time_range)
        res = self._parse_messages(messages, username_)
        res.sort()
        return res

    def get_emoji_url(self, md5: str, thumb: bool = False) -> str | bytes:
        """Return the emoji URL (or data) for ``md5`` from the emotion database."""
        return self.emotion_db.get_emoji_URL(md5, thumb)

    def get_emoji_path(self, md5: str, output_path, thumb: bool = False, ) -> str:
        """
        Export an emoji to ``output_path`` unless the file already exists.
        @param md5: emoji md5, used as the file name.
        @param output_path: target directory.
        @param thumb: export the thumbnail (file name prefixed with ``th_``).
        @return: path of the emoji file; writing is best effort, so the file
            may be missing if the write failed.
        """
        data = self.emotion_db.get_emoji_data(md5, thumb)
        prefix = "th_" if thumb else ""
        suffix = '.' + get_image_type(data[:10])
        file_path = os.path.join(output_path, prefix + md5 + suffix)
        if not os.path.exists(file_path):
            try:
                with open(file_path, 'wb') as fp:
                    fp.write(data)
            except (OSError, TypeError) as e:
                # Keep the export best effort, but leave a trace of the failure.
                logger.error(f'failed to write emoji file {file_path}: {e}')
        return file_path

    def get_emoji_URL(self, md5: str, thumb: bool = False):
        """Same lookup as ``get_emoji_url``; kept for existing callers."""
        return self.emotion_db.get_emoji_URL(md5, thumb)

    # ---- images, videos, files ----

    def get_file(self, md5: bytes | str) -> str:
        """Resolve a file by md5 through the file hard-link database."""
        return self.hard_link_file_db.get_file(md5)

    def get_image(self, content, bytesExtra, up_dir="", md5=None, thumb=False, talker_username='') -> str:
        """Resolve an image through the image hard-link database.

        ``talker_username`` is accepted for interface compatibility and is not used.
        """
        return self.hard_link_image_db.get_image(content, bytesExtra, up_dir, md5, thumb)

    def get_video(self, content, bytesExtra, md5=None, thumb=False):
        """Resolve a video through the video hard-link database."""
        return self.hard_link_video_db.get_video(content, bytesExtra, md5, thumb)

    # ---- voice ----

    def get_media_buffer(self, server_id, is_open_im=False) -> bytes:
        """Return the media buffer of a message from the OpenIM or regular media database."""
        media_db = self.open_media_db if is_open_im else self.media_msg_db
        return media_db.get_media_buffer(server_id)

    def get_audio(self, reserved0, output_path, open_im=False, filename=''):
        """Export a voice message via the media database.

        @return: the media database's result, or None for OpenIM messages,
            whose audio export is not implemented here.
        """
        if open_im:
            return None
        return self.media_msg_db.get_audio(reserved0, output_path, filename)

    def get_audio_path(self, reserved0, output_path, filename=''):
        """Return the path a voice message is (or would be) exported to."""
        return self.media_msg_db.get_audio_path(reserved0, output_path, filename)

    def get_audio_text(self, msgSvrId):
        """Return the transcription of a voice message.

        The lookup is disabled in this implementation: an empty string is
        always returned.
        """
        return ''

    def update_audio_to_text(self):
        """Copy voice transcriptions found in messages into the audio-to-text store.

        Does nothing (apart from a warning) when no audio-to-text store is configured.
        """
        if self.audio_to_text is None:
            logger.warning('audio_to_text store is not configured; skipping update_audio_to_text')
            return
        for message in self.get_messages_all():
            if message[2] != 34:  # 34 is the voice message type
                continue
            str_content = message[7]
            msg_svr_id = message[9]
            voice_to_text = self.media_msg_db.get_audio_text(str_content)
            if voice_to_text:
                self.audio_to_text.add_text(msg_svr_id, voice_to_text)

    # ---- contacts ----

    def get_avatar_buffer(self, username) -> bytes:
        """Return the avatar bytes stored for ``username``."""
        return self.misc_db.get_avatar_buffer(username)

    def create_contact(self, contact_info_list) -> Person:
        """Build a Contact from a contact row of the MicroMsg database.

        Columns read: 0 wxid, 1 alias, 2 type bit field, 3 remark, 4 nickname,
        7/8 small/big avatar url, 9 ExtraBuf blob, 10 comma separated labels.

        @param contact_info_list: the contact row.
        @return: the populated Contact.
        """
        wxid = contact_info_list[0]
        nickname = contact_info_list[4]
        remark = contact_info_list[3]
        if not nickname and wxid.endswith('@chatroom'):
            # Unnamed group chat: derive a name from its members.
            nickname = self._get_chatroom_name(wxid)
        if not remark:
            remark = nickname
        label_list = contact_info_list[10].split(',') if contact_info_list[10] else []

        gender = '未知'
        signature = ''
        region = ('', '', '')
        detail = decodeExtraBuf(contact_info_list[9])
        if detail:
            # Gender code 1 is male, 2 is female; anything else stays unknown.
            gender = {1: '男', 2: '女'}.get(detail.get('gender', 0), '未知')
            signature = detail.get('signature', '')
            region = detail.get('region', ('', '', ''))

        type_ = contact_info_list[2]
        contact = Contact(
            wxid=wxid,
            remark=remark,
            alias=contact_info_list[1],
            nickname=nickname,
            small_head_img_url=contact_info_list[7],
            big_head_img_url=contact_info_list[8],
            # NOTE(review): column 3 is also read as the remark above - confirm
            # that it is the intended source for ``flag``.
            flag=contact_info_list[3],
            gender=gender,
            signature=signature,
            label_list=label_list,
            region=region
        )
        contact.type = ContactType.Normal
        if wxid.startswith('gh_'):
            contact.type |= ContactType.Public
        elif wxid.endswith('@chatroom'):
            contact.type |= ContactType.Chatroom
        if type_ & (1 << 6):  # bit 6 set: starred contact
            contact.type |= ContactType.Star
        if type_ & (1 << 11):  # bit 11 set: pinned conversation
            contact.type |= ContactType.Sticky
        if type_ == 10086:
            contact.type = ContactType.Unknown
            contact.is_unknown = True
        return contact

    def create_open_im_contact(self, contact_info_list) -> Person:
        """Build a Contact from a contact row of the OpenIM contact database.

        Columns read: 0 user name (also used as alias), 1 nickname, 3 remark,
        4/5 big/small avatar url, 13 wording appended to the remark.

        @param contact_info_list: the contact row.
        @return: the populated Contact, typed Normal | OpenIM.
        """
        wxid = contact_info_list[0]
        nickname = contact_info_list[1]
        remark = contact_info_list[3]
        if not nickname and wxid.endswith('@chatroom'):
            nickname = self._get_chatroom_name(wxid)
        if not remark:
            remark = nickname
        contact = Contact(
            wxid=wxid,
            alias=wxid,
            remark=f'{remark}@{contact_info_list[13]}',
            nickname=nickname,
            small_head_img_url=contact_info_list[5],
            big_head_img_url=contact_info_list[4],
        )
        contact.type = ContactType.Normal
        contact.type |= ContactType.OpenIM
        return contact

    def get_contacts(self) -> List[Person]:
        """Return all regular contacts followed by all OpenIM contacts."""
        contacts = [self.create_contact(row) for row in self.micro_msg_db.get_contact()]
        contacts.extend(
            self.create_open_im_contact(row) for row in self.open_contact_db.get_contacts()
        )
        return contacts

    def set_remark(self, username: str, remark) -> bool:
        """Update a contact's remark in the cache (if cached) and in its database."""
        if username in self.contacts_map:
            self.contacts_map[username].remark = remark
        if username.endswith('@openim'):
            return self.open_contact_db.set_remark(username, remark)
        return self.micro_msg_db.set_remark(username, remark)

    def set_avatar_buffer(self, username, avatar_path):
        """Store the avatar at ``avatar_path`` for ``username``."""
        return self.misc_db.set_avatar_buffer(username, avatar_path)

    def get_contact_by_username(self, wxid: str) -> Contact:
        """Return the Contact for ``wxid``.

        When no row exists, a placeholder Contact whose nickname and remark
        are the wxid itself is returned.
        """
        if wxid.endswith('@openim'):
            row = self.open_contact_db.get_contact_by_username(wxid)
            if row:
                return self.create_open_im_contact(row)
        else:
            row = self.micro_msg_db.get_contact_by_username(wxid)
            if row:
                return self.create_contact(row)
        return Contact(
            wxid=wxid,
            nickname=wxid,
            remark=wxid
        )

    def get_chatroom_members(self, chatroom_name) -> dict[Any, Contact] | Any:
        """
        Return the members of a group chat (WeCom contacts are not included).
        Results are cached per chatroom; a member's group display name, when
        set, replaces the contact's remark.
        @param chatroom_name: chatroom id.
        @return: dict mapping wxid to Contact; empty when the chatroom is unknown.
        """
        if chatroom_name in self.chatroom_members_map:
            return self.chatroom_members_map[chatroom_name]
        result = {}
        chatroom = self.micro_msg_db.get_chatroom_info(chatroom_name)
        if chatroom is None:
            return result
        # Parse the RoomData protobuf blob.
        room_data = ChatRoomData()
        room_data.ParseFromString(chatroom[1])
        for mem in room_data.members:
            contact = self.get_contact_by_username(mem.wxID)
            if contact:
                if mem.displayName:
                    contact.remark = mem.displayName
                result[contact.wxid] = contact
        self.chatroom_members_map[chatroom_name] = result
        return result

    def _get_chatroom_name(self, wxid):
        """
        Build a display name for a group chat that has no name of its own.
        The name is made of the first five members (the account owner is
        skipped), using the group display name or else the contact remark.
        @param wxid: chatroom id.
        @return: the joined member names, or '' when the chatroom is unknown.
        """
        chatroom = self.micro_msg_db.get_chatroom_info(wxid)
        if chatroom is None:
            return ''
        # Parse the RoomData protobuf blob.
        room_data = ChatRoomData()
        room_data.ParseFromString(chatroom[1])
        my_wxid = Me().wxid
        names = []
        for mem in room_data.members[:5]:
            if mem.wxID == my_wxid:
                continue
            if mem.displayName:
                names.append(str(mem.displayName))
            else:
                names.append(str(self.get_contact_by_username(mem.wxID).remark))
        # Separate the names so that they stay distinguishable.
        return '、'.join(names)

    # ---- end of contacts ----

    def add_audio_txt(self, msgSvrId, text):
        """Store the transcription of a voice message.

        @return: the store's result, or False when no audio-to-text store is configured.
        """
        if self.audio_to_text is None:
            logger.warning('audio_to_text store is not configured; transcription not saved')
            return False
        return self.audio_to_text.add_text(msgSvrId, text)

    def get_favorite_items(self, time_range):
        """Return favorite items within ``time_range``.

        @return: the favorite database's result, or an empty list when no
            favorite database is configured.
        """
        if self.favorite_db is None:
            logger.warning('favorite_db is not configured; returning no favorite items')
            return []
        return self.favorite_db.get_items(time_range)

    def merge(self, db_dir):
        """Merge the databases found under ``db_dir`` into the managed ones.

        Each database is merged in its own thread; the outcome of every merge
        is printed and a failure does not stop the others.

        @param db_dir: directory containing the databases to merge in.
        """
        merge_tasks = [
            (self.msg_db, os.path.join(db_dir, 'Multi', 'MSG0.db')),
            (self.media_msg_db, os.path.join(db_dir, 'Multi', 'MediaMSG0.db')),
            (self.misc_db, os.path.join(db_dir, 'Misc.db')),
            (self.micro_msg_db, os.path.join(db_dir, 'MicroMsg.db')),
            (self.emotion_db, os.path.join(db_dir, 'Emotion.db')),
            (self.hard_link_file_db, os.path.join(db_dir, 'HardLinkFile.db')),
            (self.hard_link_image_db, os.path.join(db_dir, 'HardLinkImage.db')),
            (self.hard_link_video_db, os.path.join(db_dir, 'HardLinkVideo.db')),
            (self.open_contact_db, os.path.join(db_dir, 'OpenIMContact.db')),
            (self.open_media_db, os.path.join(db_dir, 'OpenIMMedia.db')),
            (self.open_msg_db, os.path.join(db_dir, 'OpenIMMsg.db')),
            (self.public_msg_db, os.path.join(db_dir, 'PublicMsg.db')),
        ]

        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(db.merge, path): path for db, path in merge_tasks}
            # Report each merge as soon as it finishes.
            for future in concurrent.futures.as_completed(futures):
                path = futures[future]
                try:
                    future.result()  # re-raises the worker's exception, if any
                    print(f"成功合并数据库: {path}")
                except Exception as e:
                    print(f"合并 {path} 失败: {e}")