diff --git a/exporter/exporter_xlsx.py b/exporter/exporter_xlsx.py
index 59ad159..241dc02 100644
--- a/exporter/exporter_xlsx.py
+++ b/exporter/exporter_xlsx.py
@@ -188,8 +188,6 @@ class ExcelExporter(ExporterBase):
                     logger.error(traceback.format_exc())
                     continue
                 type_ = message.type
-                timestamp = message.timestamp
-                msgSvrId = message.server_id
                 if type_ == MessageType.Image:
                     message.set_file_name()
                     image_index[message.server_id] = self.row
@@ -256,6 +254,8 @@ class ExcelExporter(ExporterBase):
         if MessageType.Image in self.message_types:
             for index, message in enumerate(messages):
                 if message.type == MessageType.Image:
+                    if not self.is_selected(message):
+                        continue
                     row = image_index[message.server_id]
                     img_path = find_image_with_known_extensions(os.path.join(self.origin_path, message.path))
                     if not img_path:
diff --git a/wxManager/db_v3/open_im_msg.py b/wxManager/db_v3/open_im_msg.py
index a6e6e45..a49d288 100644
--- a/wxManager/db_v3/open_im_msg.py
+++ b/wxManager/db_v3/open_im_msg.py
@@ -21,6 +21,7 @@ from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, date
 from typing import Tuple
 
+from wxManager import MessageType
 from wxManager.merge import increase_data, increase_update_data
 from wxManager.log import logger
 from wxManager.model import DataBaseBase
@@ -61,6 +62,45 @@ def convert_to_timestamp(time_range) -> Tuple[int, int]:
     return convert_to_timestamp_(time_range[0]), convert_to_timestamp_(time_range[1])
 
 
+def get_local_type(type_: MessageType):
+    """Return the (Type, SubType) pair stored in the MSG table for type_.
+
+    Every key is a MessageType and every value a pair, so callers can always
+    unpack the result. Where several pairs share one MessageType, the pair
+    that the duplicate-key dict literal previously resolved to is kept.
+    Unknown types yield (0, 0).
+    """
+    type_name_dict = {
+        MessageType.Text: (1, 0),
+        MessageType.Image: (3, 0),
+        MessageType.Audio: (34, 0),
+        MessageType.Video: (43, 0),
+        MessageType.Emoji: (47, 0),
+        MessageType.BusinessCard: (42, 0),
+        MessageType.OpenIMBCard: (66, 0),
+        MessageType.Position: (48, 0),
+        MessageType.FavNote: (49, 24),  # (49, 40) is also a FavNote
+        MessageType.File: (49, 6),  # (49, 0) is also a File
+        MessageType.Text2: (49, 1),
+        MessageType.Music: (49, 76),  # (49, 3) is also Music
+        MessageType.LinkMessage: (49, 5),
+        MessageType.System: (49, 17),  # location sharing started
+        MessageType.MergedMessages: (49, 19),
+        MessageType.Applet: (49, 33),
+        MessageType.Applet2: (49, 36),
+        MessageType.WeChatVideo: (49, 51),
+        MessageType.Quote: (49, 57),
+        MessageType.Transfer: (49, 2000),
+        MessageType.Voip: (50, 0),
+        MessageType.Pat: (10000, 4),
+        # Sub-types of 49 with no MessageType of their own: 8 (user-uploaded
+        # GIF sticker), 53 (chain-reply list), 63 and 88 (channel live stream
+        # or replay), 87 (group announcement), 2003 (red-packet cover gift).
+        # (10000, 0) and (10000, 8000) are further System pairs.
+    }
+    return type_name_dict.get(type_, (0, 0))
+
+
 class OpenIMMsgDB(DataBaseBase):
 
     def _get_messages_by_num(self, cursor, username_, start_sort_seq, msg_num):
@@ -134,6 +174,29 @@ class OpenIMMsgDB(DataBaseBase):
 
         return None
 
+    def _get_messages_by_type(self, cursor, username: str, type_: MessageType,
+                              time_range: Tuple[int | float | str | date, int | float | str | date] = None, ):
+        if time_range:
+            start_time, end_time = convert_to_timestamp(time_range)
+        local_type, sub_type = get_local_type(type_)
+        sql = f'''
+            select localId,TalkerId,Type,SubType,IsSender,CreateTime,Status,StrContent,strftime('%Y-%m-%d %H:%M:%S',CreateTime,'unixepoch','localtime') as StrTime,MsgSvrID,BytesExtra,CompressContent,DisplayContent
+            from MSG
+            where StrTalker=? and Type=? and SubType = ?
+            {'AND CreateTime>' + str(start_time) + ' AND CreateTime<' + str(end_time) if time_range else ''}
+            order by CreateTime
+        '''
+        cursor.execute(sql, [username, local_type, sub_type])
+        result = cursor.fetchall()
+        if result:
+            return result
+        else:
+            return None
+
+    def get_messages_by_type(self, username: str, type_: MessageType,
+                             time_range: Tuple[int | float | str | date, int | float | str | date] = None, ):
+        return self._get_messages_by_type(self.DB.cursor(), username, type_, time_range)
+
     def merge(self, db_path):
         if not (os.path.exists(db_path) or os.path.isfile(db_path)):
             print(f'{db_path} 不存在')
@@ -143,4 +206,4 @@ class OpenIMMsgDB(DataBaseBase):
             increase_data(db_path, self.cursor, self.DB, 'ChatCRMsg', 'MsgSvrID', 1, exclude_column='localId')
         except:
             print(f"数据库操作错误: {traceback.format_exc()}")
-            self.DB.rollback()
\ No newline at end of file
+            self.DB.rollback()
diff --git a/wxManager/db_v3/public_msg.py b/wxManager/db_v3/public_msg.py
index a10d739..946fa77 100644
--- a/wxManager/db_v3/public_msg.py
+++ b/wxManager/db_v3/public_msg.py
@@ -8,79 +8,36 @@ from datetime import date
 from typing import Tuple
 from concurrent.futures import ThreadPoolExecutor
 
+from wxManager import MessageType
 from wxManager.merge import increase_data
-from wxManager.db_v3.msg import convert_to_timestamp
+from wxManager.db_v3.msg import convert_to_timestamp, get_local_type
 from wxManager.model import DataBaseBase
 
 
 class PublicMsg(DataBaseBase):
-    def get_messages(
-            self,
-            username_: str,
-            time_range: Tuple[int | float | str | date, int | float | str | date] = None,
-    ):
-        """
-        return list
-            a[0]: localId,
-            a[1]: talkerId, (和strtalker对应的,不是群聊信息发送人)
-            a[2]: type,
-            a[3]: subType,
-            a[4]: is_sender,
-            a[5]: timestamp,
-            a[6]: status, (没啥用)
-            a[7]: str_content,
-            a[8]: str_time, (格式化的时间)
-            a[9]: msgSvrId,
-            a[10]: BytesExtra,
-            a[11]: CompressContent,
-            a[12]: DisplayContent,
-            a[13]: 联系人的类(如果是群聊就有,不是的话没有这个字段)
-        """
-        if not self.open_flag:
-            return []
+    def _get_messages_by_type(self, cursor, username: str, type_: MessageType,
+                              time_range: Tuple[int | float | str | date, int | float | str | date] = None, ):
         if time_range:
             start_time, end_time = convert_to_timestamp(time_range)
+        local_type, sub_type = get_local_type(type_)
+        # Keep reading the PublicMsg table, as the query this replaces did (not MSG).
         sql = f'''
             select localId,TalkerId,Type,SubType,IsSender,CreateTime,Status,StrContent,strftime('%Y-%m-%d %H:%M:%S',CreateTime,'unixepoch','localtime') as StrTime,MsgSvrID,BytesExtra,CompressContent,DisplayContent
             from PublicMsg
-            where StrTalker=?
+            where StrTalker=? and Type=? and SubType = ?
             {'AND CreateTime>' + str(start_time) + ' AND CreateTime<' + str(end_time) if time_range else ''}
             order by CreateTime
         '''
-        try:
-            lock.acquire(True)
-            self.cursor.execute(sql, [username_])
-            result = self.cursor.fetchall()
-        finally:
-            lock.release()
-        return result
+        cursor.execute(sql, [username, local_type, sub_type])
+        result = cursor.fetchall()
+        if result:
+            return result
+        else:
+            return None
 
-    def get_messages_by_type(
-            self,
-            username_: str,
-            type_,
-            sub_type=None,
-            time_range: Tuple[int | float | str | date, int | float | str | date] = None,
-    ):
-        if not self.open_flag:
-            return []
-        if time_range:
-            start_time, end_time = convert_to_timestamp(time_range)
-        sql = f'''
-            select localId,TalkerId,Type,SubType,IsSender,CreateTime,Status,StrContent,strftime('%Y-%m-%d %H:%M:%S',CreateTime,'unixepoch','localtime') as StrTime,MsgSvrID,BytesExtra,CompressContent,DisplayContent
-            from PublicMsg
-            where StrTalker=? AND Type=? 
-            {'AND SubType=' + str(sub_type) if sub_type else ''}
-            {'AND CreateTime>' + str(start_time) + ' AND CreateTime<' + str(end_time) if time_range else ''}
-            order by CreateTime
-        '''
-        try:
-            lock.acquire(True)
-            self.cursor.execute(sql, [username_, type_])
-            result = self.cursor.fetchall()
-        finally:
-            lock.release()
-        return result
+    def get_messages_by_type(self, username: str, type_: MessageType,
+                             time_range: Tuple[int | float | str | date, int | float | str | date] = None, ):
+        return self._get_messages_by_type(self.DB.cursor(), username, type_, time_range)
 
     def get_sport_score_by_name(self, username,
                                 time_range: Tuple[int | float | str | date, int | float | str | date] = None, ):
@@ -126,7 +83,7 @@ class PublicMsg(DataBaseBase):
 
     def get_messages_by_username(self, username: str,
                                  time_range: Tuple[int | float | str | date, int | float | str | date] = None, ):
-        return self._get_messages_by_username(self.DB.cursor(),username,time_range)
+        return self._get_messages_by_username(self.DB.cursor(), username, time_range)
 
     def get_message_by_server_id(self, username, server_id):
         """
diff --git a/wxManager/parser/emoji_parser.py b/wxManager/parser/emoji_parser.py
index ff04bed..1cc0268 100644
--- a/wxManager/parser/emoji_parser.py
+++ b/wxManager/parser/emoji_parser.py
@@ -26,7 +26,7 @@ def parser_emoji(xml_content):
         'height': 0,
         'desc': ''
     }
-    xml_content = xml_content.strip()
+    xml_content = xml_content.strip().replace('&', '&amp;')
     try:
         xml_dict = xmltodict.parse(xml_content)
         emoji_dic = xml_dict.get('msg', {}).get('emoji', {})
diff --git a/wxManager/parser/link_parser.py b/wxManager/parser/link_parser.py
index 5078126..970ff9e 100644
--- a/wxManager/parser/link_parser.py
+++ b/wxManager/parser/link_parser.py
@@ -9,6 +9,7 @@
 @Description : 
 """
 import html
+import re
 import traceback
 from datetime import datetime, timedelta
 import xml.etree.ElementTree as ET
@@ -206,13 +207,27 @@ def parser_business(xml_content):
     return result
 
 
+def replace_entity(match):
+    # Drop the numeric character reference instead of decoding it.
+    return ''
+
+
+def process_xml(xml_string):
+    # Remove every decimal character reference (&#NNN;) so the XML can be parsed.
+    processed_xml = re.sub(r'&#(\d+);', replace_entity, xml_string)
+    return processed_xml
+
+
 def parser_record_item(recorditem, output_dir, wxid, msg_time, level=0):
     xml_string = recorditem
     if isinstance(xml_string, dict):
         recorditem_dic = xml_string
     else:
-        recorditem_dic = xmltodict.parse(xml_string)
-
+        try:
+            recorditem_dic = xmltodict.parse(xml_string)
+        except Exception:
+            xml_string = process_xml(xml_string)
+            recorditem_dic = xmltodict.parse(xml_string)
     # logger.error(recorditem_dic)
     datalist = recorditem_dic.get('recordinfo', {}).get('datalist', {})
     count = datalist.get('@count', 0)
@@ -522,7 +537,7 @@ def parser_record_item(recorditem, output_dir, wxid, msg_time, level=0):
     return result
 
 
-def parser_merged_messages(xml, output_dir, wxid, msg_time, level=0):
+def parser_merged_messages(xml: str, output_dir, wxid, msg_time, level=0):
     try:
         try:
             data_dic = xmltodict.parse(xml).get('msg', {})
@@ -543,8 +558,8 @@ def parser_merged_messages(xml, output_dir, wxid, msg_time, level=0):
         }
     except:
         logger.error(xml)
-        logger.error(new_xml1)
-        logger.error(new_xml2)
+        # logger.error(new_xml1)
+        # logger.error(new_xml2)
         logger.error(traceback.format_exc())
         # raise ValueError('合并转发的消息解析失败')
         return {
diff --git a/wxManager/parser/util/protocbuf/packed_info_data_img2.proto b/wxManager/parser/util/protocbuf/packed_info_data_img2.proto
index e288c54..f4e3c62 100644
--- a/wxManager/parser/util/protocbuf/packed_info_data_img2.proto
+++ b/wxManager/parser/util/protocbuf/packed_info_data_img2.proto
@@ -3,9 +3,11 @@ syntax = "proto3";
 message PackedInfoDataImg2 {
   int32 field1 = 1;
   int32 field2 = 2;
-  ImageInfo imageInfo = 3;
-  VideoInfo videoInfo = 4;
-  FileInfo fileInfo = 7;
+  ImageInfo imageInfo = 3; // image
+  VideoInfo videoInfo = 4; // video
+  AudioInfo audioInfo = 5; // voice message
+  FileInfo fileInfo = 7; // file
+  MergeInfo mergeInfo = 9; // merged-and-forwarded chat history
 }
 
 message ImageInfo {
@@ -35,4 +37,13 @@ message FileSubMessage2 {
   string field1 = 1;
   string field2 = 2;
   string field3 = 3;
+}
+
+message MergeInfo {
+  string dir = 1;
+}
+
+message AudioInfo {
+  uint32 field1 = 1;
+  string audioTxt = 2; // speech-to-text result
 }
\ No newline at end of file
diff --git a/wxManager/parser/wechat_v4.py b/wxManager/parser/wechat_v4.py
index 99bb42e..32ac546 100644
--- a/wxManager/parser/wechat_v4.py
+++ b/wxManager/parser/wechat_v4.py
@@ -190,7 +190,7 @@ class Singleton:
             self.contacts[wxid] = manager.get_contact_by_username(wxid)
         if isinstance(message[12], bytes):
             message_content = decompress(message[12])
-            message_content = message_content.replace('', '').replace(' ', ' ')
+            message_content = message_content.replace('&#x01;', '').replace(' ', ' ')
             # logger.error(message_content)
         else:
             message_content = message[12]
@@ -198,7 +198,9 @@ class Singleton:
                 2] != MessageType.Pat:
             # 群聊文字消息格式::
             message_content = ':'.join(message_content.split(':')[1:]).strip()
-
+        if message_content and message_content.startswith(username):
+            # WeChat 4.0.3.22 may prefix the content with "<username>:"; remove only that prefix.
+            message_content = message_content.removeprefix(f'{username}:').replace('', '')
         return is_sender, wxid, message_content
 
 
@@ -876,7 +878,7 @@ class FileMessageFactory(MessageFactory, Singleton):
         is_sender, wxid, message_content = self.common_attribute(message, username, manager)
         info = parser_file(message_content)
         md5 = info.get('md5', '')
-        filename = info.get('filename','')
+        filename = info.get('filename', '')
         if not filename:
             try:
                 # 2025年3月微信4.0.3正式版修改了img命名方式才有了这个东西