From 2cb3f4e5b84ed5a53c6173e84a63c2fd10b6f783 Mon Sep 17 00:00:00 2001
From: STDquantum <405720329@qq.com>
Date: Sun, 10 Dec 2023 11:19:22 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86=E5=AF=B9BytesExtra?=
 =?UTF-8?q?=E7=9A=84=E8=A7=A3=E6=9E=90=EF=BC=8C=E5=9F=BA=E6=9C=AC=E8=BE=BE?=
 =?UTF-8?q?=E5=88=B0=E5=9B=BE=E7=89=87=E8=A7=86=E9=A2=91=E5=BE=AE=E4=BF=A1?=
 =?UTF-8?q?=E8=83=BD=E7=9C=8B=E6=9C=AC=E5=9C=B0=E5=B0=B1=E8=83=BD=E7=9C=8B?=
 =?UTF-8?q?=E7=9A=84=E7=A8=8B=E5=BA=A6=E3=80=82=E5=85=B6=E4=BB=96=E7=B1=BB?=
 =?UTF-8?q?=E5=9E=8B=E6=B6=88=E6=81=AF=E7=9A=84BytesExtra=E4=B9=9F?=
 =?UTF-8?q?=E9=A1=BA=E5=B8=A6=E8=A7=A3=E6=9E=90=E4=BA=86=EF=BC=88=E4=B8=80?=
 =?UTF-8?q?=E4=B8=AA=E9=81=93=E7=90=86=EF=BC=8C=E4=BB=A5=E5=90=8E=E5=8F=AF?=
 =?UTF-8?q?=E4=BB=A5=E4=BB=8E=E8=BF=99=E9=87=8C=E6=89=A9=E5=B1=95=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                |   3 +-
 app/DataBase/hard_link.py | 149 +++++++++++++++++++++++++++++++++++++-
 app/DataBase/msg.py       |   2 +-
 app/DataBase/output_pc.py |   5 +-
 4 files changed, 153 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index b027ad4..1dd9b9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,5 @@ app/DataBase/Msg/*
 *.pyc
 *.log
 *.spec
-test*
\ No newline at end of file
+test*
+wordcloud.html
\ No newline at end of file
diff --git a/app/DataBase/hard_link.py b/app/DataBase/hard_link.py
index fdc4014..90c77ea 100644
--- a/app/DataBase/hard_link.py
+++ b/app/DataBase/hard_link.py
@@ -30,6 +30,139 @@ def get_md5_from_xml(content, type_='img'):
         return None
 
 
+
+class tencent_struct:
+    """Minimal reader for the protobuf-style ``BytesExtra`` message blob.
+
+    A schema maps a field number to ``(field_name, op)``; ``op`` is either a
+    key of ``__contenttype__`` or a nested schema dict.
+    """
+
+    def __setVals__(self, data, off):
+        """Switch to ``data`` when it is non-empty, then seek to ``off``."""
+        if data:
+            self.__data = data
+        if self.__data:
+            self.__size = len(self.__data)
+        self.__off = off
+
+    def __readString(self):
+        """Read a length-prefixed byte run and decode it as UTF-8."""
+        return self.__readData().decode("utf-8")
+
+    def __readUleb(self):
+        """Read one unsigned LEB128 (varint) integer and advance past it."""
+        value = 0
+        shift = 0
+        while True:
+            byte = self.__data[self.__off]
+            self.__add()
+            value |= (byte & 0x7F) << shift
+            # A clear high bit marks the final byte of the varint.
+            if not byte & 0x80:
+                return value
+            shift += 7
+
+    def __readData(self):
+        """Read a varint length followed by that many raw bytes."""
+        length = self.__readUleb()
+        data = self.__data[self.__off : self.__off + length]
+        # Slicing clamps silently; __add() is what detects truncation.
+        self.__add(length)
+        return data
+
+    def __init__(self, data=None, off=0):
+        """Create a reader over ``data`` positioned at byte offset ``off``."""
+        self.__data = data
+        self.__off = off
+        self.__size = len(data) if data else 0
+
+    def __add(self, value=1):
+        """Advance the offset by ``value`` bytes.
+
+        Raises:
+            IndexError: if the new offset lies past the end of the data.
+        """
+        self.__off += value
+        if self.__off > self.__size:
+            # ``raise "text"`` is itself a TypeError in Python 3.
+            raise IndexError("偏移量超出size")
+
+    def readStruct(self, struct_type):
+        """Decode the fields at the current offset according to a schema.
+
+        Args:
+            struct_type: a schema dict, or the name of a schema attribute of
+                this class (for example ``"__bytesExtra__"``).
+
+        Returns:
+            A dict keyed by field number.  A scalar field maps to
+            ``(field_name, value)``; a nested field maps to a list of
+            ``(field_name, decoded_dict)`` tuples, one per occurrence.
+            Decoding stops at field number 0, at a field number missing
+            from the schema, or at a field whose op is ``""``.
+        """
+        if isinstance(struct_type, str):
+            current_dict = getattr(self, struct_type)
+        else:
+            current_dict = struct_type
+        res = {}
+        while self.__off < self.__size:
+            # Keep the field number; the low three tag bits are dropped.
+            key = self.__readUleb() >> 3
+            if key == 0 or key not in current_dict:
+                break
+            fieldName = current_dict[key][0]
+            op = current_dict[key][1]
+            if isinstance(op, dict):
+                current_struct = self.__readData()
+                recursion = tencent_struct(current_struct)
+                res.setdefault(key, []).append(
+                    (fieldName, recursion.readStruct(op))
+                )
+            elif op != "":
+                res[key] = (fieldName, self.__contenttype__[op](self))
+            else:
+                break
+        return res
+
+    # Schemas: field number -> (field name, op).
+    __struct1__ = {1: ("", "I"), 2: ("", "I")}
+
+    __msgInfo__ = {1: ("", "I"), 2: ("msg_info", "s")}
+
+    __bytesExtra__ = {
+        1: ("", __struct1__),
+        3: ("msg_info_struct", __msgInfo__),
+    }
+
+    def get_bytesExta_Content(self, data=None, off=0):
+        """Decode ``data`` (or the data already held) as ``__bytesExtra__``.
+
+        Raises:
+            IndexError: if the data ends in the middle of a field.
+            UnicodeDecodeError: if a string field is not valid UTF-8.
+        """
+        self.__setVals__(data, off)
+        return self.readStruct("__bytesExtra__")
+
+    # Op code -> reader.  This must follow the reader definitions because the
+    # class body looks the functions up by name when the dict is built.
+    __contenttype__ = {
+        "s": __readString,
+        "I": __readUleb,
+        "P": __readData,
+    }
+
+
+def parseBytes(content: bytes):
+    """Decode a BytesExtra blob; return None when it cannot be parsed."""
+    try:
+        return tencent_struct().get_bytesExta_Content(content)
+    except Exception:  # best effort: a malformed blob must not abort export
+        return None
+
+
 def singleton(cls):
     _instance = {}
 
@@ -115,7 +248,13 @@ class HardLink:
         finally:
             video_db_lock.release()
 
-    def get_image(self, content, thumb=False):
+    def get_image(self, content, bytesExtra=None, thumb=False):
+        # Prefer the path stored in BytesExtra; else fall through to the md5.
+        wanted = 3 if thumb else 4
+        for _, info in (parseBytes(bytesExtra) or {}).get(3, []):
+            if info.get(1, (None, None))[1] == wanted and 2 in info:
+                # Stored as wxid\FileStorage\...; drop the leading wxid part.
+                return "\\".join(info[2][1].split('\\')[1:])
         md5 = get_md5_from_xml(content)
         if not md5:
             return None
@@ -129,7 +268,13 @@ class HardLink:
             dat_image = os.path.join(root_path, dir1, dir0, dir2, data_image)
             return dat_image
 
-    def get_video(self, content, thumb=False):
+    def get_video(self, content, bytesExtra=None, thumb=False):
+        # Prefer the path stored in BytesExtra; else fall through to the md5.
+        wanted = 3 if thumb else 4
+        for _, info in (parseBytes(bytesExtra) or {}).get(3, []):
+            if info.get(1, (None, None))[1] == wanted and 2 in info:
+                # Stored as wxid\FileStorage\...; drop the leading wxid part.
+                return "\\".join(info[2][1].split('\\')[1:])
         md5 = get_md5_from_xml(content, type_='video')
         if not md5:
             return None
diff --git a/app/DataBase/msg.py b/app/DataBase/msg.py
index c08c673..e4410ee 100644
--- a/app/DataBase/msg.py
+++ b/app/DataBase/msg.py
@@ -56,7 +56,7 @@ class Msg:
         if not self.open_flag:
             return None
         sql = '''
-            select localId,TalkerId,Type,SubType,IsSender,CreateTime,Status,StrContent,strftime('%Y-%m-%d %H:%M:%S',CreateTime,'unixepoch','localtime') as StrTime,MsgSvrID
+            select localId,TalkerId,Type,SubType,IsSender,CreateTime,Status,StrContent,strftime('%Y-%m-%d %H:%M:%S',CreateTime,'unixepoch','localtime') as StrTime,MsgSvrID,BytesExtra
             from MSG
             where StrTalker=?
             order by CreateTime
diff --git a/app/DataBase/output_pc.py b/app/DataBase/output_pc.py
index d41eccb..806d315 100644
--- a/app/DataBase/output_pc.py
+++ b/app/DataBase/output_pc.py
@@ -637,6 +637,7 @@ const chatMessages = [
             str_time = message[8]
             # print(type_, type(type_))
             is_send = message[4]
+            BytesExtra = message[10]
             # avatar = MePC().avatar_path if is_send else self.contact.avatar_path
             # avatar = avatar.replace('\\', '\\\\')
             avatar = 'myhead.png' if is_send else 'tahead.png'
@@ -657,7 +658,7 @@ const chatMessages = [
                     f'''{{ type:{type_}, text: '{str_content}',is_send:{is_send},avatar_path:'{avatar}'}},'''
                 )
             elif type_ == 3:
-                image_path = hard_link_db.get_image(content=str_content, thumb=False)
+                image_path = hard_link_db.get_image(str_content, BytesExtra, thumb=False)
                 image_path = path.get_relative_path(image_path, base_path=f'/data/聊天记录/{self.contact.remark}/image')
                 image_path = image_path.replace('\\', '/')
                 # print(f"tohtml:---{image_path}")
@@ -669,7 +670,7 @@ const chatMessages = [
                     f'''{{ type:{type_}, text: '{image_path}',is_send:{is_send},avatar_path:'{avatar}'}},'''
                 )
             elif type_ == 43:
-                video_path = hard_link_db.get_video(content=str_content, thumb=False)
+                video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False)
                 video_path = f'{MePC().wx_dir}/{video_path}'
                 if os.path.exists(video_path):
                     new_path = origin_docx_path + '/video/' + os.path.basename(video_path)