mirror of
https://github.com/LC044/WeChatMsg
synced 2025-02-21 01:52:35 +08:00
新增导出json
This commit is contained in:
parent
28e47cf116
commit
50869068de
@ -45,12 +45,14 @@ class ContactInfo(QWidget, Ui_Form):
|
||||
self.toCSVAct = QAction(Icon.ToCSV, '导出CSV', self)
|
||||
self.toHtmlAct = QAction(Icon.ToHTML, '导出HTML', self)
|
||||
self.toTxtAct = QAction(Icon.ToTXT, '导出TXT', self)
|
||||
self.toJsonAct = QAction(Icon.ToTXT, '导出json', self)
|
||||
self.toolButton_output.setPopupMode(QToolButton.MenuButtonPopup)
|
||||
self.toolButton_output.clicked.connect(self.toolButton_show)
|
||||
menu.addAction(self.toDocxAct)
|
||||
menu.addAction(self.toCSVAct)
|
||||
menu.addAction(self.toHtmlAct)
|
||||
menu.addAction(self.toTxtAct)
|
||||
menu.addAction(self.toJsonAct)
|
||||
self.toolButton_output.setMenu(menu)
|
||||
self.toolButton_output.setIcon(Icon.Output)
|
||||
# self.toolButton_output.addSeparator()
|
||||
@ -58,6 +60,7 @@ class ContactInfo(QWidget, Ui_Form):
|
||||
self.toDocxAct.triggered.connect(self.output)
|
||||
self.toCSVAct.triggered.connect(self.output)
|
||||
self.toTxtAct.triggered.connect(self.output)
|
||||
self.toJsonAct.triggered.connect(self.output)
|
||||
|
||||
def set_contact(self, contact: Contact):
|
||||
self.view_userinfo.set_contact(contact)
|
||||
@ -126,7 +129,9 @@ class ContactInfo(QWidget, Ui_Form):
|
||||
elif self.sender() == self.toTxtAct:
|
||||
dialog = ExportDialog(self.contact, title='选择导出的消息类型', file_type='txt', parent=self)
|
||||
result = dialog.exec_() # 使用exec_()获取用户的操作结果
|
||||
|
||||
elif self.sender() == self.toJsonAct:
|
||||
dialog = ExportDialog(self.contact, title='选择导出的消息类型', file_type='json', parent=self)
|
||||
result = dialog.exec_() # 使用exec_()获取用户的操作结果
|
||||
|
||||
class ReportThread(QThread):
|
||||
okSignal = pyqtSignal(bool)
|
||||
|
@ -66,6 +66,9 @@ class ExportDialog(QDialog, Ui_Dialog):
|
||||
self.export_type = Output.DOCX
|
||||
self.export_choices = {"文本": True, "图片": False, "语音": False, "视频": False,
|
||||
"表情包": False, '拍一拍等系统消息': True} # 定义导出的数据类型,默认全部选择
|
||||
elif file_type == 'json':
|
||||
self.export_type = Output.JSON
|
||||
self.export_choices = {} # 定义导出的数据类型,默认全部选择
|
||||
else:
|
||||
self.export_choices = {"文本": True, "图片": True, "视频": True, "表情包": True} # 定义导出的数据类型,默认全部选择
|
||||
self.setWindowTitle(title)
|
||||
|
@ -34,6 +34,7 @@ file_format = {
|
||||
'TXT': Output.TXT,
|
||||
'HTML': Output.HTML,
|
||||
'CSV': Output.CSV,
|
||||
'JSON': Output.JSON,
|
||||
}
|
||||
Stylesheet = """
|
||||
"""
|
||||
@ -150,7 +151,7 @@ class ExportDialog(QDialog, Ui_Dialog):
|
||||
print("选择的数据类型:", selected_types)
|
||||
|
||||
file_types = []
|
||||
for checkbox in [self.checkBox_txt, self.checkBox_csv, self.checkBox_html, self.checkBox_word]:
|
||||
for checkbox in [self.checkBox_txt, self.checkBox_csv, self.checkBox_html, self.checkBox_word,self.checkBox_json]:
|
||||
if checkbox.isChecked():
|
||||
file_types.append(file_format[checkbox.text()])
|
||||
select_contacts = []
|
||||
|
@ -62,15 +62,17 @@ class Ui_Dialog(object):
|
||||
self.checkBox_csv.setObjectName("checkBox_csv")
|
||||
self.verticalLayout.addWidget(self.checkBox_csv)
|
||||
self.checkBox_txt = QtWidgets.QCheckBox(Dialog)
|
||||
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Fixed)
|
||||
sizePolicy.setHorizontalStretch(0)
|
||||
sizePolicy.setVerticalStretch(0)
|
||||
sizePolicy.setHeightForWidth(self.checkBox_txt.sizePolicy().hasHeightForWidth())
|
||||
self.checkBox_txt.setSizePolicy(sizePolicy)
|
||||
self.checkBox_txt.setChecked(True)
|
||||
self.checkBox_txt.setObjectName("checkBox_txt")
|
||||
self.verticalLayout.addWidget(self.checkBox_txt)
|
||||
spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
|
||||
self.verticalLayout.addItem(spacerItem1)
|
||||
self.horizontalLayout_2.addLayout(self.verticalLayout)
|
||||
self.verticalLayout_2 = QtWidgets.QVBoxLayout()
|
||||
self.verticalLayout_2.setSizeConstraint(QtWidgets.QLayout.SetMinimumSize)
|
||||
self.verticalLayout_2.setObjectName("verticalLayout_2")
|
||||
self.checkBox_json = QtWidgets.QCheckBox(Dialog)
|
||||
self.checkBox_json.setObjectName("checkBox_json")
|
||||
self.verticalLayout.addWidget(self.checkBox_json)
|
||||
self.label_2 = QtWidgets.QLabel(Dialog)
|
||||
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
|
||||
sizePolicy.setHorizontalStretch(0)
|
||||
@ -78,13 +80,20 @@ class Ui_Dialog(object):
|
||||
sizePolicy.setHeightForWidth(self.label_2.sizePolicy().hasHeightForWidth())
|
||||
self.label_2.setSizePolicy(sizePolicy)
|
||||
self.label_2.setObjectName("label_2")
|
||||
self.verticalLayout_2.addWidget(self.label_2)
|
||||
self.horizontalLayout_2.addLayout(self.verticalLayout_2)
|
||||
self.verticalLayout.addWidget(self.label_2)
|
||||
self.verticalLayout_2 = QtWidgets.QVBoxLayout()
|
||||
self.verticalLayout_2.setSizeConstraint(QtWidgets.QLayout.SetMinimumSize)
|
||||
self.verticalLayout_2.setObjectName("verticalLayout_2")
|
||||
self.verticalLayout.addLayout(self.verticalLayout_2)
|
||||
spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
|
||||
self.verticalLayout.addItem(spacerItem1)
|
||||
self.horizontalLayout_2.addLayout(self.verticalLayout)
|
||||
self.listWidget = QtWidgets.QListWidget(Dialog)
|
||||
self.listWidget.setMinimumSize(QtCore.QSize(0, 0))
|
||||
self.listWidget.setObjectName("listWidget")
|
||||
self.horizontalLayout_2.addWidget(self.listWidget)
|
||||
self.horizontalLayout_2.setStretch(2, 1)
|
||||
self.horizontalLayout_2.setStretch(0, 1)
|
||||
self.horizontalLayout_2.setStretch(1, 1)
|
||||
self.verticalLayout_3.addLayout(self.horizontalLayout_2)
|
||||
self.textBrowser = QtWidgets.QTextBrowser(Dialog)
|
||||
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum)
|
||||
@ -152,5 +161,6 @@ class Ui_Dialog(object):
|
||||
self.checkBox_html.setText(_translate("Dialog", "HTML"))
|
||||
self.checkBox_csv.setText(_translate("Dialog", "CSV"))
|
||||
self.checkBox_txt.setText(_translate("Dialog", "TXT"))
|
||||
self.checkBox_json.setText(_translate("Dialog", "JSON"))
|
||||
self.label_2.setText(_translate("Dialog", "消息类型"))
|
||||
self.btn_start.setText(_translate("Dialog", "开始"))
|
||||
|
@ -104,8 +104,8 @@ class ExporterBase(QThread):
|
||||
self.last_timestamp = 0
|
||||
self.time_range = time_range
|
||||
self.messages = messages
|
||||
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
|
||||
makedirs(origin_path)
|
||||
self.origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
|
||||
makedirs(self.origin_path)
|
||||
|
||||
def run(self):
|
||||
self.export()
|
||||
|
193
app/util/exporter/exporter_json.py
Normal file
193
app/util/exporter/exporter_json.py
Normal file
@ -0,0 +1,193 @@
|
||||
import json
|
||||
import random
|
||||
import os
|
||||
|
||||
from app.DataBase import msg_db
|
||||
from app.person import Me
|
||||
from .exporter import ExporterBase
|
||||
|
||||
|
||||
def merge_content(conversions_list) -> list:
    """
    Merge consecutive messages that share the same role into one message.

    Adjacent items with an equal ``"role"`` are collapsed into a single
    ``{"role": ..., "content": ...}`` dict whose content is the individual
    contents joined with ``"\\n"``. Any other key carried by the input items
    (for example ``"str_time"``) is not copied to the output. The input
    items are never modified.

    @param conversions_list: iterable of dicts that each provide at least
        the ``"role"`` and ``"content"`` keys, in the order they should
        appear in the conversation.
    @return: a new list of merged ``{"role", "content"}`` dicts; empty when
        the input is empty.
    """
    merged_data = []
    for item in conversions_list:
        role = item["role"]
        content = item["content"]
        if merged_data and merged_data[-1]["role"] == role:
            # Same speaker as the previous entry: extend it instead of
            # starting a new message.
            previous = merged_data[-1]
            previous["content"] = previous["content"] + "\n" + content
        else:
            merged_data.append({"role": role, "content": content})
    return merged_data
|
||||
|
||||
|
||||
def system_prompt():
    """
    Build the system message placed at the start of every conversation.

    @return: a dict with ``"role"`` set to ``"system"`` and ``"content"`` set
        to a fixed Chinese persona prompt that embeds ``Me().name``.
    """
    persona = f"你是{Me().name},一个聪明、热情、善良的人,后面的对话来自你的朋友,你要认真地回答他"
    return {"role": "system", "content": persona}
|
||||
|
||||
|
||||
def message_to_conversion(group):
    """
    Turn one group of raw message rows into a merged chat conversation.

    Each row is read positionally: index 4 is the "is sent by me" flag,
    index 7 the message text and index 8 the formatted time string.
    Rows sent by me become ``"assistant"`` messages, all others ``"user"``.

    The conversation is normalised so that it starts with a received message
    and ends with a sent one: leading sent rows are skipped and trailing
    received rows are ignored. Unlike the previous implementation the
    caller's ``group`` list is left untouched (it used to be trimmed in
    place with ``pop()``).

    @param group: sequence of message rows (indexable, sliceable).
    @return: ``[]`` when nothing but the system prompt would remain,
        otherwise the result of ``merge_content`` on the system prompt
        followed by the converted rows.
    """
    # Find the end of the usable range: trailing rows whose is_send flag is
    # 0 have no reply after them, so they are left out.
    end = len(group)
    while end and group[end - 1][4] == 0:
        end -= 1

    conversions = [system_prompt()]
    for message in group[:end]:
        is_send = message[4]
        # While only the system prompt is present, skip sent rows so the
        # conversation opens with the other party's message.
        if len(conversions) == 1 and is_send:
            continue
        conversions.append({
            "role": "assistant" if is_send else "user",
            "content": message[7],
            "str_time": message[8],
        })

    if len(conversions) == 1:
        return []
    return merge_content(conversions)
|
||||
|
||||
|
||||
class JsonExporter(ExporterBase):
    """
    Exporter that writes a contact's text messages as chat-style JSON.

    Messages are fetched with ``msg_db.get_messages_by_type(..., type_=1)``,
    grouped into conversations, shuffled and split into a train file and a
    dev file. Rows are read positionally: index 4 is the "is sent by me"
    flag, index 5 the timestamp and index 7 the message text.
    """

    @staticmethod
    def _to_chat_message(message):
        """Map one message row to a ``{"role", "content"}`` chat dict."""
        return {
            "role": "assistant" if message[4] else "user",
            "content": message[7],
        }

    def split_by_time(self, length=300):
        """
        Group messages into fixed-length time windows.

        A window opens at a message and takes every following message whose
        timestamp is less than ``length`` after the opening one; any sent
        messages that come right after are appended as well so the
        conversation ends on a reply where possible.

        Fixes over the previous version:
        * the window start is taken from the opening message instead of an
          initial ``0``, which used to produce a first conversation holding
          only the system prompt;
        * the final message is no longer appended twice when the history
          ends on a sent message;
        * the opening message is always consumed, so a non-positive
          ``length`` can no longer loop forever.

        @param length: window size, in the same unit as the row timestamps
            (presumably seconds -- confirm against the database layer).
        @return: list of ``{"conversations": [...]}`` dicts, each holding
            the system prompt followed by the merged messages.
        """
        messages = msg_db.get_messages_by_type(self.contact.wxid, type_=1, time_range=self.time_range)
        total = len(messages)
        res = []
        i = 0
        while i < total:
            window_start = messages[i][5]
            group = [system_prompt(), self._to_chat_message(messages[i])]
            i += 1
            # Everything inside the time window belongs to this group.
            while i < total and messages[i][5] - window_start < length:
                group.append(self._to_chat_message(messages[i]))
                i += 1
            # Sent messages right after the window are kept with it.
            while i < total and messages[i][4]:
                group.append(self._to_chat_message(messages[i]))
                i += 1
            res.append({'conversations': merge_content(group)})
        return res

    def split_by_intervals(self, max_diff_seconds=300):
        """
        Group messages by the gap between neighbouring messages.

        A group starts at the next received message (leading sent messages
        are skipped) and keeps growing while each following message is at
        most ``max_diff_seconds`` after the previous one in the group; sent
        messages directly after the group are appended too. Groups that
        ``message_to_conversion`` reduces to nothing are dropped.

        @param max_diff_seconds: largest allowed timestamp difference
            between two consecutive messages of the same group.
        @return: list of ``{"conversations": [...]}`` dicts.
        """
        messages = msg_db.get_messages_by_type(self.contact.wxid, type_=1, time_range=self.time_range)
        total = len(messages)
        res = []
        i = 0
        while i < total:
            # Skip sent messages until a received one (or the last row).
            while messages[i][4] and i + 1 < total:
                i += 1
            current_group = [messages[i]]
            i += 1
            while i < total and messages[i][5] - current_group[-1][5] <= max_diff_seconds:
                current_group.append(messages[i])
                i += 1
            while i < total and messages[i][4]:
                current_group.append(messages[i])
                i += 1
            conversations = message_to_conversion(current_group)
            if conversations:
                res.append({'conversations': conversations})
        return res

    def to_json(self):
        """
        Export the contact's conversations to ``<remark>_train.json`` and
        ``<remark>_dev.json`` inside ``self.origin_path``.

        Conversations are built with ``split_by_intervals(60)``, shuffled,
        and the first 20% go to the dev file while the rest go to the train
        file. ``okSignal`` is emitted with ``1`` once both files are written.
        """
        print(f"【开始导出 json {self.contact.remark}】")
        origin_path = self.origin_path
        os.makedirs(origin_path, exist_ok=True)
        filename = os.path.join(origin_path, f"{self.contact.remark}")

        # res = self.split_by_time()
        res = self.split_by_intervals(60)
        # Shuffle so that the train/dev split is not chronological.
        random.shuffle(res)

        split_ratio = 0.2  # share of conversations that goes to the dev set
        split_point = int(len(res) * split_ratio)
        train_data = res[split_point:]
        dev_data = res[:split_point]

        for suffix, data in (("train", train_data), ("dev", dev_data)):
            with open(f'{filename}_{suffix}.json', "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
        self.okSignal.emit(1)

    def run(self):
        """Thread entry point: perform the JSON export."""
        self.to_json()
|
@ -13,6 +13,7 @@ from docxcompose.composer import Composer
|
||||
from app.util.exporter.exporter_csv import CSVExporter
|
||||
from app.util.exporter.exporter_docx import DocxExporter
|
||||
from app.util.exporter.exporter_html import HtmlExporter
|
||||
from app.util.exporter.exporter_json import JsonExporter
|
||||
from app.util.exporter.exporter_txt import TxtExporter
|
||||
from app.DataBase.hard_link import decodeExtraBuf
|
||||
from app.config import OUTPUT_DIR
|
||||
@ -42,6 +43,7 @@ class Output(QThread):
|
||||
CSV_ALL = 3
|
||||
CONTACT_CSV = 4
|
||||
TXT = 5
|
||||
JSON = 6
|
||||
Batch = 10086
|
||||
|
||||
def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None):
|
||||
@ -160,6 +162,8 @@ class Output(QThread):
|
||||
self.to_csv(contact, self.message_types, True)
|
||||
elif type_ == self.HTML:
|
||||
self.to_html(contact, self.message_types, True)
|
||||
elif type_ == self.JSON:
|
||||
self.to_json(contact,self.message_types,True)
|
||||
|
||||
def batch_finish_one(self, num):
|
||||
self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark)
|
||||
@ -210,6 +214,15 @@ class Output(QThread):
|
||||
Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one)
|
||||
Child.start()
|
||||
|
||||
def to_json(self, contact, message_types, is_batch=False):
    """
    Start a ``JsonExporter`` worker for ``contact`` and wire up its signals.

    The worker is appended to ``self.children`` and its progress signal is
    forwarded to ``self.progress``. For a single export the range and
    completion signals are relayed to this object's own signals; for a batch
    export completion is routed to ``self.batch_finish_one`` and the range
    signal is not connected.

    :param contact: contact whose messages are exported.
    :param message_types: message-type selection passed through to the exporter.
    :param is_batch: True when called as part of a batch export.
    """
    exporter = JsonExporter(contact, type_=self.JSON, message_types=message_types, time_range=self.time_range)
    self.children.append(exporter)
    exporter.progressSignal.connect(self.progress)
    if is_batch:
        exporter.okSignal.connect(self.batch_finish_one)
    else:
        exporter.rangeSignal.connect(self.rangeSignal)
        exporter.okSignal.connect(self.okSignal)
    exporter.start()
|
||||
|
||||
def to_txt(self, contact, message_types, is_batch=False):
|
||||
Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range)
|
||||
self.children.append(Child)
|
||||
@ -275,6 +288,8 @@ class Output(QThread):
|
||||
self.to_csv(self.contact, self.message_types)
|
||||
elif self.output_type == self.HTML:
|
||||
self.to_html(self.contact, self.message_types)
|
||||
elif self.output_type == self.JSON:
|
||||
self.to_json(self.contact, self.message_types)
|
||||
elif self.output_type == self.Batch:
|
||||
self.batch_export()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user