From 50869068dec55448cc01a4546af303c2e38abf0a Mon Sep 17 00:00:00 2001 From: shuaikangzhou <863909694@qq.com> Date: Fri, 29 Mar 2024 14:35:35 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=AF=BC=E5=87=BAjson?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/ui/contact/contactInfo.py | 7 +- app/ui/contact/export/export_dialog.py | 3 + app/ui/menu/export.py | 3 +- app/ui/menu/exportUi.py | 28 ++-- app/util/exporter/exporter.py | 4 +- app/util/exporter/exporter_json.py | 193 +++++++++++++++++++++++++ app/util/exporter/output.py | 15 ++ 7 files changed, 240 insertions(+), 13 deletions(-) create mode 100644 app/util/exporter/exporter_json.py diff --git a/app/ui/contact/contactInfo.py b/app/ui/contact/contactInfo.py index 97b6a65..78f58b0 100644 --- a/app/ui/contact/contactInfo.py +++ b/app/ui/contact/contactInfo.py @@ -45,12 +45,14 @@ class ContactInfo(QWidget, Ui_Form): self.toCSVAct = QAction(Icon.ToCSV, '导出CSV', self) self.toHtmlAct = QAction(Icon.ToHTML, '导出HTML', self) self.toTxtAct = QAction(Icon.ToTXT, '导出TXT', self) + self.toJsonAct = QAction(Icon.ToTXT, '导出json', self) self.toolButton_output.setPopupMode(QToolButton.MenuButtonPopup) self.toolButton_output.clicked.connect(self.toolButton_show) menu.addAction(self.toDocxAct) menu.addAction(self.toCSVAct) menu.addAction(self.toHtmlAct) menu.addAction(self.toTxtAct) + menu.addAction(self.toJsonAct) self.toolButton_output.setMenu(menu) self.toolButton_output.setIcon(Icon.Output) # self.toolButton_output.addSeparator() @@ -58,6 +60,7 @@ class ContactInfo(QWidget, Ui_Form): self.toDocxAct.triggered.connect(self.output) self.toCSVAct.triggered.connect(self.output) self.toTxtAct.triggered.connect(self.output) + self.toJsonAct.triggered.connect(self.output) def set_contact(self, contact: Contact): self.view_userinfo.set_contact(contact) @@ -126,7 +129,9 @@ class ContactInfo(QWidget, Ui_Form): elif self.sender() == self.toTxtAct: dialog = ExportDialog(self.contact, title='选择导出的消息类型', file_type='txt', parent=self) result = dialog.exec_() # 使用exec_()获取用户的操作结果 - + elif self.sender() == self.toJsonAct: + dialog = ExportDialog(self.contact, title='选择导出的消息类型', file_type='json', parent=self) + result = dialog.exec_() # 使用exec_()获取用户的操作结果 class ReportThread(QThread): okSignal = pyqtSignal(bool) diff --git a/app/ui/contact/export/export_dialog.py b/app/ui/contact/export/export_dialog.py index cfcc22e..910d4d6 100644 --- a/app/ui/contact/export/export_dialog.py +++ b/app/ui/contact/export/export_dialog.py @@ -66,6 +66,9 @@ class ExportDialog(QDialog, Ui_Dialog): self.export_type = Output.DOCX self.export_choices = {"文本": True, "图片": False, "语音": False, "视频": False, "表情包": False, '拍一拍等系统消息': True} # 定义导出的数据类型,默认全部选择 + elif file_type == 'json': + self.export_type = Output.JSON + self.export_choices = {} # 定义导出的数据类型,默认全部选择 else: self.export_choices = {"文本": True, "图片": True, "视频": True, "表情包": True} # 定义导出的数据类型,默认全部选择 self.setWindowTitle(title) diff --git a/app/ui/menu/export.py b/app/ui/menu/export.py index 0aa6d2c..a28a29d 100644 --- a/app/ui/menu/export.py +++ b/app/ui/menu/export.py @@ -34,6 +34,7 @@ file_format = { 'TXT': Output.TXT, 'HTML': Output.HTML, 'CSV': Output.CSV, + 'JSON': Output.JSON, } Stylesheet = """ """ @@ -150,7 +151,7 @@ class ExportDialog(QDialog, Ui_Dialog): print("选择的数据类型:", selected_types) file_types = [] - for checkbox in [self.checkBox_txt, self.checkBox_csv, self.checkBox_html, self.checkBox_word]: + for checkbox in [self.checkBox_txt, self.checkBox_csv, self.checkBox_html, self.checkBox_word,self.checkBox_json]: if checkbox.isChecked(): file_types.append(file_format[checkbox.text()]) select_contacts = [] diff --git a/app/ui/menu/exportUi.py b/app/ui/menu/exportUi.py index 2093915..9d05f5c 100644 --- a/app/ui/menu/exportUi.py +++ b/app/ui/menu/exportUi.py @@ -62,15 +62,17 @@ class Ui_Dialog(object): self.checkBox_csv.setObjectName("checkBox_csv") self.verticalLayout.addWidget(self.checkBox_csv) self.checkBox_txt = QtWidgets.QCheckBox(Dialog) + sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Fixed) + sizePolicy.setHorizontalStretch(0) + sizePolicy.setVerticalStretch(0) + sizePolicy.setHeightForWidth(self.checkBox_txt.sizePolicy().hasHeightForWidth()) + self.checkBox_txt.setSizePolicy(sizePolicy) self.checkBox_txt.setChecked(True) self.checkBox_txt.setObjectName("checkBox_txt") self.verticalLayout.addWidget(self.checkBox_txt) - spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding) - self.verticalLayout.addItem(spacerItem1) - self.horizontalLayout_2.addLayout(self.verticalLayout) - self.verticalLayout_2 = QtWidgets.QVBoxLayout() - self.verticalLayout_2.setSizeConstraint(QtWidgets.QLayout.SetMinimumSize) - self.verticalLayout_2.setObjectName("verticalLayout_2") + self.checkBox_json = QtWidgets.QCheckBox(Dialog) + self.checkBox_json.setObjectName("checkBox_json") + self.verticalLayout.addWidget(self.checkBox_json) self.label_2 = QtWidgets.QLabel(Dialog) sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding) sizePolicy.setHorizontalStretch(0) @@ -78,13 +80,20 @@ class Ui_Dialog(object): sizePolicy.setHeightForWidth(self.label_2.sizePolicy().hasHeightForWidth()) self.label_2.setSizePolicy(sizePolicy) self.label_2.setObjectName("label_2") - self.verticalLayout_2.addWidget(self.label_2) - self.horizontalLayout_2.addLayout(self.verticalLayout_2) + self.verticalLayout.addWidget(self.label_2) + self.verticalLayout_2 = QtWidgets.QVBoxLayout() + self.verticalLayout_2.setSizeConstraint(QtWidgets.QLayout.SetMinimumSize) + self.verticalLayout_2.setObjectName("verticalLayout_2") + self.verticalLayout.addLayout(self.verticalLayout_2) + spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding) + self.verticalLayout.addItem(spacerItem1) + self.horizontalLayout_2.addLayout(self.verticalLayout) self.listWidget = QtWidgets.QListWidget(Dialog) self.listWidget.setMinimumSize(QtCore.QSize(0, 0)) self.listWidget.setObjectName("listWidget") self.horizontalLayout_2.addWidget(self.listWidget) - self.horizontalLayout_2.setStretch(2, 1) + self.horizontalLayout_2.setStretch(0, 1) + self.horizontalLayout_2.setStretch(1, 1) self.verticalLayout_3.addLayout(self.horizontalLayout_2) self.textBrowser = QtWidgets.QTextBrowser(Dialog) sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum) @@ -152,5 +161,6 @@ class Ui_Dialog(object): self.checkBox_html.setText(_translate("Dialog", "HTML")) self.checkBox_csv.setText(_translate("Dialog", "CSV")) self.checkBox_txt.setText(_translate("Dialog", "TXT")) + self.checkBox_json.setText(_translate("Dialog", "JSON")) self.label_2.setText(_translate("Dialog", "消息类型")) self.btn_start.setText(_translate("Dialog", "开始")) diff --git a/app/util/exporter/exporter.py b/app/util/exporter/exporter.py index 8d4614a..d74d33e 100644 --- a/app/util/exporter/exporter.py +++ b/app/util/exporter/exporter.py @@ -104,8 +104,8 @@ class ExporterBase(QThread): self.last_timestamp = 0 self.time_range = time_range self.messages = messages - origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark) - makedirs(origin_path) + self.origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark) + makedirs(self.origin_path) def run(self): self.export() diff --git a/app/util/exporter/exporter_json.py b/app/util/exporter/exporter_json.py new file mode 100644 index 0000000..0de2510 --- /dev/null +++ b/app/util/exporter/exporter_json.py @@ -0,0 +1,193 @@ +import json +import random +import os + +from app.DataBase import msg_db +from app.person import Me +from .exporter import ExporterBase + + +def merge_content(conversions_list) -> list: + """ + 合并一组对话中连续发送的句子 + @param conversions_list: + @return: + """ + merged_data = [] + current_role = None + current_content = "" + str_time = '' + for item in conversions_list: + if 'str_time' in item: + str_time = item['str_time'] + else: + str_time = '' + if current_role is None: + current_role = item["role"] + current_content = item["content"] + elif current_role == item["role"]: + current_content += "\n" + item["content"] + else: + # merged_data.append({"role": current_role, "content": current_content, 'str_time': str_time}) + merged_data.append({"role": current_role, "content": current_content}) + current_role = item["role"] + current_content = item["content"] + str_time = item.get('str_time') + + # 处理最后一组 + if current_role is not None: + # merged_data.append({"role": current_role, "content": current_content,'str_time': str_time}) + merged_data.append({"role": current_role, "content": current_content}) + return merged_data + + +def system_prompt(): + system = { + "role": "system", + # "content": f"你是{Me().name},一个聪明、热情、善良的男大学生,后面的对话来自{self.contact.remark}(!!!注意:对方的身份十分重要,你务必记住对方的身份,因为跟不同的人对话要用不同的态度、语气),你要认真地回答他" + "content": f"你是{Me().name},一个聪明、热情、善良的人,后面的对话来自你的朋友,你要认真地回答他" + } + return system + + +def message_to_conversion(group): + conversions = [system_prompt()] + while len(group) and group[-1][4] == 0: + group.pop() + for message in group: + is_send = message[4] + if len(conversions) == 1 and is_send: + continue + if is_send: + json_msg = { + "role": "assistant", + "content": message[7] + } + else: + json_msg = { + "role": "user", + "content": message[7] + } + json_msg['str_time'] = message[8] + conversions.append(json_msg) + if len(conversions) == 1: + return [] + return merge_content(conversions) + + +class JsonExporter(ExporterBase): + def split_by_time(self, length=300): + messages = msg_db.get_messages_by_type(self.contact.wxid, type_=1, time_range=self.time_range) + start_time = 0 + res = [] + i = 0 + while i < len(messages): + message = messages[i] + timestamp = message[5] + is_send = message[4] + group = [ + system_prompt() + ] + while i < len(messages) and timestamp - start_time < length: + if is_send: + json_msg = { + "role": "assistant", + "content": message[7] + } + else: + json_msg = { + "role": "user", + "content": message[7] + } + group.append(json_msg) + i += 1 + if i >= len(messages): + break + message = messages[i] + timestamp = message[5] + is_send = message[4] + while is_send: + json_msg = { + "role": "assistant", + "content": message[7] + } + group.append(json_msg) + i += 1 + if i >= len(messages): + break + message = messages[i] + timestamp = message[5] + is_send = message[4] + start_time = timestamp + res.append( + { + "conversations": group + } + ) + res_ = [] + for item in res: + conversations = item['conversations'] + res_.append({ + 'conversations': merge_content(conversations) + }) + return res_ + + def split_by_intervals(self, max_diff_seconds=300): + messages = msg_db.get_messages_by_type(self.contact.wxid, type_=1, time_range=self.time_range) + res = [] + i = 0 + current_group = [] + while i < len(messages): + message = messages[i] + timestamp = message[5] + is_send = message[4] + while is_send and i + 1 < len(messages): + i += 1 + message = messages[i] + is_send = message[4] + current_group = [messages[i]] + i += 1 + while i < len(messages) and messages[i][5] - current_group[-1][5] <= max_diff_seconds: + current_group.append(messages[i]) + i += 1 + while i < len(messages) and messages[i][4]: + current_group.append(messages[i]) + i += 1 + res.append(current_group) + res_ = [] + for group in res: + conversations = message_to_conversion(group) + if conversations: + res_.append({ + 'conversations': conversations + }) + return res_ + + def to_json(self): + print(f"【开始导出 json {self.contact.remark}】") + origin_path = self.origin_path + os.makedirs(origin_path, exist_ok=True) + filename = os.path.join(origin_path, f"{self.contact.remark}") + + # res = self.split_by_time() + res = self.split_by_intervals(60) + # 打乱列表顺序 + random.shuffle(res) + + # 计算切分比例 + split_ratio = 0.2 # 20% for the second list + + # 计算切分点 + split_point = int(len(res) * split_ratio) + + # 分割列表 + train_data = res[split_point:] + dev_data = res[:split_point] + with open(f'{filename}_train.json', "w", encoding="utf-8") as f: + json.dump(train_data, f, ensure_ascii=False, indent=4) + with open(f'{filename}_dev.json', "w", encoding="utf-8") as f: + json.dump(dev_data, f, ensure_ascii=False, indent=4) + self.okSignal.emit(1) + + def run(self): + self.to_json() diff --git a/app/util/exporter/output.py b/app/util/exporter/output.py index f5bfb4a..a7934c3 100644 --- a/app/util/exporter/output.py +++ b/app/util/exporter/output.py @@ -13,6 +13,7 @@ from docxcompose.composer import Composer from app.util.exporter.exporter_csv import CSVExporter from app.util.exporter.exporter_docx import DocxExporter from app.util.exporter.exporter_html import HtmlExporter +from app.util.exporter.exporter_json import JsonExporter from app.util.exporter.exporter_txt import TxtExporter from app.DataBase.hard_link import decodeExtraBuf from app.config import OUTPUT_DIR @@ -42,6 +43,7 @@ class Output(QThread): CSV_ALL = 3 CONTACT_CSV = 4 TXT = 5 + JSON = 6 Batch = 10086 def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None): @@ -160,6 +162,8 @@ class Output(QThread): self.to_csv(contact, self.message_types, True) elif type_ == self.HTML: self.to_html(contact, self.message_types, True) + elif type_ == self.JSON: + self.to_json(contact,self.message_types,True) def batch_finish_one(self, num): self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark) @@ -210,6 +214,15 @@ class Output(QThread): Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one) Child.start() + def to_json(self, contact, message_types, is_batch=False): + Child = JsonExporter(contact, type_=self.JSON, message_types=message_types, time_range=self.time_range) + self.children.append(Child) + Child.progressSignal.connect(self.progress) + if not is_batch: + Child.rangeSignal.connect(self.rangeSignal) + Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one) + Child.start() + def to_txt(self, contact, message_types, is_batch=False): Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range) self.children.append(Child) @@ -275,6 +288,8 @@ class Output(QThread): self.to_csv(self.contact, self.message_types) elif self.output_type == self.HTML: self.to_html(self.contact, self.message_types) + elif self.output_type == self.JSON: + self.to_json(self.contact, self.message_types) elif self.output_type == self.Batch: self.batch_export()