WeChatMsg/app/analysis/analysis.py

from collections import Counter
from datetime import datetime
import re

from PyQt5.QtCore import QFile, QTextStream, QIODevice

import sys

sys.path.append('.')

from app.DataBase import msg_db, MsgType
from pyecharts import options as opts
from pyecharts.charts import WordCloud, Calendar, Bar, Line
from app.resources import resource_rc
from app.util.emoji import get_emoji

# Touches an attribute of the compiled Qt resource module.
# NOTE(review): presumably this only keeps the `resource_rc` import from
# looking unused (importing it is what makes ':/...' resource paths available)
# — confirm before removing.
var = resource_rc.qt_resource_name
# Size, in pixels, given to the Calendar, Bar and Line charts in this module.
charts_width = 800
charts_height = 450
# Size, in pixels, given to the WordCloud chart.
wordcloud_width = 780
wordcloud_height = 720


def _load_stopwords(path='./app000/data/stopwords.txt'):
    '''Return the stop-word set read from *path*, else from the Qt resource, else empty.'''
    try:
        with open(path, "r", encoding="utf-8") as stopword_file:
            return set(stopword_file.read().splitlines())
    except (OSError, UnicodeError):
        # The on-disk list is optional; fall back to the copy in Qt resources.
        pass

    file = QFile(':/data/stopwords.txt')
    if not file.open(QIODevice.ReadOnly | QIODevice.Text):
        # Neither source is readable: filter nothing instead of failing later
        # with an unbound name.
        return set()
    try:
        stream = QTextStream(file)
        stream.setCodec('utf-8')
        return set(stream.readAll().splitlines())
    finally:
        file.close()


def wordcloud(wxid, year='all', who='1'):
    '''
        Word-cloud analysis of the text messages exchanged with one contact.

        parameters:
            wxid: contact identifier, forwarded to the msg_db queries
            year: 'all' (default) analyzes every year; otherwise a year
                  string such as '2023'
            who: '1' (default) keeps the messages whose column 4 equals 1
                 (sent by me); '0' keeps those sent by the other party

        returns:
            dict with the keys
            chart_data: serialized WordCloud options, or None when there is
                        nothing to draw
            keyword: the most frequent word (a placeholder sentence when
                     there is nothing to analyze)
            max_num: occurrence count of that word, as a string
            dialogs: result of msg_db.get_messages_by_keyword for the
                     keyword (empty list when there is nothing to analyze)
    '''
    import jieba

    def nothing_to_analyze():
        # Same placeholder for "no messages at all" and "no usable words".
        return {
            'chart_data': None,
            'keyword': "没有聊天你想分析啥",
            'max_num': "0",
            'dialogs': []
        }

    txt_messages = msg_db.get_messages_by_type(wxid, MsgType.TEXT, year)
    if not txt_messages:
        return nothing_to_analyze()

    # Column 4 flags the sender (1 = me, 0 = the other party), column 7 holds the text.
    sender = int(who)
    text = ''.join(row[7] for row in txt_messages if row[4] == sender)

    # Tokenize with jieba and count how often each token occurs.
    word_count = Counter(jieba.cut(text))

    # Drop single-character tokens and stop words.
    stopwords = _load_stopwords()
    ranked = sorted(
        (
            (word, count)
            for word, count in word_count.items()
            if len(word) > 1 and word not in stopwords
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Only the 100 most frequent words are drawn.
    text_data = ranked[:100]
    if not text_data:
        # The selected sender wrote nothing usable; avoid indexing an empty list.
        return nothing_to_analyze()

    keyword, max_num = text_data[0]
    w = (
        WordCloud(init_opts=opts.InitOpts(width=f"{wordcloud_width}px", height=f"{wordcloud_height}px"))
        .add(series_name="聊天文字", data_pair=text_data, word_size_range=[5, 40])
    )
    return {
        'chart_data': w.dump_options_with_quotes(),
        'keyword': keyword,
        'max_num': str(max_num),
        'dialogs': msg_db.get_messages_by_keyword(wxid, keyword, num=5, max_len=12)
    }


def calendar_chart(wxid, year='all'):
    '''
        Calendar heat-map of the number of messages per day.

        parameters:
            wxid: contact identifier, forwarded to the msg_db queries
            year: 'all' (default) analyzes every year; otherwise a year
                  string such as '2023'

        returns:
            False when msg_db.get_messages_by_days yields no rows, otherwise
            a dict with the keys
            chart_data: serialized Calendar options
            data_length: value of msg_db.get_messages_length_with_ta
            max_date: first day holding the highest count, formatted as
                      'YYYY年MM月DD日'
            max_num: message count of that day, as a string
            date_num: days from the first to the last row, both inclusive,
                      as a string
            dialogs: value of msg_db.get_first_time_of_message
    '''
    data_length = msg_db.get_messages_length_with_ta(wxid, year)
    print(f'聊天总数：{data_length}')

    calendar_data = msg_db.get_messages_by_days(wxid, year)
    if not calendar_data:
        return False

    # Each row is indexed as (date string, message count).
    daily_counts = [row[1] for row in calendar_data]
    min_ = min(daily_counts)
    max_ = max(daily_counts)
    busiest_day = calendar_data[daily_counts.index(max_)][0]
    formatted_date = datetime.strptime(busiest_day, "%Y-%m-%d").strftime("%Y年%m月%d日")
    print(formatted_date)

    start_date_ = calendar_data[0][0]
    end_date_ = calendar_data[-1][0]
    print(start_date_, '---->', end_date_)

    # Inclusive number of days covered by the first and last rows.
    first_day = datetime.strptime(str(start_date_), "%Y-%m-%d")
    last_day = datetime.strptime(str(end_date_), "%Y-%m-%d")
    date_num = (last_day - first_day).days + 1
    print(date_num)

    if year == 'all':
        # No year selected: the calendar spans the data's own date range.
        calendar_days = (start_date_, end_date_)
        calendar_title = '和Ta的聊天情况'
    else:
        calendar_days = year
        calendar_title = f'{year}年聊天情况'

    size_opts = opts.InitOpts(width=f"{charts_width}px", height=f"{charts_height}px")
    visual_map = opts.VisualMapOpts(
        max_=max_,
        min_=min_,
        orient="horizontal",
        pos_bottom="0px",
        pos_left="0px",
    )
    chart = Calendar(init_opts=size_opts)
    chart.add("", calendar_data, calendar_opts=opts.CalendarOpts(range_=calendar_days))
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title=calendar_title),
        visualmap_opts=visual_map,
        legend_opts=opts.LegendOpts(is_show=False),
    )

    chart_data = chart.dump_options_with_quotes()
    first_dialogs = msg_db.get_first_time_of_message(wxid)
    return {
        'chart_data': chart_data,
        'data_length': data_length,
        'max_date': formatted_date,
        'max_num': str(max_),
        'date_num': str(date_num),
        'dialogs': first_dialogs,
    }


def month_count(wxid, year='all'):
    '''
        Number of messages per month.

        parameters:
            wxid: contact identifier, forwarded to msg_db.get_messages_by_month
            year: 'all' (default) analyzes every year; otherwise a year
                  string such as '2023'

        returns:
            dict with the keys
            chart_data: serialized Bar chart options
            txt: sentence describing how many months contain messages
            month_average_num: total message count divided by 12, rounded
            max_num_month: first month holding the highest count
            max_num: message count of that month
            min_num_month: first month holding the lowest count
            min_num: message count of that month

        raises:
            ValueError: when the query returns no rows (max() of an empty list)
    '''
    msg_data = msg_db.get_messages_by_month(wxid, year)
    # Each row is indexed as (month label, message count).
    x_axis = [row[0] for row in msg_data]
    y_data = [row[1] for row in msg_data]

    # Describe how many months contain at least one message.
    months_with_chat = sum(1 for y in y_data if y > 0)
    if year == 'all':
        conc = f"我们有{months_with_chat}个月都在聊天"
    elif months_with_chat == len(y_data):
        conc = "我们这一年每个月都有在聊天"
    else:
        conc = f"我们这一年有{months_with_chat}个月都在聊天"
    print("聊天月数", conc)

    # Monthly average.
    # NOTE(review): the divisor is always 12, even for year='all' — confirm
    # this is intended when the data spans more than one year.
    average_num = round(sum(y_data) / 12)
    print(f'月平均聊天条数:{average_num}')

    # Busiest and quietest month. list.index returns the first match, and the
    # quietest month is looked up with min_num (it used to reuse max_num, so
    # it always reported the busiest month).
    max_num = max(y_data)
    min_num = min(y_data)
    max_num_month = x_axis[y_data.index(max_num)]
    min_num_month = x_axis[y_data.index(min_num)]
    print(f'{max_num_month}月聊天条数:{max_num},{min_num_month}月聊天条数:{min_num}')

    m = (
        Bar(init_opts=opts.InitOpts(width=f"{charts_width}px", height=f"{charts_height}px"))
        .add_xaxis(x_axis)
        .add_yaxis("消息数量", y_data,
                   label_opts=opts.LabelOpts(is_show=False),
                   itemstyle_opts=opts.ItemStyleOpts(color="skyblue"),
                   )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="逐月统计", subtitle=None),
            datazoom_opts=opts.DataZoomOpts(),
            toolbox_opts=opts.ToolboxOpts(),
            visualmap_opts=opts.VisualMapOpts(
                min_=min_num,
                max_=max_num,
                dimension=1,  # map colors on the second dimension (the y axis)
                is_piecewise=False,  # continuous rather than piecewise legend
                range_color=["#66ccff", "#003366"],  # color gradient endpoints
                type_="color",
                pos_right="0%",
            ),
        )
    )

    return {
        'chart_data': m.dump_options_with_quotes(),
        'txt': conc,
        'month_average_num': average_num,
        'max_num_month': max_num_month,
        'max_num': max_num,
        'min_num_month': min_num_month,
        'min_num': min_num
    }


def hour_count(wxid, year='all'):
    '''
        Number of messages per hour of the day.

        parameters:
            wxid: contact identifier, forwarded to the msg_db queries
            year: 'all' (default) analyzes every year; otherwise a year
                  string such as '2023'

        returns:
            dict with the keys
            chart_data: serialized Line chart options
            max_num_hour: first hour holding the highest count
            max_num: message count of that hour
            late_data: value of msg_db.get_lateDay_messages
            early_data: value of msg_db.get_earlyDay_messages
    '''
    msg_data = msg_db.get_messages_by_hour(wxid, year)
    print(msg_data)

    # Each row is indexed as (hour label, message count).
    y_data = [row[1] for row in msg_data]
    x_axis = [row[0] for row in msg_data]
    max_num = max(y_data)
    max_num_hour = x_axis[y_data.index(max_num)]
    print(f'{max_num_hour}：{max_num}')

    # Markers for the extreme points and a line for the average.
    mark_points = opts.MarkPointOpts(
        data=[
            opts.MarkPointItem(type_="max", name="最大值"),
            opts.MarkPointItem(type_="min", name="最小值", value=10),
        ]
    )
    mark_line = opts.MarkLineOpts(
        data=[opts.MarkLineItem(type_="average", name="平均值")]
    )

    h = Line(init_opts=opts.InitOpts(width=f"{charts_width}px", height=f"{charts_height}px"))
    h.add_xaxis(xaxis_data=x_axis)
    h.add_yaxis(
        series_name="聊天频率",
        y_axis=y_data,
        markpoint_opts=mark_points,
        markline_opts=mark_line,
    )
    h.set_global_opts(title_opts=opts.TitleOpts(title="聊天时段", subtitle=None))
    h.set_series_opts(label_opts=opts.LabelOpts(is_show=False))

    late_data = msg_db.get_lateDay_messages(wxid, year)  # latest messages of a day
    early_data = msg_db.get_earlyDay_messages(wxid, year)  # earliest messages of a day
    print(late_data)
    print(early_data)

    result = {'chart_data': h.dump_options_with_quotes()}
    result['max_num_hour'] = max_num_hour
    result['max_num'] = max_num
    result['late_data'] = late_data
    result['early_data'] = early_data
    return result


def emoji_count(wxid, year='all'):
    '''
        Statistics on inline emoji and on sticker images.

        parameters:
            wxid: contact identifier, forwarded to the msg_db queries
            year: 'all' (default) analyzes every year; otherwise a year
                  string such as '2023'

        returns:
            dict with the keys
            ta_total_emoji_num: number of inline emoji sent by the other party
            me_total_emoji_num: number of inline emoji sent by me
            ta_max_emoji: the other party's 10 most used emoji, as
                          [(emoji, count)]
            me_max_emoji: my 10 most used emoji, as [(emoji, count)]
            MeImgDict: my sticker images from msg_db.get_emoji_Img, mapping
                       get_emoji(xml) to the count
            TaImgDict: the other party's sticker images, same format
    '''
    # Inline emoji appear in text messages as bracketed tokens.
    txt_messages = msg_db.get_messages_by_type(wxid, MsgType.TEXT, year)
    # Column 4 flags the sender (1 = me, 0 = the other party), column 7 holds the text.
    me_txt_messages = ''.join(row[7] for row in txt_messages if row[4] == 1)
    ta_txt_messages = ''.join(row[7] for row in txt_messages if row[4] == 0)

    pattern = re.compile(r"\[.+?\]")
    MeEmoji = pattern.findall(me_txt_messages)
    TaEmoji = pattern.findall(ta_txt_messages)

    # Totals and the ten most frequent emoji per side.
    ta_total_emoji_num = len(TaEmoji)
    me_total_emoji_num = len(MeEmoji)
    ta_max_emoji = Counter(TaEmoji).most_common(10)
    me_max_emoji = Counter(MeEmoji).most_common(10)
    print("ta发的表情数：", ta_total_emoji_num)
    print("我发的表情数：", me_total_emoji_num)
    print("---"*10)
    print("ta最常用的 10 个表情：\n", ta_max_emoji)
    print("---"*10)
    print("我最常用的 10 个表情：\n", me_max_emoji)

    # Most frequently sent sticker images, keyed by whatever get_emoji returns
    # for the sticker XML.
    MeImgList, TaImgList = msg_db.get_emoji_Img(wxid, year)
    MeImgDict = {get_emoji(xml): num for xml, num in MeImgList}
    TaImgDict = {get_emoji(xml): num for xml, num in TaImgList}
    return {
        'ta_total_emoji_num': ta_total_emoji_num,
        'me_total_emoji_num': me_total_emoji_num,
        'ta_max_emoji': ta_max_emoji,
        'me_max_emoji': me_max_emoji,
        'MeImgDict': MeImgDict,
        # This key used to repeat 'MeImgDict', so the other party's stickers
        # were computed but never returned.
        'TaImgDict': TaImgDict
    }


class Analysis:
    """Empty placeholder class; it defines no attributes or methods."""


if __name__ == '__main__':
    # Manual smoke run: executes the analyses against a local message database.
    msg_db.init_database(path='../DataBase/Msg/MSG.db')
    # w = wordcloud('wxid_0o18ef858vnu22')
    w_data = wordcloud('wxid_27hqbq7vx5hf22', '2023')
    # # print(w_data)
    # # w['chart_data'].render("./data/聊天统计/wordcloud.html")
    c = calendar_chart('wxid_27hqbq7vx5hf22', '2023')
    # c['chart_data'].render("./data/聊天统计/calendar.html")
    # # print('c:::', c)
    # month_count accepts (wxid, year) only; the former extra positional
    # argument made this call raise TypeError.
    m = month_count('wxid_27hqbq7vx5hf22', '2023')
    # m['chart_data'].render("./data/聊天统计/month_num.html")
    # h = hour_count('wxid_27hqbq7vx5hf22')
    # h['chart_data'].render("./data/聊天统计/hour_count.html")

    e = emoji_count('wxid_27hqbq7vx5hf22')