聊天记录可视化

有了之前两篇的数据，接下来就可以生成一些好玩的报告了😋

可以用之前学过的 ipywidgets 做交互式的界面，大部分交给 ai，细节部分自己调就好了。

废话 & 叠甲 & 提醒#

感谢某位好友几个月内的 22329 条聊天数据支持。
下面词云图片的 args 是 month='全部', user='用户1'，都是我发出去的消息~~，我侵我自己的权~~，情感分析里面的 content 也都是我发的，如果仍然介意我就换别人的图。
jieba 分词时用的 stopwords.txt 参考了 SmartPorridge/Wechat_QQ_wordcloud 并额外屏蔽了几个人名。
QQ 那篇里面声明过代码运行的环境是 macOS 15.5，其他版本或者操作系统可能需要微调，比如字体。
下面的代码依赖: pandas, numpy, matplotlib, seaborn, scipy, ipywidgets, wordcloud, jieba, Pillow, snownlp。如果已经安装了 Anaconda，在 base 环境的基础上只需要先 `conda install ipywidgets snownlp`，再 `pip install wordcloud jieba`。
~~🐦的训练赛题解明天考完科二就补……~~(已经是明天了)

预处理 & 初始化设置#

1
import pandas as pd
2
from datetime import datetime
3
import numpy as np
4
import seaborn as sns
5
import matplotlib.pyplot as plt
6
from ipywidgets import interact, widgets
7

8
# Load the two exported chat logs and stack them into one frame.
df_qq, df_wx = (pd.read_csv(path) for path in ('qq.csv', 'wx.csv'))
df = pd.concat([df_qq, df_wx], ignore_index=True)

# Parse the timestamp column so the .dt accessor can be used below.
df['time'] = pd.to_datetime(df['time'])

# Matplotlib setup: a CJK-capable font and a minus sign that renders with it.
plt.rcParams.update({
    'font.family': 'Heiti TC',
    'axes.unicode_minus': False,
})

# Helper columns derived from the timestamp.
df = df.assign(
    month=lambda frame: frame["time"].dt.to_period("M").astype(str),
    date=lambda frame: frame["time"].dt.date,
    weekday=lambda frame: frame["time"].dt.day_name(),
)

# Display names for the two participants (the "user" column is assumed to
# hold True / False).
user_labels = {True: "用户1", False: "用户2"}

数据可视化#

可以做很多有意思的分析，比如

每日聊天记录条数柱状图#

按月分#

1
from scipy.interpolate import make_interp_spline
2

3
def plot_daily_counts_by_month(month):
    """Plot one month's messages per day as stacked bars plus a total line.

    Args:
        month: Month key, compared against the module-level ``df["month"]``
            column.

    Reads the module-level ``df`` and displays the figure via ``plt.show()``;
    nothing is returned.
    """
    month_df = df[df["month"] == month].copy()
    month_df["date"] = month_df["time"].dt.date

    # Messages per day and per user.
    daily_user_counts = month_df.groupby(["date", "user"]).size().unstack(fill_value=0)
    # A month in which only one side wrote has a single column; add the
    # missing one as zeros so the stacked bars below never raise KeyError.
    for flag in (True, False):
        if flag not in daily_user_counts.columns:
            daily_user_counts[flag] = 0
    daily_user_counts["总计"] = daily_user_counts.sum(axis=1)

    dates = daily_user_counts.index
    x = np.arange(len(dates))
    y = daily_user_counts["总计"].values

    # Palette
    color_user1 = "#8ecae6"
    color_user2 = "#ffb3b3"
    color_line = "#023047"

    plt.figure(figsize=(10, 6))

    # Stacked bars: user 1 at the bottom, user 2 on top.
    plt.bar(x, daily_user_counts[True], label="用户1", color=color_user1)
    plt.bar(x, daily_user_counts[False], bottom=daily_user_counts[True], label="用户2", color=color_user2)

    # Smoothed total line; a cubic spline needs at least k + 1 = 4 points,
    # otherwise fall back to a plain polyline.
    if len(x) >= 4:
        x_smooth = np.linspace(x.min(), x.max(), 300)
        spline = make_interp_spline(x, y, k=3)
        y_smooth = spline(x_smooth)
        plt.plot(x_smooth, y_smooth, color=color_line, linewidth=2, label="总聊天数")
    else:
        plt.plot(x, y, color=color_line, linewidth=2, label="总聊天数")

    # Label roughly 15 ticks with the real day of month, zero-padded.
    # (The previous spec " 02d" was parsed as "space sign", giving " 5".)
    step = max(1, len(x) // 15)
    plt.xticks(ticks=x[::step], labels=[f"{d.day:02d}" for d in dates[::step]])

    plt.title(f"{month} 每日聊天记录数量")
    plt.xlabel("日期")
    plt.ylabel("聊天数")
    plt.legend()
    plt.tight_layout()
    plt.show()


# Interactive month picker.
interact(plot_daily_counts_by_month, month=widgets.Dropdown(options=sorted(df["month"].unique())))

全部的#

1
# Messages per day and per user, plus a per-day total column.
daily_user_counts = df.groupby(["date", "user"]).size().unstack(fill_value=0)
daily_user_counts["总计"] = daily_user_counts.sum(axis=1)

days = daily_user_counts.index
fig, ax = plt.subplots(figsize=(12, 6))

# Stacked bars: user 1 at the bottom, user 2 on top.
ax.bar(days, daily_user_counts[True], label="用户1", color="#90caf9")
ax.bar(days, daily_user_counts[False], bottom=daily_user_counts[True], label="用户2", color="#f48fb1")

# Thin line tracing the daily total.
ax.plot(days, daily_user_counts["总计"], label="总聊天数", color="black", linewidth=1)

ax.set_title("每日聊天记录数量")
ax.set_xlabel("日期")
ax.set_ylabel("聊天数")
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

去年 12 月 1 号发生什么了🤯

各月中不同星期的聊天记录热力图#

1
# Message counts per (month, weekday). Reindexing fixes the column order and
# inserts a zero column for any weekday that never occurs, where plain
# column selection would raise KeyError.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
weekday_counts = (
    df.groupby(["month", "weekday"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=weekday_order, fill_value=0)
)

plt.figure(figsize=(12, 6))
# Transposed so weekdays are rows and months are columns.
sns.heatmap(weekday_counts.T, cmap="YlGnBu", annot=True, fmt=".0f")
plt.title("各月中不同星期的聊天记录热力图")
plt.xlabel("月份")
plt.ylabel("星期")
plt.tight_layout()
plt.show()

显然，经常周天晚上聊到周一凌晨……

各月聊天时间分布图#

1
# Hour of day for every message.
df["hour"] = df["time"].dt.hour

# Two-hour buckets: 00-02, 02-04, ..., 22-24 (left-closed).
bins = list(range(0, 25, 2))
labels = [f"{start:02d}-{end:02d}" for start, end in zip(bins[:-1], bins[1:])]
df["time_bin"] = pd.cut(df["hour"], bins=bins, labels=labels, right=False)

# Message count per month and bucket.
pivot = df.pivot_table(index="month", columns="time_bin", aggfunc="size", fill_value=0, observed=False)

# Row-normalise to shares, keep the buckets in chronological order, and
# sort the months by their total message count.
month_totals = pivot.sum(axis=1)
sorted_index = month_totals.sort_values().index
percent = pivot.div(month_totals, axis=0)[labels].loc[sorted_index]

fig, ax = plt.subplots(figsize=(12, len(percent) * 0.6))
colors = sns.color_palette("Spectral", len(labels))

# Horizontal stacked bars: each bucket starts where the previous one ended.
left = np.zeros(len(percent))
for color, col in zip(colors, percent.columns):
    share = percent[col].to_numpy()
    ax.barh(percent.index, share, left=left, label=col, color=color)
    left = left + share

# Total message count written just right of each bar.
totals = month_totals.loc[sorted_index]
for row, total in enumerate(totals):
    ax.text(1.01, row, f"{total}条", va="center", fontsize=10)

ax.set_title("各月聊天时间段分布（百分比）")
ax.set_xlabel("比例")
ax.set_xlim(0, 1)
ax.set_ylabel("月份")
ax.legend(
    title="时间段",
    loc="upper center",
    bbox_to_anchor=(0.5, -0.12),
    ncol=6,
    frameon=False,
)
plt.tight_layout()
plt.show()

验证了上述猜想，确实是经常从晚上聊到凌晨……

聊天关键词——词云图#

词云这里面可以自定义很多参数，比如

1
import jieba
2
from wordcloud import WordCloud, ImageColorGenerator
3
from PIL import Image
4
import numpy as np
5
import matplotlib.pyplot as plt
6
from ipywidgets import interact, widgets
7

8
# Load the stop-word list once; blank lines are ignored.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}


def generate_wordcloud(month, user):
    """Render a word cloud for the messages matching ``month`` and ``user``.

    Args:
        month: A value of ``df["month"]``, or "全部" for no month filter.
        user: "用户1", "用户2", or "全部" for no user filter.

    Reads the module-level ``df`` and ``stopwords``, plus the files
    ``mask.jpeg`` and ``fonts/NotoSansSC-Regular.ttf``. Shows the figure with
    ``plt.show()``; prints a notice and returns early when no tokens remain
    after filtering.
    """
    # Build one boolean mask instead of enumerating all four combinations.
    selected = pd.Series(True, index=df.index)
    if month != "全部":
        selected &= df["month"] == month
    if user != "全部":
        selected &= df["user"] == (user == "用户1")
    sub_df = df[selected]

    # Join every non-empty message into a single text.
    text = "\n".join(str(x) for x in sub_df["content"] if pd.notnull(x) and str(x).strip())

    # Tokenise, then drop blanks and stop words. The stop-word test uses the
    # stripped token so it compares the same string that is kept.
    tokens = (seg.strip() for seg in jieba.cut(text))
    segs = [tok for tok in tokens if tok and tok not in stopwords]
    if not segs:
        print("无内容可生成词云")
        return

    # The mask image drives both the cloud's shape and its colours; the
    # context manager closes the file once the pixels are copied.
    with Image.open("mask.jpeg") as mask_image:
        coloring = np.array(mask_image)

    wc = WordCloud(
        font_path='fonts/NotoSansSC-Regular.ttf',
        background_color="white",
        max_words=400,
        mask=coloring,
        stopwords=stopwords,
        max_font_size=400,
        random_state=42,
    )
    wc.generate(' '.join(segs))
    image_colors = ImageColorGenerator(coloring)
    wc.recolor(color_func=image_colors)

    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"{month} - {user} 词云")
    plt.show()


# Interactive controls.
month_options = ["全部"] + sorted(df["month"].unique())
user_options = ["全部", "用户1", "用户2"]
interact(
    generate_wordcloud,
    month=widgets.Dropdown(options=month_options, description="月份"),
    user=widgets.Dropdown(options=user_options, description="用户"),
)

聊天时长直方图#

没有上下文、直接按时间间隔这样判断，很容易把本来能续上的聊天截断，所以后面大部分就看个乐子了。

1
# Chronological order with a fresh 0..n-1 index.
df = df.sort_values("time").reset_index(drop=True)

# Gap to the previous message in seconds (0 for the first message).
df["time_diff"] = df["time"].diff().dt.total_seconds().fillna(0)

# A gap longer than the threshold (seconds) starts a new session.
session_threshold = 600
df["session_id"] = df["time_diff"].gt(session_threshold).cumsum()

# Start, end and message count of every session.
sessions = df.groupby("session_id")["time"].agg(
    start_time="first",
    end_time="last",
    message_count="count",
)
sessions["duration_sec"] = (sessions["end_time"] - sessions["start_time"]).dt.total_seconds()
sessions["duration_min"] = sessions["duration_sec"] / 60

# Ignore sessions consisting of a single message.
sessions = sessions.loc[sessions["message_count"] > 1]

# Histogram over log1p(minutes) to compress the long tail.
log_values = np.log1p(sessions["duration_min"])

plt.figure(figsize=(10, 6))
plt.hist(log_values, bins=40, color="#f48fb1", edgecolor="black")

# Tick positions are in log space, tick labels show the raw minutes.
ticks_raw = [1, 5, 10, 30, 60, 120, 300]
ticks_log = np.log1p(ticks_raw)
tick_labels = [str(t) for t in ticks_raw]
plt.xticks(ticks=ticks_log, labels=tick_labels)

plt.xlabel("每次聊天持续时间（分钟）")
plt.ylabel("聊天轮次数")
plt.title("聊天持续时间分布（对数压缩，横轴显示原始分钟）")
plt.tight_layout()
plt.show()

每轮聊天双方发言文本长度柱状图#

超过 30 min 的对话中双方的文本长度，一定程度上反映了聊天（最好不是吵架）的主动权。

1
# Duration of every session in minutes (requires df["session_id"]).
session_duration = df.groupby("session_id")["time"].agg(["first", "last"])
session_duration["duration_min"] = (session_duration["last"] - session_duration["first"]).dt.total_seconds() / 60
long_sessions = session_duration[session_duration["duration_min"] > 30].index

# Keep only messages with content that belong to a session longer than 30 min.
df_long = df[df["session_id"].isin(long_sessions) & df["content"].notna()].copy()
df_long["text_length"] = df_long["content"].astype(str).str.len()

# Total text length per session and user. Reindexing guarantees both user
# columns exist (zero-filled) even if one side never wrote in a long
# session; renaming avoids boolean column names.
length_by_user = (
    df_long.groupby(["session_id", "user"])["text_length"]
    .sum()
    .unstack(fill_value=0)
    .reindex(columns=[True, False], fill_value=0)
    .rename(columns={True: "用户1", False: "用户2"})
)

# Per-session total and the user who wrote more (not plotted below).
length_by_user["total"] = length_by_user["用户1"] + length_by_user["用户2"]
length_by_user["dominant_user"] = length_by_user[["用户1", "用户2"]].idxmax(axis=1)

length_by_user[["用户1", "用户2"]].plot(kind="bar", stacked=True, figsize=(14, 6), color=["#8ecae6", "#ffb3b3"])
plt.title("每轮聊天中双方发言的文本长度（大于30分钟）")
plt.xticks([])  # one bar per session; hide the tick labels
plt.xlabel("session")
plt.ylabel("文本长度")
plt.legend()
plt.tight_layout()
plt.show()

可以看出来，我话比较多……但这并不一定(~~一定不~~)是件好事。

聊天内容情感分析#

SnowNLP 的训练数据主要是买卖东西时的评价，所以不是很准，列举几个比较离谱的

content	sentiment
睡醒了？	0.07963618947920525
那个号只有个位数的英雄	0.9351870482742778
昂	0.8
我本来就有三张改名卡	0.02719995770409467
[图片]	0.5826086956521737
…	…

另外我让他分析 “我要上吊” 竟然有 0.5455370938053228 的高分，总之他特抽象，大部分也是看个乐子。

1
conda install snownlp

1
from snownlp import SnowNLP
2

3
def get_sentiment(text):
    """Return SnowNLP's sentiment score for ``text``, or None if unavailable.

    Args:
        text: Message content. Non-string values are converted with ``str``;
            ``None``, NaN and blank strings are treated as "no content".

    Returns:
        The value of ``SnowNLP(text).sentiments`` (0 ~ 1, larger is more
        positive), or ``None`` when there is no content or scoring fails.
    """
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return None
        text = str(text)
    if not text.strip():
        return None
    try:
        return SnowNLP(text).sentiments
    except Exception:
        # Best effort: one unscorable message must not abort a whole
        # DataFrame.apply. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit through.
        return None
9

10
# Score only rows that actually have content. Casting the whole column with
# astype(str) would turn missing values into the literal string "nan" and
# score that; rows without content are left as NaN by index alignment.
has_content = df["content"].notna()
df["sentiment"] = df.loc[has_content, "content"].astype(str).apply(get_sentiment)

谁的情绪更稳定——箱线图#

1
# Map the boolean user flag to display names and pass the order explicitly,
# rather than overwriting tick labels by position and relying on seaborn
# happening to place False before True.
boxplot_df = df.assign(user_label=df["user"].map(user_labels))
sns.boxplot(
    data=boxplot_df,
    x="user_label",
    y="sentiment",
    order=[user_labels[False], user_labels[True]],
)
plt.xlabel("user")
plt.title("不同用户的情绪分布")
plt.tight_layout()
plt.show()

显然，我情绪不稳定，而且还消极……

每日平均情绪变化——折线图#

1
# Mean sentiment per day, one column per user.
daily_sentiment = df.groupby(["date", "user"])["sentiment"].mean().unstack()

plt.figure(figsize=(12, 4))

# One line per user, drawn in the same order and colours as before.
for flag, name, line_color in (
    (True, "用户1", "#8ecae6"),
    (False, "用户2", "#ffb3b3"),
):
    plt.plot(daily_sentiment.index, daily_sentiment[flag], label=name, color=line_color)

# Reference line at the neutral score.
plt.axhline(0.5, color="gray", linestyle="--", label="中性线")

plt.title("每日平均情绪变化")
plt.ylabel("情绪值（0=负面，1=正面）")
plt.xlabel("日期")
plt.legend()
plt.tight_layout()
plt.show()

后面数据比较少了，可能受极端值影响太大了，波动非常剧烈。前面的数据相对还正常一点，一眼就可以看出来，红线在大部分时间都比蓝线积极，~~而且好像还负相关~~。

总结 & 建议#

学都学会了，快去祸害自己的好友吧~~~

另外如果有其他有意思的建议可以在下面的 comments 里提。

Star's Blog