More than 8 years of messages analysis

Jul 7, 2019 4 min read

Total messages by Day

I downloaded my data from Facebook in a .json format. I used Python with Jupyter Notebook to play with data.

The json and pandas libraries are very useful to read and display data in a stylized way.

I added some columns to the data such as date (since the available time was a millisecond timestamp) and total characters for each row (each row represents a sent message).

Finally, I exported the data as an .xls file and opened it with Tableau to make the graphs.

# data time range : 2010-10-24 to 2019-07-05

# Imports

import json
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import operator

fileName = 'message_1.json'
orderedDays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

with open(fileName) as jsonFile:  
    data = json.load(jsonFile)
    dataList = []
    words = {}
    for index, message in enumerate(data['messages']):
        row = []
        row.append('Me' if message["sender_name"] == 'Mokhles Bouzaien' else 'Her')
        row.append(message['timestamp_ms'])
        try:
            row.append(message['content'])
        except KeyError:
            row.append(None)
            
        try:
            wordList = message["content"].split()
        except KeyError:
            pass
            
        for word in wordList:
            try:
                words[word] += 1
            except KeyError:
                words[word] = 1
                
        dataList.append(row)
dataList = np.array(dataList)
df = pd.DataFrame(dataList,columns=['sender', 'timestamp_ms', 'content'])
df['date'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000), axis = 1)
df['year'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).year, axis = 1)
df['month'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).month, axis = 1)
df['day'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).day, axis = 1)
df['weekday'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).strftime('%A'), axis = 1)
df['hour'] = df.apply(lambda row: datetime.datetime.fromtimestamp(row.timestamp_ms / 1000).hour, axis = 1)
df['caracters'] = df.apply(lambda row: len(row.content) if row.content != None else 0, axis = 1)
df.head()

	sender	timestamp_ms	content	date	year	month	day	weekday	hour	caracters
0	Her	1562350465800	Hidden Message	2019-07-05	2019	7	5	Friday	20	51
1	Her	1562350430104	Hidden Message	2019-07-05	2019	7	5	Friday	20	33
2	Her	1562350413998	Hidden Message	2019-07-05	2019	7	5	Friday	20	43
3	Me	1562350377019	Hidden Message	2019-07-05	2019	7	5	Friday	20	12
4	Me	1562350371595	Hidden Message	2019-07-05	2019	7	5	Friday	20	8

# Data is exported as .slsx file and imported to Tableau
df.to_excel("output.xlsx", engine='xlsxwriter')

sortedWords = sorted(words.items(), key=operator.itemgetter(1), reverse=True)

# group data by sender
dfBySender = df.groupby('sender')['caracters'].agg(['sum', 'count'])
# dfBySender = pd.DataFrame(dfBySender).reset_index()
dfBySender.head()

	sum	count
sender
Her	6351619	210679
Me	2953745	152407

# dfBySender.plot.pie(y='count', figsize=(5, 5))

# group data by date
dfByDate = df.groupby('date')['caracters'].agg(['sum', 'count'])
dfByDate.head()

	sum	count
date
2010-10-24	43	2
2010-10-26	9	1
2010-10-27	13	1
2010-12-17	19	1
2011-03-31	16	5

print(dfByDate.loc[dfByDate['sum'].idxmax()])
print(dfByDate.loc[dfByDate['count'].idxmax()])
print(dfByDate['sum'].mean())
print(dfByDate['count'].mean())

sum      36354
count      346
Name: 2012-07-23, dtype: int64
sum      22975
count     1171
Name: 2018-08-20, dtype: int64
3553.0217640320734
138.63535700649103

# plt.figure(figsize=(16,12))
# dfByDate['sum'].plot()
# plt.figure(figsize=(16,12))
# dfByDate['count'].plot()
# plt.figure(figsize=(16,6))
# plt.subplot(121)
# dfByDate.cumsum()['sum'].plot()
# plt.subplot(122)
# dfByDate.cumsum()['count'].plot()

# group data by month
dfByMonth = df.groupby(['year','month'])['caracters'].agg(['sum', 'count'])
dfByMonth = dfByMonth.reset_index()
dfByMonth.head()

	year	month	sum	count
0	2010	10	65	4
1	2010	12	19	1
2	2011	3	16	5
3	2011	7	2039	154
4	2011	8	177	22

# sns.set()
# caracters = dfByMonth.pivot("month", "year", "sum")
# messages = dfByMonth.pivot("month", "year", "count")
# f, ax = plt.subplots(figsize=(16, 12))
# sns.heatmap(caracters, annot=False, ax=ax, cmap='YlGnBu')
# f, ax = plt.subplots(figsize=(16, 12))
# sns.heatmap(messages, annot=False, ax=ax, cmap='YlGnBu')

# group data by year
dfByYear = df.groupby('year')['caracters'].agg(['sum', 'count'])
dfByYear = pd.DataFrame(dfByYear).reset_index()
dfByYear.head()

	year	sum	count
0	2010	84	5
1	2011	180787	6445
2	2012	1976590	53485
3	2013	817978	26960
4	2014	698706	22401

# group data by weekday
dfByWeekday = df.groupby('weekday')['caracters'].agg(['sum', 'count'])
dfByWeekday = dfByWeekday.reindex(orderedDays)
dfByWeekday = dfByWeekday.reset_index()
dfByWeekday

	weekday	sum	count
0	Monday	1383277	52854
1	Tuesday	1269405	50000
2	Wednesday	1248049	48457
3	Thursday	1179119	46607
4	Friday	1387426	54655
5	Saturday	1237290	49303
6	Sunday	1600798	61210

# # group data by date and weekday
# dfByDateWeekday = df.groupby(['date', 'weekday'])['caracters'].agg(['sum', 'count'])
# dfByDateWeekday = dfByDateWeekday.reset_index()
# dfByDateWeekday.head()

# sns.catplot(x="weekday", y="count", order=orderedDays, data=dfByDateWeekday, height=12)

# sns.catplot(x="weekday", y="count", order=orderedDays, kind='bar', data=dfByWeekday, height=12)

# sns.boxplot(x="weekday", y="count", data=dfByDateWeekday)

# group data by hour
dfByHour = df.groupby('hour')['caracters'].agg(['sum', 'count'])
dfByHour = dfByHour.reset_index()
dfByHour.head()

	hour	sum	count
0	0	629842	26814
1	1	410867	17451
2	2	239291	9799
3	3	176846	7712
4	4	123076	5105

# dfByHour['count'].plot.bar()

Reddit Post: https://www.reddit.com/r/dataisbeautiful/comments/caa9il/more_than_8_years_of_messages_between_my/

More than 8 years of messages analysis

Mokhles Bouzaien

Master of Science in Engineering Student