WhatsApp Chat Cleaner R Package
WhatsApp Chat Cleaner is a simple R package that helps in analyzing WhatsApp chats. It consists of simple functions that help you clean the raw chat data, including its emojis.
This package consists of the following eight simple functions:
install_load_packages, Install and Load Libraries
clean_whatsapp_chat, Cleans whatsapp chat data
media_remove, Remove media records
emoji_loader, Loads Emojis from file
emoji_human_color_ignore, Human Emojis [OPTIONAL]
emoji_replacer, Replace Emoji with name
make_emoji_count_column, Make column with emoji count
remove_unused_emoji, Remove unused emojis
Install and Load Libraries
This function installs and loads all the libraries required by this package.
# Install any missing dependencies of this package, then attach them.
#
# Takes no arguments and is called for its side effects: stringr, dplyr and
# zoo are installed when they are not already available and are then attached
# with library(). Returns the package names invisibly.
install_load_packages <- function() {
  required <- c("stringr", "dplyr", "zoo")

  # install.packages() expects all package names in ONE character vector (its
  # first argument); the following positional arguments are `lib` and `repos`,
  # so the names must not be passed as separate arguments.
  is_installed <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  missing_pkgs <- required[!is_installed]
  if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
  }

  for (pkg in required) {
    library(pkg, character.only = TRUE)
  }
  invisible(required)
}
Cleans WhatsApp chat data
This function takes the raw chat file and cleans the data for later analysis. See the detailed explanation in Cleaning WhatsApp Chat.
# Parse a raw WhatsApp chat export into a data frame.
#
# filepath: path of the exported chat text file. Message lines are expected to
#           look like "<m/d/yy>, <h:mm AM/PM> - <sender>: <message>".
#
# Returns a data frame with one row per input line (the first line of the file
# is discarded) and the columns
#   date    - "%m/%d/%y" text,
#   time    - 24-hour "%H:%M" text,
#   sender  - factor,
#   message - character.
# A line that does not start with a timestamp is kept as its own row: the whole
# line becomes the message, and date/time/sender are carried forward from the
# closest preceding row (they stay NA when there is no preceding row).
#
# Relies on str_trim (stringr) and na.locf (zoo) being attached.
clean_whatsapp_chat <- function(filepath) {
  all_data <- readLines(filepath)
  # Removing the first message
  all_data <- all_data[-1]

  # Extracting date and time; lines without a leading timestamp give NA
  date_time <- format(strptime(all_data, "%m/%d/%y, %I:%M %p"), "%m/%d/%y, %H:%M")
  has_stamp <- !is.na(date_time)

  date <- gsub(",.*$", "", date_time)            # everything before ","
  time <- str_trim(gsub("^.*,", "", date_time))  # everything after ","

  # "<sender>: <message>" part of every timestamped line
  sender_message <- str_trim(gsub("^.*?-", "", all_data[has_stamp]))

  # Untimestamped lines keep their full text as the message
  message <- all_data
  message[has_stamp] <- str_trim(gsub("^.*?:", "", sender_message))

  # Sender starts out as NA (no textual placeholder), so a message that merely
  # reads "sender" can never be mistaken for a missing value.
  sender <- rep(NA_character_, length(all_data))
  sender[has_stamp] <- str_trim(gsub(":.*$", "", sender_message))

  clean_data <- data.frame(date = date, time = time, sender = sender,
                           message = message, stringsAsFactors = FALSE)

  # Filling NA with previous values (na.locf from the zoo package)
  # Detailed explanation > www.tensorflowhub.org
  # na.rm = FALSE keeps leading NAs in place; the default would drop them and
  # return a vector shorter than the data frame.
  if (nrow(clean_data) > 0) {
    clean_data$date <- na.locf(clean_data$date, na.rm = FALSE)
    clean_data$time <- na.locf(clean_data$time, na.rm = FALSE)
    clean_data$sender <- na.locf(clean_data$sender, na.rm = FALSE)
  }

  # Refactorizing
  clean_data$sender <- as.factor(clean_data$sender)
  return(clean_data)
}
Remove media records
This function removes all the media records from the WhatsApp chat data frame; use it if you want them to be ignored.
# Drop the rows of a chat data frame that hold a media placeholder.
#
# clean_data: chat data frame (e.g. the result of clean_whatsapp_chat).
#
# Every cell equal to "<Media omitted>" is blanked to NA and only complete
# rows are returned, so rows that already contained an NA are dropped as well.
media_remove <- function(clean_data) {
  scrubbed <- clean_data
  is_media_cell <- scrubbed == "<Media omitted>"
  scrubbed[is_media_cell] <- NA

  keep_row <- complete.cases(scrubbed)
  scrubbed[keep_row, ]
}
Load Emojis from file
This function loads the emojis from the file, updates the emoji names to make them unique, and escapes regex special symbols such as . and $ with backslashes so that they can be used in gsub regular expressions. It also removes the white space from both ends.
# Read the emoji lookup table from a CSV file and prepare it for regex use.
#
# filepath: path of a CSV file with a `Names` and a `Symbols` column.
#
# Returns the data frame read from the file with
#   * Names   prefixed with "emoji-" and with spaces replaced by "-";
#   * Symbols with the characters $ . ^ * [ ] ? ( ) backslash-escaped and the
#     surrounding whitespace removed (str_trim from stringr).
emoji_loader <- function(filepath) {
  emoji_table <- read.csv(filepath)

  # Prefix each name with "emoji-" so the names can be matched accurately later
  prefixed_names <- paste0("emoji-", emoji_table$Names)
  emoji_table$Names <- gsub(" ", "-", prefixed_names)

  # Put a backslash in front of each regex special symbol, one symbol at a time
  special_patterns <- c("\\$", "\\.", "\\^", "\\*", "\\[", "\\]", "\\?", "\\(", "\\)")
  escaped_forms <- c("\\\\$", "\\\\.", "\\\\^", "\\\\*", "\\\\[", "\\\\]", "\\\\?",
                     "\\\\(", "\\\\)")
  escape_step <- function(symbols, k) {
    gsub(special_patterns[k], escaped_forms[k], symbols)
  }
  escaped_symbols <- Reduce(escape_step, seq_along(special_patterns),
                            emoji_table$Symbols)

  # Strip whitespace from both ends
  emoji_table$Symbols <- str_trim(escaped_symbols)
  emoji_table
}
Human Emojis [OPTIONAL]
This function ignores the skin colors of all human (body/face/hand) emojis; use it only when you want all six tones of each emoji to be treated as the same emoji.
# Collapse emoji name variants by cutting each name at its first ":".
#
# emoji_data: emoji table with a `Names` column.
#
# Returns emoji_data with everything from the first ":" to the end of each
# name removed; names without a ":" are left unchanged.
emoji_human_color_ignore <- function(emoji_data) {
  trimmed_names <- gsub(":.*$", "", emoji_data$Names)
  emoji_data$Names <- trimmed_names
  emoji_data
}
Replace Emoji with name
This function replaces all the emojis with their meanings/names from the emoji files.
# Replace every emoji found in the messages with its name.
#
# emoji_data: emoji table; column 1 holds the replacement names and column 2
#             the (regex-escaped) emoji symbols used as patterns.
# text_data:  chat data frame with a `message` column.
#
# Returns text_data with `message` rewritten; the other columns are untouched.
# Replacements are applied in the row order of emoji_data.
emoji_replacer <- function(emoji_data, text_data) {
  emoji_names <- as.character(emoji_data[, 1])
  emoji_patterns <- as.character(emoji_data[, 2])

  # Only `message` is rewritten; no other column of text_data is read, so the
  # frame does not need a particular number of columns.
  for (i in seq_along(emoji_patterns)) {
    text_data$message <- gsub(emoji_patterns[i], emoji_names[i], text_data$message)
  }
  return(text_data)
}
Make column with emoji count
This function makes a column for each emoji, holding the number of times that emoji occurs in each message.
# Add one count column per emoji name to a chat data frame.
#
# emoji_data: emoji table with a `Names` column.
# text_data:  chat data frame with a `message` column in which the emojis have
#             already been replaced by their names.
#
# Returns text_data extended by one integer column per distinct name, holding
# how many times that name occurs in each message (NA for an NA message).
# Names are matched as literal substrings, so a name that is a prefix of a
# longer name is also counted inside the longer one.
make_emoji_count_column <- function(emoji_data, text_data) {
  # Duplicate names would only recompute the same column
  emoji_names <- unique(as.character(emoji_data$Names))
  messages <- as.character(text_data$message)

  with_counts <- text_data
  for (emoji_name in emoji_names) {
    # fixed = TRUE: names are plain labels and may contain characters such as
    # "(", ")" or "*" that would otherwise be interpreted as regex syntax.
    hits <- gregexpr(emoji_name, messages, fixed = TRUE)
    # gregexpr reports -1 for "no match" and NA for an NA message
    with_counts[, emoji_name] <- vapply(hits, function(pos) sum(pos > 0L), integer(1))
  }
  return(with_counts)
}
Remove unused emojis
This function removes unused emojis by checking their sum. It also updates the column names back to the original names.
# Drop emoji count columns whose total is zero and strip the "emoji-" prefix
# from the column names.
#
# text_data:       chat data frame carrying emoji count columns.
# first_emoji_col: index of the first column to treat as an emoji count column
#                  (default 6, the value that used to be hard-coded).
#                  NOTE(review): clean_whatsapp_chat yields four columns, so
#                  counts added directly after it start at column 5 - confirm
#                  the intended default.
#
# Prints "<column> = <total>" for every inspected column and returns the data
# frame without the all-zero columns. NA counts are ignored when summing.
remove_unused_emoji <- function(text_data, first_emoji_col = 6) {
  n_cols <- length(text_data)

  # Guard: with fewer columns than first_emoji_col there is nothing to inspect
  # (a bare first_emoji_col:n_cols would count downwards instead).
  if (n_cols >= first_emoji_col) {
    emoji_cols <- seq(first_emoji_col, n_cols)
    totals <- vapply(text_data[emoji_cols], sum, numeric(1), na.rm = TRUE)
    cat(sprintf("%s = %i \n", names(text_data)[emoji_cols], totals), sep = "")

    # Removing unused emojis. Negative indexing with an empty vector would
    # select NO columns at all, so only subset when something is unused.
    unused_emoji_index <- emoji_cols[totals == 0]
    if (length(unused_emoji_index) > 0) {
      text_data <- text_data[, -unused_emoji_index, drop = FALSE]
    }
  }

  # Removing emoji- from column names
  names(text_data) <- gsub("^.*?emoji-", "", names(text_data))
  return(text_data)
}
Github Repo for this package can be found here.
You can also directly download from GitHub using the following commands:
install_github('MJFND/R/WhatsappChatCleanerPackage/WhatsAppChatCleaner')
library('WhatsAppChatCleaner')
This package also requires you to download the emoji data sets from here. These data sets do not contain all the emojis; everyone is welcome to contribute and update the list. We would also love to see more functionality added to this package.