import pandas as pd
import datetime as dt
import numpy as np
import requests
import tweepy
import json
import re
from nltk import pos_tag
# Load the WeRateDogs twitter archive and key every row by its tweet id.
archive = pd.read_csv('twitter-archive-enhanced.csv').set_index('tweet_id')
archive.head(2)
# Download the neural-network image-prediction table and cache it locally.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail loudly on an HTTP error instead of silently writing an error page to disk.
response.raise_for_status()
with open(url.split('/')[-1], mode='wb') as file:
    file.write(response.content)
images = pd.read_csv('image-predictions.tsv', sep='\t')
images.head(2)
# Twitter API credentials — redacted for sharing; the original `= #######`
# lines were syntax errors (assignment to a comment). Substitute real keys.
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_token_secret = 'HIDDEN'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# JSONParser makes get_status return plain dicts;
# wait_on_rate_limit sleeps through Twitter's rate-limit windows.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(), wait_on_rate_limit=True)
# Query Twitter for favourite/retweet counts of every tweet that has an image
# prediction. Ids that fail (deleted/protected tweets) are kept for a retry.
df_list = []
e_list = []
tweet_id = images['tweet_id']
for tid in tweet_id:  # renamed from `id` — don't shadow the builtin
    try:
        page = api.get_status(tid)
        # (the original also parsed page['created_at'] into an unused variable)
        df_list.append({'tweet_id': int(tid),
                        'favorites': int(page['favorite_count']),
                        'retweet_count': int(page['retweet_count'])})
    except Exception:
        # tweet no longer accessible — remember the id for the retry pass
        e_list.append(tid)
e_list
# I ran the df_list code a few times and each time I had the same few dud links but also a few different working links.
# The working links were different each time so running the code again made the most sense.
ee_list = []
for e in e_list:
    try:
        page = api.get_status(e)
        df_list.append({'tweet_id': int(e),
                        'favorites': int(page['favorite_count']),
                        'retweet_count': int(page['retweet_count'])})
    except Exception:
        # BUG FIX: the original appended the stale `id` variable from the
        # previous loop here instead of `e`, so permanently-failing ids were
        # recorded incorrectly.
        ee_list.append(e)
# Persist the gathered counts, then reload them so later sessions can resume
# here without re-hitting the API.
counts = pd.DataFrame(df_list, columns=['tweet_id', 'favorites', 'retweet_count'])
counts.to_csv('tweet_json.txt', encoding='utf-8')
jsondf = pd.read_csv('tweet_json.txt', encoding='utf-8').set_index('tweet_id')
jsondf.tail(4)
# Join everything on tweet_id: archive + image predictions + API counts.
images.set_index('tweet_id', inplace=True)
df = archive.merge(images, left_index=True, right_index=True, how='left')
df = df.merge(jsondf, left_index=True, right_index=True, how='left')
df.to_csv('dfcopy.csv', encoding='utf-8')  # back the merged frame up to a file
df = pd.read_csv('dfcopy.csv')  # then restart here whenever reloading the notebook
df
# Visual assessment of the merged frame: dtypes, summary stats, and the
# distributions of the columns cleaned in the steps below.
df.info()
df.describe()
df['rating_numerator'].value_counts()
df['rating_denominator'].value_counts()
# NOTE(review): the scraped 'name' column appears to contain pseudo-names like
# 'a'/'the'/'an' (see the comparison further down) — re-extracted later.
df['name'].value_counts()
# The two markdown headings below were pasted into the script as bare
# expressions, which is invalid Python; preserved here as comments.
# (accuracy, validity, consistency, completeness)
# (structural issues)
del df['Unnamed: 0']  # code: drop the surrogate index column from the CSV round-trip
list(df)  # test
# Timestamps arrive as strings from the CSV round-trip — convert them back.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Pull the first hashtag (if any) out of each tweet's text.
# There will be a lot of empty values but the info is interesting.
df['hashtag'] = df['text'].str.extract(r"#(\w+)", expand=True)
df['hashtag'].value_counts()
df.info()
# Before proceeding, drop the rows we won't use: rows without a jpg_url never
# got an image prediction, and rows with a retweeted_status_id are retweets
# rather than original ratings.
df = df[df['jpg_url'].notnull()]
df = df[df['retweeted_status_id'].isnull()]
len(df['text'])
# The retweet bookkeeping columns are now all empty, so drop them too.
df.drop(['retweeted_status_id', 'retweeted_status_user_id',
         'retweeted_status_timestamp'], axis=1, inplace=True)
list(df)
# Condense the four one-hot dog-stage columns into a single 'dog_type' column
# by re-scanning the tweet text. Dict order fixes the match priority
# (puppo > pupper > doggo > floof), same as the original list scan.
_STAGE_LABELS = {'puppo': 'puppo', 'pupper': 'pupper',
                 'doggo': 'doggo', 'floof': 'floofer'}

def _stage_from_text(text):
    """Return the first stage keyword found in *text*, or 'None'."""
    lowered = text.lower()
    for keyword, label in _STAGE_LABELS.items():
        if keyword in lowered:
            return label
    return 'None'

df['dog_type'] = [_stage_from_text(text) for text in df['text']]
df['dog_type'].value_counts()
# The four separate stage columns are now redundant.
df.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1, inplace=True)
# Next, condense the three ranked breed predictions into one breed + confidence.
breed = []
confidence = []

def breed_confidence(row):
    """Record the highest-ranked prediction for *row* that is actually a dog.

    Appends one entry each to the module-level `breed` and `confidence` lists:
    the first of p1/p2/p3 whose pN_dog flag is set, or 'Unidentifiable' with
    confidence 0 when none of the classifier's three guesses was a dog.
    """
    # idiom fix: truthiness instead of the original `== True` comparisons
    if row['p1_dog']:
        breed.append(row['p1'])
        confidence.append(row['p1_conf'])
    elif row['p2_dog']:
        breed.append(row['p2'])
        confidence.append(row['p2_conf'])
    elif row['p3_dog']:
        breed.append(row['p3'])
        confidence.append(row['p3_conf'])
    else:
        breed.append('Unidentifiable')
        confidence.append(0)
# this isn't about the parsing AI so the rest of this data is deletable
# apply() is used purely for its side effect: breed_confidence fills the
# module-level `breed`/`confidence` lists, one entry per row, which are then
# attached as new columns.
df.apply(breed_confidence, axis=1)
df['breed'] = breed
df['confidence'] = confidence
df.head()
# Column names (rather than positions) are used deliberately, so an accidental
# second run raises instead of dropping the wrong columns.
df.drop(['p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf', 'p2_dog',
         'p3', 'p3_conf', 'p3_dog'], axis=1, inplace=True)
df.head()
df['in_reply_to_status_id'].value_counts()
df['in_reply_to_user_id'].value_counts()
# Every in_reply_to_user_id is 4196983835 (@dog_rates itself), so neither
# reply column carries useful information.
df.drop(['in_reply_to_status_id', 'in_reply_to_user_id'], axis=1, inplace=True)
df['source'].value_counts()  # not sure if data worth keeping, will keep for now
# Parse "<numerator>/<denominator>" ratings out of each tweet's text.
# Denominators are multiples of ten; a denominator > 10 is a group rating
# (e.g. 121/110 rates 11 dogs at 11/10 each).
rating_re = re.compile(r'(\d+(\.\d+)|(\d+))\/(\d+0)')
# findall returns one tuple per match: [0] is the numerator, [-1] the denominator.
rates = [rating_re.findall(text) for text in df['text']]
numerator = []
dog_count = []
for item in rates:
    if len(item) == 0:
        # a picture but no rating: still one dog, rating unknown
        # (real NaN instead of the original string 'NaN', keeping the column numeric)
        numerator.append(float('nan'))
        dog_count.append(1)
    elif len(item) == 1 and item[0][-1] == '10':
        # the normal case: a single n/10 rating for a single dog
        numerator.append(float(item[0][0]))
        dog_count.append(1)
    elif len(item) == 1:
        # group rating: n/(10*k) rates k dogs; normalise to a per-dog /10 score
        group_size = float(item[0][-1]) / 10
        numerator.append(float(item[0][0]) / group_size)
        dog_count.append(group_size)
    else:
        # several ratings in one tweet; ignore matches whose denominator is not
        # 10 (one tweet contains the phrase '50/50')
        valid = [m for m in item if m[-1] == '10']  # no longer shadows builtin `list`
        if valid:
            # BUG FIX: the original summed only the valid matches but divided
            # by len(item), so an excluded match like '50/50' deflated the
            # average (and inflated dog_count).
            numerator.append(sum(float(m[0]) for m in valid) / len(valid))
            dog_count.append(len(valid))
        else:
            numerator.append(float('nan'))
            dog_count.append(1)
df['rating'] = numerator  # no need to keep a denominator column: it is always 10 now
df['dog_count'] = dog_count
df['rating'].value_counts()
# All are below 14 except the joke ratings of 420 and 1776, so success!
df.drop(['rating_numerator', 'rating_denominator'], axis=1, inplace=True)
# no longer needed since 'rating' has the info
# Sanity-check the new rating columns.
df.info()
df['dog_count'].value_counts()
# Tokenise each tweet once up front; the name extraction below indexes into
# this word list.
df['text_split'] = df['text'].str.split()
names = []
# Extract the dog's name from the account's stock phrasings. Requiring a
# capitalised following word keeps sentences like "This is a pupper" from
# yielding "a" as a name (the flaw in the archive's original 'name' column).

def _clean_name(token):
    """Strip trailing sentence punctuation from a candidate name token."""
    return token.strip('.').strip(',')

def extract_names(row):
    """Append the name found in row['text'] (or 'Nameless') to `names`.

    Expects row['text_split'] to be row['text'] split on whitespace.
    """
    text = row['text']
    words = row['text_split']
    # stock phrase prefix -> index of the word that holds the name
    patterns = (('This is ', 2), ('Meet ', 1),
                ('Say hello to ', 3), ('Here we have ', 3))
    for prefix, idx in patterns:
        if text.startswith(prefix) and re.match(r'[A-Z].*', words[idx]):
            names.append(_clean_name(words[idx]))
            return
    # '... named Charlie' anywhere in the tweet
    if 'named' in text:
        try:
            candidate = words[words.index('named') + 1]
        except (ValueError, IndexError):
            # ROBUSTNESS FIX: the original crashed when 'named' appeared only
            # inside another word (e.g. 'renamed') or as the final token.
            candidate = ''
        if re.match(r'[A-Z].*', candidate):
            names.append(candidate)
            return
    names.append('Nameless')
# apply() is used purely for its side effect: extract_names fills the
# module-level `names` list, one entry per row.
df.apply(extract_names, axis=1)
len(names)
df['names'] = names
# Compare the re-extracted names against the archive's original column.
df['names'].value_counts()
df['name'].value_counts()
# So mine doesn't have 'a', 'the', and 'an' as common names
# Mine has more nameless but I think that's appropriate.
# Part-of-speech-tag each tokenised tweet, keep only the personal pronouns
# (Penn Treebank tag 'PRP'), and lower-case them for the gender heuristic.
df['tagged'] = df['text_split'].apply(pos_tag)  # was wrapped in a pointless named lambda (E731)
# The original built df['pronouns'] in two passes (filter, then lower-case);
# one pass produces the identical final column.
df['pronouns'] = df['tagged'].apply(
    lambda tagged: [word.lower() for word, pos in tagged if pos == 'PRP'])
df['pronouns'].head(10)
pronouns = df['pronouns']
pronouns.to_csv('pronouns.csv')
# Infer a gender per tweet from the pronouns used, falling back to the words
# 'girl'/'boy' in the text. Results accumulate in the `gender` list.
gender = []
male = ['he', 'him', 'his', "he's", 'himself']
female = ['she', 'her', 'hers', 'herself', "she's"]

def genderer(row):
    """Append 'Female', 'Male', or 'Neutral' for *row* to the `gender` list.

    Expects row['pronouns'] to be a list of lower-cased pronouns.
    """
    # BUG FIX: the original did row['text'] = row['text'].islower(), replacing
    # the text with a boolean, so the 'girl'/'boy' fallbacks below could never
    # match ('girl' is not in "False"/"True").
    text = row['text'].lower()
    # (the redundant len(...) > 0 guards are dropped: any([]) is already False)
    if any(p in female for p in row['pronouns']):
        gender.append('Female')
    elif any(p in male for p in row['pronouns']):
        gender.append('Male')
    elif 'girl' in text:
        gender.append('Female')
    elif 'boy' in text:
        gender.append('Male')
    else:
        gender.append('Neutral')
# apply() is used purely for its side effect: genderer fills the module-level
# `gender` list, one entry per row, which is then attached as a column.
df.apply(genderer, axis=1)
df['gender'] = gender
df['gender'].value_counts()
# Drop the intermediate NLP columns, normalise the string sentinels to proper
# nulls, and write out the final cleaned dataset.
df.drop(['text_split', 'tagged', 'pronouns'], axis=1, inplace=True)
df.head(20)
_SENTINELS = (('gender', 'Neutral'), ('names', 'Nameless'),
              ('breed', 'Unidentifiable'), ('dog_type', 'None'))
for column, placeholder in _SENTINELS:
    df.loc[df[column] == placeholder, column] = None
df.loc[df['rating'] == 0.0, 'rating'] = np.nan
df.loc[df['confidence'] == 0.0, 'confidence'] = np.nan
df.info()
df.to_csv('twitter_archive_master.csv', encoding='utf-8')  # saved