In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import requests
import tweepy
import json
import re
from nltk import pos_tag

Gather

In [2]:
archive = pd.read_csv('twitter-archive-enhanced.csv')
archive.set_index('tweet_id', inplace = True)
archive.head(2)
Out[2]:
in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
tweet_id
892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
In [3]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

with open(url.split('/')[-1], mode='wb') as file:
    file.write(response.content)
    
images = pd.read_csv('image-predictions.tsv', sep='\t')
images.head(2)
Out[3]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
In [4]:
consumer_key = #######
consumer_secret = #######
access_token = #######
access_token_secret = #######

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(), wait_on_rate_limit=True)
In [5]:
df_list = []
e_list = []
tweet_id = images['tweet_id']

for id in tweet_id:
    try:
        page = api.get_status(id)
        favorites = page['favorite_count']
        retweet_count = page['retweet_count']
        day_time = pd.to_datetime(page['created_at'])
        df_list.append({'tweet_id': int(id),
                        'favorites': int(favorites),
                        'retweet_count': int(retweet_count)})
    
    except Exception as e:
        e_list.append(id)
In [6]:
e_list
# I ran the df_list code a few times and each time I had the same few dud links but also a few different working links. 
# The working links were different each time so running the code again made the most sense. 
Out[6]:
[699801817392291840,
 802247111496568832,
 831939777352105988,
 861769973181624320,
 888202515573088257]
In [7]:
ee_list = []
for e in e_list:
    try:
        page = api.get_status(e)
        favorites = page['favorite_count']
        retweet_count = page['retweet_count']
        day_time = pd.to_datetime(page['created_at'])
        df_list.append({'tweet_id': int(e),
                        'favorites': int(favorites),
                        'retweet_count': int(retweet_count)})
        
    except Exception:
        ee_list.append(id)
In [8]:
jsondf = pd.DataFrame(df_list, columns = ['tweet_id', 'favorites', 'retweet_count'])
jsondf.to_csv('tweet_json.txt', encoding = 'utf-8')
In [9]:
jsondf = pd.read_csv('tweet_json.txt', encoding = 'utf-8')
jsondf.set_index('tweet_id', inplace = True)
jsondf.tail(4)
Out[9]:
Unnamed: 0 favorites retweet_count
tweet_id
892177421306343426 2068 33766 6475
892420643555336193 2069 39455 8836
699801817392291840 2070 3326 1083
831939777352105988 2071 26268 6983
In [10]:
images.set_index('tweet_id', inplace = True)
df = pd.merge(left=archive, right=images, left_index=True, right_index=True, how='left')
df = pd.merge(left=df, right=jsondf, left_index=True, right_index=True, how='left')
df.to_csv('dfcopy.csv', encoding = 'utf-8') # this is how I make a copy, back it up to a file
In [11]:
df = pd.read_csv('dfcopy.csv') # then start here whenever I load the notebook while woring on it

Assess

In [12]:
df
Out[12]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls ... p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog Unnamed: 0 favorites retweet_count
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... ... False bagel 0.085851 False banana 0.076110 False 2069.0 39455.0 8836.0
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... ... True Pekinese 0.090647 True papillon 0.068957 True 2068.0 33766.0 6475.0
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... ... True malamute 0.078253 True kelpie 0.031379 True 2067.0 25424.0 4295.0
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... ... False Labrador_retriever 0.168086 True spatula 0.040836 False 2066.0 42828.0 8915.0
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... ... True English_springer 0.225770 True German_short-haired_pointer 0.175219 True 2065.0 40975.0 9708.0
5 891087950875897856 NaN NaN 2017-07-29 00:08:17 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... NaN NaN NaN https://twitter.com/dog_rates/status/891087950... ... True Irish_terrier 0.116317 True Indian_elephant 0.076902 False 2064.0 20534.0 3237.0
6 890971913173991426 NaN NaN 2017-07-28 16:27:12 +0000 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... NaN NaN NaN https://gofundme.com/ydvmve-surgery-for-jax,ht... ... True Border_collie 0.199287 True ice_lolly 0.193548 False 2063.0 12047.0 2140.0
7 890729181411237888 NaN NaN 2017-07-28 00:22:40 +0000 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... NaN NaN NaN https://twitter.com/dog_rates/status/890729181... ... True Eskimo_dog 0.178406 True Pembroke 0.076507 True 2062.0 66574.0 19527.0
8 890609185150312448 NaN NaN 2017-07-27 16:25:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... NaN NaN NaN https://twitter.com/dog_rates/status/890609185... ... True Irish_setter 0.193054 True Chesapeake_Bay_retriever 0.118184 True 2061.0 28168.0 4396.0
9 890240255349198849 NaN NaN 2017-07-26 15:59:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... NaN NaN NaN https://twitter.com/dog_rates/status/890240255... ... True Cardigan 0.451038 True Chihuahua 0.029248 True 2060.0 32444.0 7664.0
10 890006608113172480 NaN NaN 2017-07-26 00:31:25 +0000 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... NaN NaN NaN https://twitter.com/dog_rates/status/890006608... ... True Pomeranian 0.013884 True chow 0.008167 True 2059.0 31104.0 7564.0
11 889880896479866881 NaN NaN 2017-07-25 16:11:53 +0000 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... NaN NaN NaN https://twitter.com/dog_rates/status/889880896... ... True Labrador_retriever 0.151317 True muzzle 0.082981 False 2058.0 28194.0 5107.0
12 889665388333682689 NaN NaN 2017-07-25 01:55:32 +0000 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... NaN NaN NaN https://twitter.com/dog_rates/status/889665388... ... True Cardigan 0.027356 True basenji 0.004633 True 2057.0 38717.0 8484.0
13 889638837579907072 NaN NaN 2017-07-25 00:10:02 +0000 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... NaN NaN NaN https://twitter.com/dog_rates/status/889638837... ... True boxer 0.002129 True Staffordshire_bullterrier 0.001498 True 2056.0 27606.0 4691.0
14 889531135344209921 NaN NaN 2017-07-24 17:02:04 +0000 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... NaN NaN NaN https://twitter.com/dog_rates/status/889531135... ... True Labrador_retriever 0.013834 True redbone 0.007958 True 2055.0 15319.0 2302.0
15 889278841981685760 NaN NaN 2017-07-24 00:19:32 +0000 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... NaN NaN NaN https://twitter.com/dog_rates/status/889278841... ... True borzoi 0.194742 True Saluki 0.027351 True 2054.0 25698.0 5626.0
16 888917238123831296 NaN NaN 2017-07-23 00:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... NaN NaN NaN https://twitter.com/dog_rates/status/888917238... ... True Tibetan_mastiff 0.120184 True Labrador_retriever 0.105506 True 2053.0 29533.0 4670.0
17 888804989199671297 NaN NaN 2017-07-22 16:56:37 +0000 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... NaN NaN NaN https://twitter.com/dog_rates/status/888804989... ... True Labrador_retriever 0.184172 True English_setter 0.073482 True 2052.0 25997.0 4524.0
18 888554962724278272 NaN NaN 2017-07-22 00:23:06 +0000 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... NaN NaN NaN https://twitter.com/dog_rates/status/888554962... ... True Eskimo_dog 0.166511 True malamute 0.111411 True 2051.0 20250.0 3710.0
19 888202515573088257 NaN NaN 2017-07-21 01:02:36 +0000 <a href="http://twitter.com/download/iphone" r... RT @dog_rates: This is Canela. She attempted s... 8.874740e+17 4.196984e+09 2017-07-19 00:47:34 +0000 https://twitter.com/dog_rates/status/887473957... ... True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True NaN NaN NaN
20 888078434458587136 NaN NaN 2017-07-20 16:49:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... NaN NaN NaN https://twitter.com/dog_rates/status/888078434... ... True pug 0.000932 True bull_mastiff 0.000903 True 2050.0 22129.0 3625.0
21 887705289381826560 NaN NaN 2017-07-19 16:06:48 +0000 <a href="http://twitter.com/download/iphone" r... This is Jeffrey. He has a monopoly on the pool... NaN NaN NaN https://twitter.com/dog_rates/status/887705289... ... True redbone 0.087582 True Weimaraner 0.026236 True 2049.0 30661.0 5572.0
22 887517139158093824 NaN NaN 2017-07-19 03:39:09 +0000 <a href="http://twitter.com/download/iphone" r... I've yet to rate a Venezuelan Hover Wiener. Th... NaN NaN NaN https://twitter.com/dog_rates/status/887517139... ... False tow_truck 0.029175 False shopping_cart 0.026321 False 2048.0 46897.0 12031.0
23 887473957103951883 NaN NaN 2017-07-19 00:47:34 +0000 <a href="http://twitter.com/download/iphone" r... This is Canela. She attempted some fancy porch... NaN NaN NaN https://twitter.com/dog_rates/status/887473957... ... True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True 2047.0 69956.0 18756.0
24 887343217045368832 NaN NaN 2017-07-18 16:08:03 +0000 <a href="http://twitter.com/download/iphone" r... You may not have known you needed to see this ... NaN NaN NaN https://twitter.com/dog_rates/status/887343217... ... True sea_lion 0.275645 False Weimaraner 0.134203 True 2046.0 34207.0 10692.0
25 887101392804085760 NaN NaN 2017-07-18 00:07:08 +0000 <a href="http://twitter.com/download/iphone" r... This... is a Jubilant Antarctic House Bear. We... NaN NaN NaN https://twitter.com/dog_rates/status/887101392... ... True Eskimo_dog 0.035029 True Staffordshire_bullterrier 0.029705 True 2045.0 31033.0 6135.0
26 886983233522544640 NaN NaN 2017-07-17 16:17:36 +0000 <a href="http://twitter.com/download/iphone" r... This is Maya. She's very shy. Rarely leaves he... NaN NaN NaN https://twitter.com/dog_rates/status/886983233... ... True toy_terrier 0.143528 True can_opener 0.032253 False 2044.0 35743.0 8019.0
27 886736880519319552 NaN NaN 2017-07-16 23:58:41 +0000 <a href="http://twitter.com/download/iphone" r... This is Mingus. He's a wonderful father to his... NaN NaN NaN https://www.gofundme.com/mingusneedsus,https:/... ... True Great_Pyrenees 0.186136 True Dandie_Dinmont 0.086346 True 2043.0 12276.0 3412.0
28 886680336477933568 NaN NaN 2017-07-16 20:14:00 +0000 <a href="http://twitter.com/download/iphone" r... This is Derek. He's late for a dog meeting. 13... NaN NaN NaN https://twitter.com/dog_rates/status/886680336... ... False sports_car 0.139952 False car_wheel 0.044173 False 2042.0 22780.0 4591.0
29 886366144734445568 NaN NaN 2017-07-15 23:25:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Roscoe. Another pupper fallen victim t... NaN NaN NaN https://twitter.com/dog_rates/status/886366144... ... True Chihuahua 0.000361 True Boston_bull 0.000076 True 2041.0 21464.0 3289.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2326 666411507551481857 NaN NaN 2015-11-17 00:24:19 +0000 <a href="http://twitter.com/download/iphone" r... This is quite the dog. Gets really excited whe... NaN NaN NaN https://twitter.com/dog_rates/status/666411507... ... False barracouta 0.271485 False gar 0.189945 False 29.0 457.0 337.0
2327 666407126856765440 NaN NaN 2015-11-17 00:06:54 +0000 <a href="http://twitter.com/download/iphone" r... This is a southern Vesuvius bumblegruff. Can d... NaN NaN NaN https://twitter.com/dog_rates/status/666407126... ... True bloodhound 0.244220 True flat-coated_retriever 0.173810 True 28.0 113.0 43.0
2328 666396247373291520 NaN NaN 2015-11-16 23:23:41 +0000 <a href="http://twitter.com/download/iphone" r... Oh goodness. A super rare northeast Qdoba kang... NaN NaN NaN https://twitter.com/dog_rates/status/666396247... ... True toy_terrier 0.009397 True papillon 0.004577 True 27.0 171.0 91.0
2329 666373753744588802 NaN NaN 2015-11-16 21:54:18 +0000 <a href="http://twitter.com/download/iphone" r... Those are sunglasses and a jean jacket. 11/10 ... NaN NaN NaN https://twitter.com/dog_rates/status/666373753... ... True Afghan_hound 0.259551 True briard 0.206803 True 26.0 194.0 96.0
2330 666362758909284353 NaN NaN 2015-11-16 21:10:36 +0000 <a href="http://twitter.com/download/iphone" r... Unique dog here. Very small. Lives in containe... NaN NaN NaN https://twitter.com/dog_rates/status/666362758... ... False skunk 0.002402 False hamster 0.000461 False 25.0 800.0 589.0
2331 666353288456101888 NaN NaN 2015-11-16 20:32:58 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a mixed Asiago from the Galápagos... NaN NaN NaN https://twitter.com/dog_rates/status/666353288... ... True Siberian_husky 0.147655 True Eskimo_dog 0.093412 True 24.0 228.0 76.0
2332 666345417576210432 NaN NaN 2015-11-16 20:01:42 +0000 <a href="http://twitter.com/download/iphone" r... Look at this jokester thinking seat belt laws ... NaN NaN NaN https://twitter.com/dog_rates/status/666345417... ... True Chesapeake_Bay_retriever 0.054787 True Labrador_retriever 0.014241 True 23.0 307.0 146.0
2333 666337882303524864 NaN NaN 2015-11-16 19:31:45 +0000 <a href="http://twitter.com/download/iphone" r... This is an extremely rare horned Parthenon. No... NaN NaN NaN https://twitter.com/dog_rates/status/666337882... ... False Newfoundland 0.278407 True groenendael 0.102643 True 22.0 203.0 96.0
2334 666293911632134144 NaN NaN 2015-11-16 16:37:02 +0000 <a href="http://twitter.com/download/iphone" r... This is a funny dog. Weird toes. Won't come do... NaN NaN NaN https://twitter.com/dog_rates/status/666293911... ... False otter 0.015250 False great_grey_owl 0.013207 False 21.0 519.0 365.0
2335 666287406224695296 NaN NaN 2015-11-16 16:11:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an Albanian 3 1/2 legged Episcopalian... NaN NaN NaN https://twitter.com/dog_rates/status/666287406... ... True toy_poodle 0.063064 True miniature_poodle 0.025581 True 20.0 152.0 71.0
2336 666273097616637952 NaN NaN 2015-11-16 15:14:19 +0000 <a href="http://twitter.com/download/iphone" r... Can take selfies 11/10 https://t.co/ws2AMaNwPW NaN NaN NaN https://twitter.com/dog_rates/status/666273097... ... True toy_terrier 0.111884 True basenji 0.111152 True 19.0 183.0 81.0
2337 666268910803644416 NaN NaN 2015-11-16 14:57:41 +0000 <a href="http://twitter.com/download/iphone" r... Very concerned about fellow dog trapped in com... NaN NaN NaN https://twitter.com/dog_rates/status/666268910... ... False desk 0.085547 False bookcase 0.079480 False 18.0 108.0 37.0
2338 666104133288665088 NaN NaN 2015-11-16 04:02:55 +0000 <a href="http://twitter.com/download/iphone" r... Not familiar with this breed. No tail (weird).... NaN NaN NaN https://twitter.com/dog_rates/status/666104133... ... False cock 0.033919 False partridge 0.000052 False 17.0 14689.0 6822.0
2339 666102155909144576 NaN NaN 2015-11-16 03:55:04 +0000 <a href="http://twitter.com/download/iphone" r... Oh my. Here you are seeing an Adobe Setter giv... NaN NaN NaN https://twitter.com/dog_rates/status/666102155... ... True Newfoundland 0.149842 True borzoi 0.133649 True 16.0 81.0 15.0
2340 666099513787052032 NaN NaN 2015-11-16 03:44:34 +0000 <a href="http://twitter.com/download/iphone" r... Can stand on stump for what seems like a while... NaN NaN NaN https://twitter.com/dog_rates/status/666099513... ... True Shih-Tzu 0.166192 True Dandie_Dinmont 0.089688 True 15.0 161.0 73.0
2341 666094000022159362 NaN NaN 2015-11-16 03:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This appears to be a Mongolian Presbyterian mi... NaN NaN NaN https://twitter.com/dog_rates/status/666094000... ... True German_shepherd 0.078260 True malinois 0.075628 True 14.0 167.0 78.0
2342 666082916733198337 NaN NaN 2015-11-16 02:38:37 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a well-established sunblockerspan... NaN NaN NaN https://twitter.com/dog_rates/status/666082916... ... True bull_mastiff 0.404722 True French_bulldog 0.048960 True 13.0 121.0 47.0
2343 666073100786774016 NaN NaN 2015-11-16 01:59:36 +0000 <a href="http://twitter.com/download/iphone" r... Let's hope this flight isn't Malaysian (lol). ... NaN NaN NaN https://twitter.com/dog_rates/status/666073100... ... True English_foxhound 0.175382 True Ibizan_hound 0.097471 True 12.0 334.0 172.0
2344 666071193221509120 NaN NaN 2015-11-16 01:52:02 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a northern speckled Rhododendron.... NaN NaN NaN https://twitter.com/dog_rates/status/666071193... ... True Yorkshire_terrier 0.174201 True Pekinese 0.109454 True 11.0 154.0 66.0
2345 666063827256086533 NaN NaN 2015-11-16 01:22:45 +0000 <a href="http://twitter.com/download/iphone" r... This is the happiest dog you will ever see. Ve... NaN NaN NaN https://twitter.com/dog_rates/status/666063827... ... True Tibetan_mastiff 0.093718 True Labrador_retriever 0.072427 True 10.0 494.0 229.0
2346 666058600524156928 NaN NaN 2015-11-16 01:01:59 +0000 <a href="http://twitter.com/download/iphone" r... Here is the Rand Paul of retrievers folks! He'... NaN NaN NaN https://twitter.com/dog_rates/status/666058600... ... True komondor 0.192305 True soft-coated_wheaten_terrier 0.082086 True 9.0 118.0 61.0
2347 666057090499244032 NaN NaN 2015-11-16 00:55:59 +0000 <a href="http://twitter.com/download/iphone" r... My oh my. This is a rare blond Canadian terrie... NaN NaN NaN https://twitter.com/dog_rates/status/666057090... ... False shopping_basket 0.014594 False golden_retriever 0.007959 True 8.0 304.0 146.0
2348 666055525042405380 NaN NaN 2015-11-16 00:49:46 +0000 <a href="http://twitter.com/download/iphone" r... Here is a Siberian heavily armored polar bear ... NaN NaN NaN https://twitter.com/dog_rates/status/666055525... ... True Tibetan_mastiff 0.058279 True fur_coat 0.054449 False 7.0 449.0 261.0
2349 666051853826850816 NaN NaN 2015-11-16 00:35:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an odd dog. Hard on the outside but lo... NaN NaN NaN https://twitter.com/dog_rates/status/666051853... ... False mud_turtle 0.045885 False terrapin 0.017885 False 6.0 1248.0 876.0
2350 666050758794694657 NaN NaN 2015-11-16 00:30:50 +0000 <a href="http://twitter.com/download/iphone" r... This is a truly beautiful English Wilson Staff... NaN NaN NaN https://twitter.com/dog_rates/status/666050758... ... True English_springer 0.263788 True Greater_Swiss_Mountain_dog 0.016199 True 5.0 136.0 60.0
2351 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... ... True Rottweiler 0.243682 True Doberman 0.154629 True 4.0 111.0 41.0
2352 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... ... True redbone 0.360687 True miniature_pinscher 0.222752 True 3.0 310.0 146.0
2353 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... ... True malinois 0.138584 True bloodhound 0.116197 True 2.0 128.0 47.0
2354 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... ... True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True 1.0 132.0 48.0
2355 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... ... True collie 0.156665 True Shetland_sheepdog 0.061428 True 0.0 2523.0 530.0

2356 rows × 31 columns

In [13]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 31 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
jpg_url                       2075 non-null object
img_num                       2075 non-null float64
p1                            2075 non-null object
p1_conf                       2075 non-null float64
p1_dog                        2075 non-null object
p2                            2075 non-null object
p2_conf                       2075 non-null float64
p2_dog                        2075 non-null object
p3                            2075 non-null object
p3_conf                       2075 non-null float64
p3_dog                        2075 non-null object
Unnamed: 0                    2072 non-null float64
favorites                     2072 non-null float64
retweet_count                 2072 non-null float64
dtypes: float64(11), int64(3), object(17)
memory usage: 570.7+ KB
In [14]:
df.describe()
Out[14]:
tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id rating_numerator rating_denominator img_num p1_conf p2_conf p3_conf Unnamed: 0 favorites retweet_count
count 2.356000e+03 7.800000e+01 7.800000e+01 1.810000e+02 1.810000e+02 2356.000000 2356.000000 2075.000000 2075.000000 2.075000e+03 2.075000e+03 2072.000000 2072.000000 2072.000000
mean 7.427716e+17 7.455079e+17 2.014171e+16 7.720400e+17 1.241698e+16 13.126486 10.455433 1.203855 0.594548 1.345886e-01 6.032417e-02 1035.500000 8584.507722 2955.537645
std 6.856705e+16 7.582492e+16 1.252797e+17 6.236928e+16 9.599254e+16 45.876648 6.745237 0.561875 0.271174 1.006657e-01 5.090593e-02 598.279199 12280.428591 5025.796121
min 6.660209e+17 6.658147e+17 1.185634e+07 6.661041e+17 7.832140e+05 0.000000 0.000000 1.000000 0.044333 1.011300e-08 1.740170e-10 0.000000 0.000000 15.000000
25% 6.783989e+17 6.757419e+17 3.086374e+08 7.186315e+17 4.196984e+09 10.000000 10.000000 1.000000 0.364412 5.388625e-02 1.622240e-02 517.750000 1666.250000 629.500000
50% 7.196279e+17 7.038708e+17 4.196984e+09 7.804657e+17 4.196984e+09 11.000000 10.000000 1.000000 0.588230 1.181810e-01 4.944380e-02 1035.500000 3846.000000 1398.000000
75% 7.993373e+17 8.257804e+17 4.196984e+09 8.203146e+17 4.196984e+09 12.000000 10.000000 1.000000 0.843855 1.955655e-01 9.180755e-02 1553.250000 10901.000000 3413.000000
max 8.924206e+17 8.862664e+17 8.405479e+17 8.874740e+17 7.874618e+17 1776.000000 170.000000 4.000000 1.000000 4.880140e-01 2.734190e-01 2071.000000 132214.000000 79032.000000
In [15]:
df['rating_numerator'].value_counts()
df['rating_denominator'].value_counts()
Out[15]:
10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64
In [16]:
df['name'].value_counts()
Out[16]:
None         745
a             55
Charlie       12
Cooper        11
Lucy          11
Oliver        11
Lola          10
Penny         10
Tucker        10
Winston        9
Bo             9
Sadie          8
the            8
Bailey         7
an             7
Toby           7
Buddy          7
Daisy          7
Bella          6
Leo            6
Dave           6
Milo           6
Jack           6
Scout          6
Stanley        6
Jax            6
Koda           6
Rusty          6
Oscar          6
Sammy          5
            ... 
Kendall        1
Bodie          1
Adele          1
Reagan         1
Napolean       1
Grey           1
Freddery       1
Devón          1
Brockly        1
Kody           1
Hubertson      1
Kial           1
Jett           1
Shawwn         1
Tuco           1
Chase          1
Bradlay        1
Alexander      1
Burt           1
Rumpole        1
Stark          1
Tedders        1
Katie          1
Beckham        1
Julius         1
Boston         1
Socks          1
Eriq           1
Buddah         1
Mookie         1
Name: name, Length: 957, dtype: int64

Quality

(accuracy, validity, consistency, completeness)

  • There were 2075 rows in the images dataframe compared to 2356 rows in the archive dataframe. This due to non-pictures and retweets included.
  • Several columns have empty values, such as in_reply_to_status, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp.
  • The name column has a lot of non-name values. The most popular name is 'a' which is not a name at all.
  • The numerator and denominator columns have wacky values.
  • The timestamp type is an object, not a timestamp.
  • The text column could be parsed to include gender.
  • The text column could also be parsed to include hashtags.
  • In several columns null objects are non-null.

Tidiness

(structural issues)

  • The columns predicting the dog breed could be condensed.
  • We don't need the 'Unnamed: 0' column
  • The whole dataframe could be split into a 'dog' table and a 'tweet' table, but I'm not sure if I want to do that.
  • The dog 'stages' have values as columns, instead of one column filled with the values.

Clean

Delete Superfluous Column

In [17]:
del df['Unnamed: 0'] #code
list(df) #test
Out[17]:
['tweet_id',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'timestamp',
 'source',
 'text',
 'retweeted_status_id',
 'retweeted_status_user_id',
 'retweeted_status_timestamp',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo',
 'jpg_url',
 'img_num',
 'p1',
 'p1_conf',
 'p1_dog',
 'p2',
 'p2_conf',
 'p2_dog',
 'p3',
 'p3_conf',
 'p3_dog',
 'favorites',
 'retweet_count']

Create a Hashtag Column

In [18]:
df['hashtag'] = df['text'].str.extract(r"#(\w+)", expand=True)
df['hashtag'].value_counts()
# there will be a lot of empty values but I think the info is interesting.
Out[18]:
BarkWeek                 9
PrideMonth               3
WomensMarch              1
NoDaysOff                1
PrideMonthPuppo          1
GoodDogs                 1
dogsatpollingstations    1
WKCDogShow               1
Canada150                1
ScienceMarch             1
notallpuppers            1
BellLetsTalk             1
K9VeteransDay            1
FinalFur                 1
ImWithThor               1
BATP                     1
LoveTwitter              1
Name: hashtag, dtype: int64

Convert Timestamp to a Datetime

In [19]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 31 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null datetime64[ns]
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
jpg_url                       2075 non-null object
img_num                       2075 non-null float64
p1                            2075 non-null object
p1_conf                       2075 non-null float64
p1_dog                        2075 non-null object
p2                            2075 non-null object
p2_conf                       2075 non-null float64
p2_dog                        2075 non-null object
p3                            2075 non-null object
p3_conf                       2075 non-null float64
p3_dog                        2075 non-null object
favorites                     2072 non-null float64
retweet_count                 2072 non-null float64
hashtag                       27 non-null object
dtypes: datetime64[ns](1), float64(10), int64(3), object(17)
memory usage: 570.7+ KB

Remove Retweets and Tweets without Pictures

In [20]:
#before we proceed, let's get rid of the rows we won't use, the ones without pics and the retweets
df = df[pd.notnull(df['jpg_url'])] #without pics
df = df[pd.isnull(df['retweeted_status_id'])] #additional retweets
len(df['text'])
Out[20]:
1994
In [21]:
#and now to weed out a few more surpluse columns, these regarding the retweet status
del df['retweeted_status_id']
del df['retweeted_status_user_id']
del df['retweeted_status_timestamp']
list(df)
Out[21]:
['tweet_id',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'timestamp',
 'source',
 'text',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo',
 'jpg_url',
 'img_num',
 'p1',
 'p1_conf',
 'p1_dog',
 'p2',
 'p2_conf',
 'p2_dog',
 'p3',
 'p3_conf',
 'p3_dog',
 'favorites',
 'retweet_count',
 'hashtag']

Condense 'Dog Type' Columns

In [22]:
#lets condense the 'dog type' columns into one
dog_type = []

string_in = ['puppo', 'pupper', 'doggo', 'floof']
string_out = ['puppo', 'pupper', 'doggo', 'floofer']

for row in df['text']:
    row = row.lower()
    for word in string_in:
        if word in str(row):
            dog_type.append(string_out[string_in.index(word)])
            break
    else:
        dog_type.append('None')

df['dog_type'] = dog_type
df['dog_type'].value_counts()
Out[22]:
None       1625
pupper      237
doggo        69
floofer      34
puppo        29
Name: dog_type, dtype: int64
In [23]:
#now I can delete those redundant columns
df.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1, inplace=True)

Condense Dog Breed Analysis

In [24]:
# next let's condense the dog breed analysis
breed = []
confidence = []

def breed_confidence(row):
    if row['p1_dog'] == True:
        breed.append(row['p1'])
        confidence.append(row['p1_conf'])
    elif row['p2_dog'] == True:
        breed.append(row['p2'])
        confidence.append(row['p2_conf'])
    elif row['p3_dog'] == True:
        breed.append(row['p3'])
        confidence.append(row['p3_conf'])
    else:
        breed.append('Unidentifiable')
        confidence.append(0)
      
    # this isn't about the parsing AI so the rest of this data is deletable

df.apply(breed_confidence, axis=1)
df['breed'] = breed
df['confidence'] = confidence
df.head()
Out[24]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text expanded_urls rating_numerator rating_denominator name ... p2_dog p3 p3_conf p3_dog favorites retweet_count hashtag dog_type breed confidence
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... https://twitter.com/dog_rates/status/892420643... 13 10 Phineas ... False banana 0.076110 False 39455.0 8836.0 NaN None Unidentifiable 0.000000
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... https://twitter.com/dog_rates/status/892177421... 13 10 Tilly ... True papillon 0.068957 True 33766.0 6475.0 NaN None Chihuahua 0.323581
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... https://twitter.com/dog_rates/status/891815181... 12 10 Archie ... True kelpie 0.031379 True 25424.0 4295.0 NaN None Chihuahua 0.716012
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... https://twitter.com/dog_rates/status/891689557... 13 10 Darla ... True spatula 0.040836 False 42828.0 8915.0 NaN None Labrador_retriever 0.168086
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... https://twitter.com/dog_rates/status/891327558... 12 10 Franklin ... True German_short-haired_pointer 0.175219 True 40975.0 9708.0 BarkWeek None basset 0.555712

5 rows × 27 columns

In [25]:
#I know I could have used column numbers but I didn't want an accident if I accidentally ran this block twice
df.drop(['p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog',], axis=1, inplace=True)
df.head()
Out[25]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text expanded_urls rating_numerator rating_denominator name jpg_url img_num favorites retweet_count hashtag dog_type breed confidence
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... https://twitter.com/dog_rates/status/892420643... 13 10 Phineas https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1.0 39455.0 8836.0 NaN None Unidentifiable 0.000000
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... https://twitter.com/dog_rates/status/892177421... 13 10 Tilly https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1.0 33766.0 6475.0 NaN None Chihuahua 0.323581
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... https://twitter.com/dog_rates/status/891815181... 12 10 Archie https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1.0 25424.0 4295.0 NaN None Chihuahua 0.716012
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... https://twitter.com/dog_rates/status/891689557... 13 10 Darla https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1.0 42828.0 8915.0 NaN None Labrador_retriever 0.168086
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... https://twitter.com/dog_rates/status/891327558... 12 10 Franklin https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2.0 40975.0 9708.0 BarkWeek None basset 0.555712

Remove Redundant Columns

In [26]:
df['in_reply_to_status_id'].value_counts()
df['in_reply_to_user_id'].value_counts()
Out[26]:
4.196984e+09    23
Name: in_reply_to_user_id, dtype: int64
In [27]:
#the ['in_reply_to_user_id'] are all 4196983835, which is @dog_rates, so this info is not useful
df.drop(['in_reply_to_status_id', 'in_reply_to_user_id'], axis=1, inplace=True)
In [28]:
df['source'].value_counts() # not sure if data worth keeping, will keep for now
Out[28]:
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     1955
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       28
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

Parse Dog Rates and Dog Count

In [29]:
rates = []

extract_rates = lambda x: rates.append(re.findall(r'(\d+(\.\d+)|(\d+))\/(\d+0)', x, flags=0))

df['text'].apply(extract_rates)

numerator = []
dog_count = []

for item in rates:
    
    # for tweets with no rating, but a picture, so a dog
    if len(item) == 0:
        numerator.append('NaN')
        dog_count.append(1)
        
    # for tweetss with one rating and one dog
    elif len(item) == 1 and item[0][-1] == '10':
        numerator.append(float(item[0][0]))
        dog_count.append(1)
   
    # for group ratings
    elif len(item) == 1: 
        avg = float(item[0][0]) / (float(item[0][-1]) / 10) 
        numerator.append(avg)
        dog_count.append(float(item[0][-1]) / 10)
   
    # for tweets with more than one rating
    elif len(item) > 1: 
        total = 0
        list = []
        for i in range(len(item)):
            if item[i][-1] == '10': #one tweet has the phrase '50/50' so I'm coding to exclude it
                list.append(item[i])
        for rate in list:
            total = total + float(rate[0])
        avg = total / len(item)
        numerator.append(avg)
        dog_count.append(len(item))
   
    # in order to catch bugs
    else:
        numerator.append('Not parsed')
        dog_count.append('Not parsed') 
        
df['rating'] = numerator # not need to also add denominator since they are all 10!
df['dog_count'] = dog_count
df['rating'].value_counts()
Out[29]:
12.0                 453
10.0                 411
11.0                 399
13.0                 261
9.0                  152
8.0                   94
7.0                   52
14.0                  36
6.0                   32
5.0                   30
3.0                   19
4.0                   14
2.0                    9
8.5                    4
1.0                    4
7.5                    3
9.5                    3
6.5                    2
5.5                    2
10.5                   2
0.0                    2
1776.0                 1
9.75                   1
9.666666666666666      1
4.5                    1
NaN                    1
11.27                  1
13.5                   1
420.0                  1
11.5                   1
11.26                  1
Name: rating, dtype: int64
In [30]:
# All are below 14 except the joke ratings of 420 and 1776, so success!
df.drop([ 'rating_numerator', 'rating_denominator'], axis=1, inplace=True)
#no longer needed since 'rating' has the info
In [31]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1994 entries, 0 to 2355
Data columns (total 16 columns):
tweet_id         1994 non-null int64
timestamp        1994 non-null datetime64[ns]
source           1994 non-null object
text             1994 non-null object
expanded_urls    1994 non-null object
name             1994 non-null object
jpg_url          1994 non-null object
img_num          1994 non-null float64
favorites        1994 non-null float64
retweet_count    1994 non-null float64
hashtag          22 non-null object
dog_type         1994 non-null object
breed            1994 non-null object
confidence       1994 non-null float64
rating           1994 non-null object
dog_count        1994 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(1), object(9)
memory usage: 264.8+ KB
In [32]:
df['dog_count'].value_counts()
Out[32]:
1.0     1958
2.0       23
5.0        2
8.0        2
3.0        1
12.0       1
11.0       1
13.0       1
4.0        1
9.0        1
17.0       1
15.0       1
7.0        1
Name: dog_count, dtype: int64

Extract Names

In [33]:
df['text_split'] = df['text'].str.split()
In [34]:
names = []

# use string starts with method to clean this up

def extract_names(row):
    
    # 'This is Charlie'
    if row['text'].startswith('This is ') and re.match(r'[A-Z].*', row['text_split'][2]):
            names.append(row['text_split'][2].strip('.').strip(','))
    
    # 'Meet Charlie'
    elif row['text'].startswith('Meet ') and re.match(r'[A-Z].*', row['text_split'][1]):
            names.append(row['text_split'][1].strip('.').strip(','))
    
    # 'Say hello to Charlie'
    elif row['text'].startswith('Say hello to ') and re.match(r'[A-Z].*', row['text_split'][3]):
            names.append(row['text_split'][3].strip('.').strip(','))
    
    # 'Here we have Charlie'
    elif row['text'].startswith('Here we have ') and re.match(r'[A-Z].*', row['text_split'][3]):
            names.append(row['text_split'][3].strip('.').strip(','))
    
    # 'named Charlie'           
    elif 'named' in row['text'] and re.match(r'[A-Z].*', row['text_split'][(row['text_split'].index('named') + 1)]): 
            names.append(row['text_split'][(row['text_split'].index('named') + 1)])
            
    else:
        names.append('Nameless')
        
        
df.apply(extract_names, axis=1)

len(names)
df['names'] = names
df['names'].value_counts()
Out[34]:
Nameless     625
Charlie       11
Cooper        10
Lucy          10
Oliver        10
Penny          9
Tucker         9
Sadie          8
Winston        8
Lola           7
Daisy          7
Koda           6
Jax            6
Bella          6
Stanley        6
Toby           6
Bo             6
Chester        5
Leo            5
Louis          5
Milo           5
Scout          5
Bailey         5
Buddy          5
Oscar          5
Rusty          5
Cassie         4
Phil           4
Chip           4
Jerry          4
            ... 
Nimbus         1
Franq          1
Malikai        1
Darby          1
Filup          1
Timber         1
Dutch          1
Fillup         1
Molly          1
Meyer          1
Stephanus      1
Leonard        1
Hermione       1
Sprinkles      1
Rodney         1
Simba          1
Ronduh         1
Crimson        1
Remus          1
Jaycob         1
Tanner         1
Peanut         1
Kaia           1
Danny          1
Scott          1
River          1
Griswold       1
Iroh           1
Sephie         1
Canela         1
Name: names, Length: 938, dtype: int64
In [35]:
df['name'].value_counts()
# So mine doesn't have 'a', 'the', and 'an' as common names
# Mine has more nameless but I think that's appropriate. 
Out[35]:
None          546
a              55
Charlie        11
Lucy           10
Oliver         10
Cooper         10
Tucker          9
Penny           9
Sadie           8
Winston         8
Lola            7
Daisy           7
the             7
Toby            7
Jax             6
Stanley         6
Koda            6
Bella           6
an              6
Bo              6
Bailey          5
Louis           5
Chester         5
Leo             5
Dave            5
Milo            5
Oscar           5
Buddy           5
Scout           5
Rusty           5
             ... 
Zara            1
such            1
Huck            1
Ralphson        1
Rufio           1
Nollie          1
Sierra          1
River           1
Arnie           1
Griswold        1
Iroh            1
Sephie          1
Fillup          1
Kara            1
Timber          1
Chuq            1
Filup           1
Chesterson      1
Nigel           1
Charleson       1
Wishes          1
Birf            1
Jordy           1
Divine          1
Amélie          1
Brooks          1
DonDon          1
Glacier         1
Mosby           1
Canela          1
Name: name, Length: 936, dtype: int64

Parse Dog Gender

In [36]:
tagger = lambda x: pos_tag(x)
df['tagged'] = df['text_split'].apply(tagger)
In [37]:
pronouner = lambda x: [word for word, pos in x if pos == 'PRP']
df['pronouns'] = df['tagged'].apply(pronouner)
lowerer = lambda x: [a.lower() for a in x]
df['pronouns'] = df['pronouns'].apply(lowerer)
df['pronouns'].head(10)
Out[37]:
0                    []
1                    []
2             [he, you]
3             [she, us]
4    [he, you, him, he]
5                  [we]
6              [he, he]
7      [you, they, you]
8                 [she]
9                 [she]
Name: pronouns, dtype: object
In [38]:
pronouns = df['pronouns']
pronouns.to_csv('pronouns.csv')
In [39]:
gender = []

male = ['he', 'him', 'his', "he's", 'himself']
female = ['she', 'her', 'hers', 'herself', "she's"]

def genderer(row):
    row['text'] = row['text'].islower()
    if len(row['pronouns']) > 0 and any(i in female for i in row['pronouns']):
        gender.append('Female')
    elif len(row['pronouns']) > 0 and any(i in male for i in row['pronouns']):
        gender.append('Male') 
    elif 'girl' in str(row['text']):
        gender.append('Female')   
    elif 'boy' in str(row['text']):
        gender.append('Male')
    else:
        gender.append('Neutral')
        
df.apply(genderer, axis=1)
df['gender'] = gender
df['gender'].value_counts()
Out[39]:
Neutral    1287
Male        529
Female      178
Name: gender, dtype: int64
In [40]:
df.drop(['text_split', 'tagged', 'pronouns'], axis=1, inplace=True)
df.head(20)
Out[40]:
tweet_id timestamp source text expanded_urls name jpg_url img_num favorites retweet_count hashtag dog_type breed confidence rating dog_count names gender
0 892420643555336193 2017-08-01 16:23:56 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... https://twitter.com/dog_rates/status/892420643... Phineas https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1.0 39455.0 8836.0 NaN None Unidentifiable 0.000000 13 1.0 Phineas Neutral
1 892177421306343426 2017-08-01 00:17:27 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... https://twitter.com/dog_rates/status/892177421... Tilly https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1.0 33766.0 6475.0 NaN None Chihuahua 0.323581 13 1.0 Tilly Neutral
2 891815181378084864 2017-07-31 00:18:03 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... https://twitter.com/dog_rates/status/891815181... Archie https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1.0 25424.0 4295.0 NaN None Chihuahua 0.716012 12 1.0 Archie Male
3 891689557279858688 2017-07-30 15:58:51 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... https://twitter.com/dog_rates/status/891689557... Darla https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1.0 42828.0 8915.0 NaN None Labrador_retriever 0.168086 13 1.0 Darla Female
4 891327558926688256 2017-07-29 16:00:24 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... https://twitter.com/dog_rates/status/891327558... Franklin https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2.0 40975.0 9708.0 BarkWeek None basset 0.555712 12 1.0 Franklin Male
5 891087950875897856 2017-07-29 00:08:17 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... https://twitter.com/dog_rates/status/891087950... None https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg 1.0 20534.0 3237.0 BarkWeek None Chesapeake_Bay_retriever 0.425595 13 1.0 Nameless Neutral
6 890971913173991426 2017-07-28 16:27:12 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... https://gofundme.com/ydvmve-surgery-for-jax,ht... Jax https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg 1.0 12047.0 2140.0 NaN None Appenzeller 0.341703 13 1.0 Jax Male
7 890729181411237888 2017-07-28 00:22:40 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... https://twitter.com/dog_rates/status/890729181... None https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg 2.0 66574.0 19527.0 NaN None Pomeranian 0.566142 13 1.0 Nameless Neutral
8 890609185150312448 2017-07-27 16:25:51 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... https://twitter.com/dog_rates/status/890609185... Zoey https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg 1.0 28168.0 4396.0 BarkWeek None Irish_terrier 0.487574 13 1.0 Zoey Female
9 890240255349198849 2017-07-26 15:59:51 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... https://twitter.com/dog_rates/status/890240255... Cassie https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg 1.0 32444.0 7664.0 NaN doggo Pembroke 0.511319 14 1.0 Cassie Female
10 890006608113172480 2017-07-26 00:31:25 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... https://twitter.com/dog_rates/status/890006608... Koda https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg 1.0 31104.0 7564.0 BarkWeek None Samoyed 0.957979 13 1.0 Koda Male
11 889880896479866881 2017-07-25 16:11:53 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... https://twitter.com/dog_rates/status/889880896... Bruno https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg 1.0 28194.0 5107.0 NaN None French_bulldog 0.377417 13 1.0 Bruno Male
12 889665388333682689 2017-07-25 01:55:32 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... https://twitter.com/dog_rates/status/889665388... None https://pbs.twimg.com/media/DFi579UWsAAatzw.jpg 1.0 38717.0 8484.0 NaN puppo Pembroke 0.966327 13 1.0 Nameless Neutral
13 889638837579907072 2017-07-25 00:10:02 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... https://twitter.com/dog_rates/status/889638837... Ted https://pbs.twimg.com/media/DFihzFfXsAYGDPR.jpg 1.0 27606.0 4691.0 NaN None French_bulldog 0.991650 12 1.0 Ted Male
14 889531135344209921 2017-07-24 17:02:04 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... https://twitter.com/dog_rates/status/889531135... Stuart https://pbs.twimg.com/media/DFg_2PVW0AEHN3p.jpg 1.0 15319.0 2302.0 BarkWeek puppo golden_retriever 0.953442 13 1.0 Stuart Neutral
15 889278841981685760 2017-07-24 00:19:32 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... https://twitter.com/dog_rates/status/889278841... Oliver https://pbs.twimg.com/ext_tw_video_thumb/88927... 1.0 25698.0 5626.0 BarkWeek None whippet 0.626152 13 1.0 Oliver Neutral
16 888917238123831296 2017-07-23 00:22:39 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... https://twitter.com/dog_rates/status/888917238... Jim https://pbs.twimg.com/media/DFYRgsOUQAARGhO.jpg 1.0 29533.0 4670.0 NaN None golden_retriever 0.714719 12 1.0 Jim Male
17 888804989199671297 2017-07-22 16:56:37 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... https://twitter.com/dog_rates/status/888804989... Zeke https://pbs.twimg.com/media/DFWra-3VYAA2piG.jpg 1.0 25997.0 4524.0 NaN None golden_retriever 0.469760 13 1.0 Zeke Male
18 888554962724278272 2017-07-22 00:23:06 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... https://twitter.com/dog_rates/status/888554962... Ralphus https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg 3.0 20250.0 3710.0 NaN None Siberian_husky 0.700377 13 1.0 Ralphus Neutral
20 888078434458587136 2017-07-20 16:49:33 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... https://twitter.com/dog_rates/status/888078434... Gerald https://pbs.twimg.com/media/DFMWn56WsAAkA7B.jpg 1.0 22129.0 3625.0 NaN None French_bulldog 0.995026 12 1.0 Gerald Male

Set Null Values in Various Columns

In [41]:
df.loc[df['gender'] == 'Neutral', 'gender'] = None
df.loc[df['names'] == 'Nameless', 'names'] = None
df.loc[df['breed'] == 'Unidentifiable', 'breed'] = None
df.loc[df['dog_type'] == 'None', 'dog_type'] = None
df.loc[df['rating'] == 0.0, 'rating'] = np.nan
df.loc[df['confidence'] == 0.0, 'confidence'] = np.nan
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1994 entries, 0 to 2355
Data columns (total 18 columns):
tweet_id         1994 non-null int64
timestamp        1994 non-null datetime64[ns]
source           1994 non-null object
text             1994 non-null object
expanded_urls    1994 non-null object
name             1994 non-null object
jpg_url          1994 non-null object
img_num          1994 non-null float64
favorites        1994 non-null float64
retweet_count    1994 non-null float64
hashtag          22 non-null object
dog_type         369 non-null object
breed            1686 non-null object
confidence       1686 non-null float64
rating           1992 non-null object
dog_count        1994 non-null float64
names            1369 non-null object
gender           707 non-null object
dtypes: datetime64[ns](1), float64(5), int64(1), object(11)
memory usage: 296.0+ KB
In [42]:
df.to_csv('twitter_archive_master.csv', encoding = 'utf-8') #saved