CLUSTERING : How Do You Make Your Tweets? etc
save as settings.py
code :
access_token='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_secret='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_key='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_secret='xxxxxxxxxxxxxxxxxxxxxxx'
database = 'twitterutils'
(Create a database named 'twitterutils' in CouchDB before running the scripts below.)
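The scripts below assume this database already exists. Here is a minimal sketch, using the couchdb-python package, that creates it if it is missing (the file name create_db.py is my choice, not from the original post):
save as create_db.py
code :
import couchdb
import settings

server = couchdb.Server()  # defaults to http://127.0.0.1:5984/
if settings.database not in server:
    # create the 'twitterutils' database on first run
    server.create(settings.database)
print(server[settings.database].info())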
----------------------------------------------------------------------------------------------------------------------------
save as tweet_sources.py
code :
import re
from collections import defaultdict
from argparse import ArgumentParser
from matplotlib import pyplot
from couchdb import Server
import settings
def get_arguments():
    parser = ArgumentParser(description="Show a graph with the tweet source popularity")
    parser.add_argument("-start", dest="start", nargs="?",
                        help="Start date in YYYY-MM-DD HH:MM format")
    parser.add_argument("-end", dest="end", nargs="?",
                        help="End date in YYYY-MM-DD HH:MM format")
    parser.add_argument("-top", default=10, type=int,
                        help="Number of the most significant sources to show")
    parser.add_argument("-pie", action="store_true",
                        help="Show a pie chart instead")
    return parser.parse_args()
def main():
    args = get_arguments()
    server = Server()
    db = server[settings.database]
    date_pattern = r"(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2}):(\d{2})"
    if args.start:
        date_match = re.match(date_pattern, args.start)
        if not date_match:
            print("Invalid start date format")
            exit(-1)
        start_key = [int(group) for group in date_match.groups()]
    else:
        start_key = []
    if args.end:
        date_match = re.match(date_pattern, args.end)
        if not date_match:
            print("Invalid end date format")
            exit(-1)
        # the trailing {} sorts after any value, so the end minute stays inclusive
        end_key = [int(group) for group in date_match.groups()] + [{}]
    else:
        end_key = [{}]
    sources = defaultdict(int)
    # the Twitter API returns the source as an HTML link, e.g.
    # <a href="http://twitter.com/download/iphone">Twitter for iPhone</a>
    url_pattern = r"<a[^>]*>(.*?)</a>"
    for row in db.view("sources/sources", reduce=False, start_key=start_key, end_key=end_key):
        # if the source string contains a link, extract the text from it
        source_match = re.match(url_pattern, row.value['source'])
        if source_match:
            sources[source_match.group(1)] += 1
        else:
            sources[row.value['source']] += 1
    sorted_items = sorted(sources.items(), key=lambda x: x[1], reverse=True)
    top_items, rest = sorted_items[:args.top], sorted_items[args.top:]
    counts = [item[1] for item in top_items]
    source_types = [item[0] for item in top_items]
    if rest:
        # lump everything below the top N into a single "other" entry
        counts.append(sum(item[1] for item in rest))
        source_types.append("other")
    if args.pie:
        pyplot.pie(counts, labels=source_types, autopct="%.2f%%",
                   colors=["b", "g", "r", "c", "m", "y", "w"])
    else:
        pyplot.barh(range(len(counts)), counts, align="center")
        pyplot.yticks(range(len(counts)), source_types)
    pyplot.tight_layout()
    pyplot.show()

if __name__ == "__main__":
    main()
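Neither the sources/sources view queried above nor the timezones/by_date view used by tweet_timezones.py below is shown in the post. What follows is a sketch of what those design documents might look like, pushed from Python. The map functions are my assumptions (keys starting with [year, month, day, hour, minute] parsed from each tweet's created_at, matching the start/end keys the scripts build), so adapt them to however your views are actually defined. Note that JavaScript's Date may not parse Twitter's created_at format in every CouchDB build; storing a pre-parsed date on each document is more robust.
save as install_views.py (a hypothetical name)
code :
import couchdb
import settings

# Assumed map functions: each emits a key starting with
# [year, month, day, hour, minute] taken from the tweet's created_at,
# which is what the start/end keys built by the query scripts expect.
SOURCES_MAP = """
function(doc) {
    if (doc.doc_type == "tweet" && doc.source) {
        var d = new Date(doc.created_at);
        emit([d.getUTCFullYear(), d.getUTCMonth() + 1, d.getUTCDate(),
              d.getUTCHours(), d.getUTCMinutes()],
             {source: doc.source});
    }
}
"""

TIMEZONES_MAP = """
function(doc) {
    if (doc.doc_type == "tweet" && doc.user) {
        var d = new Date(doc.created_at);
        emit([d.getUTCFullYear(), d.getUTCMonth() + 1, d.getUTCDate(),
              d.getUTCHours(), d.getUTCMinutes(), doc.user.time_zone],
             1);
    }
}
"""

def save_design_doc(db, doc_id, doc):
    # carry over the existing _rev (if any) so the script can be re-run
    existing = db.get(doc_id)
    if existing:
        doc["_rev"] = existing["_rev"]
    db[doc_id] = doc

def main():
    server = couchdb.Server()
    db = server[settings.database]
    # tweet_sources.py queries this view with reduce=False, so it gets a reduce
    save_design_doc(db, "_design/sources",
                    {"views": {"sources": {"map": SOURCES_MAP, "reduce": "_count"}}})
    save_design_doc(db, "_design/timezones",
                    {"views": {"by_date": {"map": TIMEZONES_MAP}}})

if __name__ == "__main__":
    main()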
-----------------------------------------------------------------------------------------------------------------------------
save as tweet_timezones.py
code :
from collections import defaultdict
from argparse import ArgumentParser
from couchdb import Server
from twitterutils.date_utils import parse_date_string
import settings
def get_arguments():
    parser = ArgumentParser(description="Show the tweets timezones")
    parser.add_argument("--start", action="store", dest="start", nargs="?",
                        help="Start date")
    parser.add_argument("--end", action="store", dest="end", nargs="?",
                        help="End date")
    return parser.parse_args()

def main():
    args = get_arguments()
    if args.start:
        start_date = parse_date_string(args.start.strip())
        if not start_date:
            print("Invalid start date")
            exit(-1)
    else:
        start_date = []
    if args.end:
        end_date = parse_date_string(args.end.strip())
        if not end_date:
            print("Invalid end date")
            exit(-1)
        # a trailing {} sorts after any timezone, keeping the end minute inclusive
        end_date = end_date + [{}]
    else:
        # {} sorts after every array key, so an empty end bound matches everything
        end_date = {}
    server = Server()
    db = server[settings.database]
    timezones = defaultdict(int)
    for row in db.view("timezones/by_date", start_key=start_date, end_key=end_date):
        # the timezone is the last element of the emitted key
        timezone = row.key[-1]
        timezones[timezone] += 1
    sorted_timezones = sorted(timezones.items(), key=lambda x: x[1], reverse=True)
    for timezone, count in sorted_timezones:
        print("%s: %d" % (timezone, count))

if __name__ == "__main__":
    main()
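tweet_timezones.py imports parse_date_string from a twitterutils/date_utils.py module that the post never shows. Here is a guess at a minimal implementation, reusing the 'YYYY-MM-DD HH:MM' regex from tweet_sources.py; the return value (a list of ints usable as a CouchDB view key, or None) is an assumption based on how the script uses it:
save as twitterutils/date_utils.py
code :
import re

# same 'YYYY-MM-DD HH:MM' format as the regex in tweet_sources.py
DATE_PATTERN = r"(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2}):(\d{2})"

def parse_date_string(date_string):
    """Parse 'YYYY-MM-DD HH:MM' into [year, month, day, hour, minute]
    (usable as a CouchDB view key), or return None if the format is wrong."""
    match = re.match(DATE_PATTERN, date_string)
    if not match:
        return None
    return [int(group) for group in match.groups()]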
-------------------------------------------------------------------------------------------------------------------------
save as user_stream.py
code :
import json
from tweepy.streaming import StreamListener
from tweepy import Stream, OAuthHandler
import couchdb
import settings
class CouchDBStreamListener(StreamListener):
    def __init__(self, db):
        super(CouchDBStreamListener, self).__init__()
        self.db = db
        self.tweet_count = 0
        self.received_friend_ids = False

    def on_data(self, data):
        try:
            tweet = json.loads(data)
        except Exception:
            print("Failed to parse tweet data")
            tweet = None
        if tweet:
            if 'id' in tweet and 'text' in tweet:
                print("%s: %s" % (tweet['user']['screen_name'], tweet['text']))
                tweet['doc_type'] = "tweet"
                # store each tweet under a predictable document id
                self.db["tweet:%d" % tweet['id']] = tweet
                self.tweet_count += 1
            elif not self.received_friend_ids and 'friends' in tweet:
                print("Got %d user ids" % len(tweet['friends']))
                self.received_friend_ids = True
            else:
                print("Received a response that is not a tweet")
                print(tweet)
        return True

def main():
    auth = OAuthHandler(settings.consumer_key, settings.consumer_secret)
    auth.set_access_token(settings.access_token, settings.access_secret)
    server = couchdb.Server()
    db = server[settings.database]
    listener = CouchDBStreamListener(db)
    stream = Stream(auth, listener)
    try:
        stream.userstream()
    except KeyboardInterrupt:
        print("Total tweets received: %d" % listener.tweet_count)

if __name__ == "__main__":
    main()
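A caveat: this script targets the old tweepy 2.x/3.x API. Twitter shut down the user streams endpoint in August 2018, and userstream() no longer exists in tweepy 4.x, so treat it as historical. Once some tweets have been collected, a quick sanity check (the file name check_tweets.py is hypothetical) can read them back from CouchDB:
save as check_tweets.py
code :
import couchdb
import settings

# count the stored tweet documents and show a few of them
server = couchdb.Server()
db = server[settings.database]
tweet_ids = [doc_id for doc_id in db if doc_id.startswith("tweet:")]
print("%d tweets stored" % len(tweet_ids))
for doc_id in tweet_ids[:5]:
    tweet = db[doc_id]
    print("%s: %s" % (tweet['user']['screen_name'], tweet['text']))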
-----------------------------------------------------------------------------------------------------------------------------
Install CouchDB (a free download from couchdb.apache.org), then do the data mining. The built-in admin interface listens on port 5984:
http://127.0.0.1:5984/_utils/index.html
The data accumulate in the local database as the stream runs... so be patient.
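To confirm the server is reachable before starting the stream, a one-off check with couchdb-python (assuming the default local server):
code :
import couchdb

server = couchdb.Server("http://127.0.0.1:5984/")
print(server.version())  # prints the CouchDB version string if the server is up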