Welcome

Live processing contents

Wednesday, January 23, 2013

Mining Twitter with CouchDB

CLUSTERING: How Do You Tweet? (sources, time zones, etc.)

save as settings.py

code :


# Twitter API credentials -- fill in your own values
access_token='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_secret='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_key='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_secret='xxxxxxxxxxxxxxxxxxxxxxx'

# name of the CouchDB database where the tweets are stored
database = 'twitterutils'



Create a database named 'twitterutils' in CouchDB (the value of database in settings.py).
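If you prefer to create the database from Python rather than from the CouchDB web interface, here is a quick sketch, assuming a default local CouchDB server on port 5984:

import couchdb

import settings

# connect to the local CouchDB server (http://127.0.0.1:5984/ by default)
server = couchdb.Server()

# create the 'twitterutils' database if it does not exist yet
if settings.database not in server:
    server.create(settings.database)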
----------------------------------------------------------------------------------------------------------------------------
save as tweet_sources.py

code :

import re
from collections import defaultdict
from argparse import ArgumentParser

from matplotlib import pyplot
from couchdb import Server

import settings

def get_arguments():
    parser = ArgumentParser(description="Show a graph with the tweet source popularity")

    parser.add_argument("-start", dest="start",
                        nargs="?", help="Start date in YYYY-MM-DD HH:MM format")

    parser.add_argument("-end", dest="end",
                        nargs="?", help="End date in YYYY-MM-DD HH:MM format")

    parser.add_argument("-top", default=10, type=int,
                        help="Number of the most significant sources to show")

    parser.add_argument("-pie", action="store_true",
                        help="Show a pie chart instead")

    return parser.parse_args()

def main():
    args = get_arguments()
   
    server = Server()
    db = server[settings.database]

    date_pattern = r"(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2}):(\d{2})"

    if args.start:
        date_match = re.match(date_pattern, args.start)
        if not date_match:
            print("Invalid start date format")
            exit(-1)
        start_key = [int(date_match.group(1)),
                     int(date_match.group(2)),
                     int(date_match.group(3)),
                     int(date_match.group(4)),
                     int(date_match.group(5))]
    else:
        start_key = []


    if args.end:
        date_match = re.match(date_pattern, args.end)
        if not date_match:
            print("Invalid end date format")
            exit(-1)
        end_key = [int(date_match.group(1)),
                   int(date_match.group(2)),
                   int(date_match.group(3)),
                   int(date_match.group(4)),
                   int(date_match.group(5)), {}]
    else:
        end_key = [{}]

    sources = defaultdict(int)
    # the "source" field is an HTML anchor tag, e.g. <a href="...">Twitter for iPhone</a>
    url_pattern = r"<a.*?>(.*?)</a>"

    for row in db.view("sources/sources", reduce=False, start_key=start_key, end_key=end_key):
        # if the source string contains a link, extract the text from it
        source_match = re.match(url_pattern, row.value['source'])
        if source_match:
            sources[source_match.group(1)] += 1           
        else:
            sources[row.value['source']] += 1

    sorted_items = sorted(sources.items(), key=lambda x: x[1], reverse=True)
    top_items, rest = sorted_items[:args.top], sorted_items[args.top:]
    counts = [item[1] for item in top_items]
    source_types = [item[0] for item in top_items]
    other_sum = sum([item[1] for item in rest])
    counts.append(other_sum)
    source_types.append("other")

    if args.pie:
        pyplot.pie(counts, labels=source_types, autopct="%.2f%%",
                   colors=["b", "g", "r", "c", "m", "y", "w"])
    else:
        pyplot.barh(range(len(counts)), counts, align="center")
        pyplot.yticks(range(len(counts)), source_types)
        pyplot.tight_layout()

    pyplot.show()

if __name__ == "__main__":
    main()
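
tweet_sources.py reads from a CouchDB view named sources/sources, which is not defined anywhere in this post. The sketch below creates a design document that could back it; the map function is an assumption (key = [year, month, day, hour, minute] in UTC, value = a dict holding the tweet's source string, plus a _count reduce so that reduce=False is a valid query option), so adapt it to however your tweets actually look:

import couchdb

import settings

# map: key = [year, month, day, hour, minute] (UTC), value = {"source": ...}
# only index documents that user_stream.py stored as tweets
# note: Date parsing of Twitter's created_at format may need adjusting
# depending on your CouchDB's JavaScript engine
sources_map = """
function(doc) {
    if (doc.doc_type == "tweet" && doc.created_at && doc.source) {
        var d = new Date(doc.created_at);
        emit([d.getUTCFullYear(), d.getUTCMonth() + 1, d.getUTCDate(),
              d.getUTCHours(), d.getUTCMinutes()],
             {"source": doc.source});
    }
}
"""

server = couchdb.Server()
db = server[settings.database]

db["_design/sources"] = {
    "language": "javascript",
    "views": {
        "sources": {"map": sources_map, "reduce": "_count"}
    }
}

With the view in place you can run the script, for example: python tweet_sources.py -start "2013-01-20 00:00" -pie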


-----------------------------------------------------------------------------------------------------------------------------
save as tweet_timezones.py

code :

from collections import defaultdict
from argparse import ArgumentParser

from couchdb import Server

from twitterutils.date_utils import parse_date_string
import settings

def get_arguments():
    parser = ArgumentParser(description="Show the tweets timezones")

    parser.add_argument("--start", action="store", dest="start", nargs="?",
                        help="Start date")

    parser.add_argument("--end", action="store", dest="end", nargs="?",
                        help="End date")

    return parser.parse_args()

def main():
    args = get_arguments()

    if args.start:
        start_date = parse_date_string(args.start.strip())
        if not start_date:
            print("Invalid start date")
            exit(-1)
    else:
        start_date = []

    if args.end:
        end_date = parse_date_string(args.end.strip())
        if not end_date:
            print("Invalid end date")
            exit(-1)
    else:
        end_date = {}

    server = Server()
    db = server[settings.database]

    timezones = defaultdict(int)

    for row in db.view("timezones/by_date", start_key=start_date, end_key=end_date):
        timezone = row.key[-1]
        timezones[timezone] += 1

    sorted_timezones = sorted(timezones.items(), key=lambda x: x[1], reverse=True)
   
    for timezone, count in sorted_timezones:
        print("%s: %d" % (timezone, count))

if __name__ == "__main__":
    main()
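
tweet_timezones.py imports parse_date_string from twitterutils.date_utils, a module this post never shows, and it also expects a timezones/by_date view whose keys start with the date parts and end with the tweet's time zone (built along the same lines as the sources view above). Here is a minimal sketch of what the date helper might look like, assuming it turns 'YYYY-MM-DD HH:MM' strings into [year, month, day, hour, minute] key lists (the real module may differ):

# hypothetical sketch of twitterutils/date_utils.py -- the original module is not shown in this post
import re

DATE_PATTERN = r"(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2}):(\d{2})"

def parse_date_string(date_string):
    """Turn 'YYYY-MM-DD HH:MM' into a [year, month, day, hour, minute] view key.

    Returns None when the string does not match the expected format.
    """
    match = re.match(DATE_PATTERN, date_string)
    if not match:
        return None
    return [int(group) for group in match.groups()]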

-------------------------------------------------------------------------------------------------------------------------


save as user_stream.py

code :

import json

from tweepy.streaming import StreamListener
from tweepy import Stream, OAuthHandler
import couchdb

import settings

class CouchDBStreamListener(StreamListener):
    def __init__(self, db):
        # keep tweepy's StreamListener initialisation (it sets up self.api)
        super(CouchDBStreamListener, self).__init__()
        self.db = db
        self.tweet_count = 0
        self.received_friend_ids = False

    def on_data(self, data):
        try:
            tweet = json.loads(data)
        except Exception:
            print("Failed to parse tweet data")
            tweet = None

        if tweet:
            if 'id' in tweet and 'text' in tweet:
                print("%s: %s" % (tweet['user']['screen_name'], tweet['text']))

                tweet['doc_type'] = "tweet"

                self.db["tweet:%d" % tweet['id']] = tweet

                self.tweet_count += 1
            elif not self.received_friend_ids and 'friends' in tweet:
                print("Got %d user ids" % len(tweet['friends']))
                self.received_friend_ids = True
            else:
                print("Received a responce that is not a tweet")
                print tweet

        return True

def main():
    auth = OAuthHandler(settings.consumer_key,
                        settings.consumer_secret)

    auth.set_access_token(settings.access_token,
                          settings.access_secret)

    server = couchdb.Server()
    db = server[settings.database]

    listener = CouchDBStreamListener(db)

    stream = Stream(auth, listener)
   
    try:
        stream.userstream()
    except KeyboardInterrupt:
        print("Total tweets received: %d" % listener.tweet_count)

if __name__ == "__main__":
    main()
-----------------------------------------------------------------------------------------------------------------------------



Install CouchDB (free download from couchdb.apache.org), then do the data mining.
The CouchDB web interface runs on port 5984:
http://127.0.0.1:5984/_utils/index.html



The data accumulate in the database gradually while the stream runs, so be patient.
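
To see how many documents the stream has collected so far, a quick check from the Python shell (assuming the default local server and the settings.py above):

import couchdb

import settings

server = couchdb.Server()
db = server[settings.database]

# each stored tweet is a document with doc_type == "tweet" and an id like "tweet:<id>"
print("documents in %s: %d" % (settings.database, len(db)))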
