Check for repeated tweets in MongoDB

2

I have a method that, thanks to Twython, saves the tweets in MongoDB as it is in my question Maintaining a mongodb with tweets that match a given tag

def getSearchTagTwitter(hashtag):
    db = connexMongoDB()
    t = loginTwython()
    search = t.search(q=hashtag, count=100)
    data = search['statuses']
    try:
        db.twittersearch.create_index('id_str')
        for tweet in data:
            try :
                db.twittersearch.insert_one(tweet)
            except :
                db.twittersearch.update_one({"id_str": tweet['id_str']}, tweet) 
    except Exception:
        print "Error al buscar hashtag"
        time.sleep(60*15) #15 minutos
        getSearchTagTwitter(hashtag)

I think it does not work correctly and I want to check if the value of id_str is not repeated through the MongoDB shell and / or from Python. I tried the following but it does not work for me:

db.twittersearch.find({'id_str':{$in:["numerodeid_str"]}})

Edit: I simplify the question: From Python, how can I check if I do not have duplicates in an already created mongodb? I currently connect with pymongo, and I can see that I created the collection.

    
asked by Elena 17.02.2016 в 18:09
source

2 answers

1

To record in your MongoDB collection you are using id_str :

db.twittersearch.create_index('id_str')
# ... 
db.twittersearch.update_one({"id_str": tweet['id_str']}, tweet) 

And when doing the query you are using the wrong field str_id (unlike id_str ):

db.twittersearch.find({'str_id':{$in:["numerodestr_id"]}})

The correct thing would be:

db.twittersearch.find({'id_str':{$in:["numerodestr_id"]}})

Unless, of course, it's just an error of typing or copying / pasting

Update after the edition

I have created a simple script to replicate your case using the hashtag python and getting only 10 tweets:

# -*- coding: utf-8 -*-
from pymongo import MongoClient
from twython import Twython


client = MongoClient('localhost', 27017)
db = client.test

CONSUMER_KEY = 'xxxxxxxxxxxxxxxxxx'
CONSUMER_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxx'

def get_tweets(hashtag='wtf'):
    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)
    search = twitter.search(q=hashtag, count=10)
    data = search['statuses']
    for tweet in data:
        try:
            db.twittersearch.insert_one(tweet)
        except Exception, e:
            print "Error al insertar: %s" % e
            db.twittersearch.update_one({'id_str': tweet['id_str']}, tweet)

if __name__ == '__main__':
    get_tweets(hashtag='python')

And it does not give me problems, if I do some test queries in the MongoDB console:

> db.twittersearch.find({}, {"id_str": 1, "_id": 0})
{ "id_str" : "700315462568120320" }
{ "id_str" : "700315461850804224" }
{ "id_str" : "700315438169747457" }
{ "id_str" : "700315421900148736" }
{ "id_str" : "700315421887619076" }
{ "id_str" : "700315350299049988" }
{ "id_str" : "700315332838301698" }
{ "id_str" : "700315321689833473" }
{ "id_str" : "700315301594796032" }
{ "id_str" : "700315293177008128" }

> db.twittersearch.find({"id_str": {$in: ["700315461850804224"]}})
{ "_id" : ObjectId("56c5caae2fbb7114134d0bac"), "contributors" : null, "truncated" : false, "text" : "RT @dmeishappy_: กระเป๋าของฟานี่คือ DIORAMA MINI BAG \nMETALLIC BLUE PYTHON ราคาราวๆ 98,000~ บาท #WelcomeTIFFANYtoThailand https://t.co/UnDp…", "is_quote_status" : false, "in_reply_to_status_id" : null, "id" : NumberLong("700315461850804224"), "favorite_count" : 0, "source" : "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", "retweeted" : false, "coordinates" : null, "entities" : { "symbols" : [ ], "user_mentions" : [ { "id" : 299006216, "indices" : [ 3, 15 ], "id_str" : "299006216", "screen_name" : "dmeishappy_", "name" : "น้องสดใส" } ], "hashtags" : [ { "indices" : [ 96, 121 ], "text" : "WelcomeTIFFANYtoThailand" } ], "urls" : [ ], "media" : [ { "source_user_id" : 299006216, "source_status_id_str" : "700301141263806464", "expanded_url" : "http://twitter.com/dmeishappy_/status/700301141263806464/photo/1", "display_url" : "pic.twitter.com/UnDpsznRJ6", "url" : "https://t.co/UnDpsznRJ6", "media_url_https" : "https://pbs.twimg.com/media/Cbf4Ka0UkAEWmhH.jpg", "source_user_id_str" : "299006216", "source_status_id" : NumberLong("700301141263806464"), "id_str" : "700301124960555009", "sizes" : { "large" : { "h" : 1024, "resize" : "fit", "w" : 669 }, "small" : { "h" : 520, "resize" : "fit", "w" : 340 }, "medium" : { "h" : 918, "resize" : "fit", "w" : 600 }, "thumb" : { "h" : 150, "resize" : "crop", "w" : 150 } }, "indices" : [ 122, 140 ], "type" : "photo", "id" : NumberLong("700301124960555009"), "media_url" : "http://pbs.twimg.com/media/Cbf4Ka0UkAEWmhH.jpg" } ] }, "in_reply_to_screen_name" : null, "in_reply_to_user_id" : null, "retweet_count" : 152, "id_str" : "700315461850804224", "favorited" : false, "retweeted_status" : { "contributors" : null, "truncated" : false, "text" : "กระเป๋าของฟานี่คือ DIORAMA MINI BAG \nMETALLIC BLUE PYTHON ราคาราวๆ 98,000~ บาท #WelcomeTIFFANYtoThailand https://t.co/UnDpsznRJ6", "is_quote_status" : false, "in_reply_to_status_id" : null, "id" : NumberLong("700301141263806464"), "favorite_count" : 13, "source" : "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>", "retweeted" : false, "coordinates" : null, "entities" : { "symbols" : [ ], "user_mentions" : [ ], "hashtags" : [ { "indices" : [ 79, 104 ], "text" : "WelcomeTIFFANYtoThailand" } ], "urls" : [ ], "media" : [ { "expanded_url" : "http://twitter.com/dmeishappy_/status/700301141263806464/photo/1", "display_url" : "pic.twitter.com/UnDpsznRJ6", "url" : "https://t.co/UnDpsznRJ6", "media_url_https" : "https://pbs.twimg.com/media/Cbf4Ka0UkAEWmhH.jpg", "id_str" : "700301124960555009", "sizes" : { "large" : { "h" : 1024, "resize" : "fit", "w" : 669 }, "small" : { "h" : 520, "resize" : "fit", "w" : 340 }, "medium" : { "h" : 918, "resize" : "fit", "w" : 600 }, "thumb" : { "h" : 150, "resize" : "crop", "w" : 150 } }, "indices" : [ 105, 128 ], "type" : "photo", "id" : NumberLong("700301124960555009"), "media_url" : "http://pbs.twimg.com/media/Cbf4Ka0UkAEWmhH.jpg" } ] }, "in_reply_to_screen_name" : null, "in_reply_to_user_id" : null, "retweet_count" : 152, "id_str" : "700301141263806464", "favorited" : false, "user" : { "follow_request_sent" : null, "has_extended_profile" : true, "profile_use_background_image" : true, "default_profile_image" : false, "id" : 299006216, "profile_background_image_url_https" : "https://pbs.twimg.com/profile_background_images/449190232508747776/eVbVkbBq.png", "verified" : false, "profile_text_color" : "3E3417", "profile_image_url_https" : "https://pbs.twimg.com/profile_images/699491224382676992/1RCFrGlC_normal.jpg", "profile_sidebar_fill_color" : "6E096E", "entities" : { "description" : { "urls" : [ ] } }, "followers_count" : 12731, "profile_sidebar_border_color" : "000000", "id_str" : "299006216", "profile_background_color" : "ECEEE0", "listed_count" : 15, "is_translation_enabled" : false, "utc_offset" : -28800, "statuses_count" : 123360, "description" : "ก็แค่ทวิตเตอร์จำเป็นต้องจริงจังปร้าาาาา", "friends_count" : 260, "location" : "คิดให้ดีๆก่อนฟอลโล่ว", "profile_link_color" : "E81C4F", "profile_image_url" : "http://pbs.twimg.com/profile_images/699491224382676992/1RCFrGlC_normal.jpg", "following" : null, "geo_enabled" : true, "profile_banner_url" : "https://pbs.twimg.com/profile_banners/299006216/1455606704", "profile_background_image_url" : "http://pbs.twimg.com/profile_background_images/449190232508747776/eVbVkbBq.png", "screen_name" : "dmeishappy_", "lang" : "en", "profile_background_tile" : false, "favourites_count" : 4013, "name" : "น้องสดใส", "notifications" : null, "url" : null, "created_at" : "Sun May 15 10:19:10 +0000 2011", "contributors_enabled" : false, "time_zone" : "Pacific Time (US & Canada)", "protected" : false, "default_profile" : false, "is_translator" : false }, "geo" : null, "in_reply_to_user_id_str" : null, "possibly_sensitive" : false, "lang" : "und", "created_at" : "Thu Feb 18 12:49:37 +0000 2016", "in_reply_to_status_id_str" : null, "place" : null, "metadata" : { "iso_language_code" : "und", "result_type" : "recent" } }, "user" : { "follow_request_sent" : null, "has_extended_profile" : true, "profile_use_background_image" : true, "default_profile_image" : false, "id" : 594309538, "profile_background_image_url_https" : "https://pbs.twimg.com/profile_background_images/568400749038534656/SJRD7Zj6.jpeg", "verified" : false, "profile_text_color" : "3E4415", "profile_image_url_https" : "https://pbs.twimg.com/profile_images/694513699344687106/DCKLb8Mz_normal.jpg", "profile_sidebar_fill_color" : "99CC33", "entities" : { "url" : { "urls" : [ { "url" : "https://t.co/TKchFkPats", "indices" : [ 0, 23 ], "expanded_url" : "http://instagram.com/beastly_snowy", "display_url" : "instagram.com/beastly_snowy" } ] }, "description" : { "urls" : [ ] } }, "followers_count" : 319, "profile_sidebar_border_color" : "FFFFFF", "id_str" : "594309538", "profile_background_color" : "352726", "listed_count" : 10, "is_translation_enabled" : false, "utc_offset" : 25200, "statuses_count" : 79299, "description" : "#SONE @GirlsGeneration || Hwang Miyoung Stepanie Tiffany || // Taeny // Taeyeon Jessica Sunny Hyoyeon Yuri Sooyoung  YoonA SeoHyun // #notep #เอมน้ำ #OT9", "friends_count" : 255, "location" : "#Kamphaeng Phet Thailand", "profile_link_color" : "D02B54", "profile_image_url" : "http://pbs.twimg.com/profile_images/694513699344687106/DCKLb8Mz_normal.jpg", "following" : null, "geo_enabled" : true, "profile_banner_url" : "https://pbs.twimg.com/profile_banners/594309538/1452856105", "profile_background_image_url" : "http://pbs.twimg.com/profile_background_images/568400749038534656/SJRD7Zj6.jpeg", "screen_name" : "yeppoNOOO", "lang" : "en", "profile_background_tile" : true, "favourites_count" : 3413, "name" : "ONIAP° #ขุ่นแม่มา♥", "notifications" : null, "url" : "https://t.co/TKchFkPats", "created_at" : "Wed May 30 04:51:51 +0000 2012", "contributors_enabled" : false, "time_zone" : "Bangkok", "protected" : false, "default_profile" : false, "is_translator" : false }, "geo" : null, "in_reply_to_user_id_str" : null, "possibly_sensitive" : false, "lang" : "und", "created_at" : "Thu Feb 18 13:46:31 +0000 2016", "in_reply_to_status_id_str" : null, "place" : null, "metadata" : { "iso_language_code" : "und", "result_type" : "recent" } }

I think the problem is on the other hand, maybe there is something else in your code that is happening to us.

    
answered by 17.02.2016 / 18:17
source
1

I add another solution that I found, which is to use the update with the value upsert to True. This will be overwritten if there is a duplicate and a new record will be created if it does not exist.

db.twittersearch.update({'id_str': tweet['id_str']}, tweet, upsert=True)
    
answered by 15.03.2016 в 16:24