Build index which will cover queries

The steps in our application are:

  • Tweets are fetched and stored in a MongoDB collection, viz. search_tweets
  • Data is read from search_tweets, several parameters are calculated, and the results are stored in a different collection, viz. Twitter_Processed. This collection has far fewer keys, and most of them are calculated fields
  • Sample document - search_tweets

      "_id" : ObjectId("5372fae4e4b0e6088a20a4f4"),
      "contributors" : null,
      "text" : "Stay far from #Massachusetts General Hospital. It secretly imposes #unilateral #DNRs against patient & family wishes.",
      "geo" : null,
      "retweeted" : false,
      "in_reply_to_screen_name" : null,
      "possibly_sensitive" : false,
      "truncated" : false,
      "lang" : "en",
      "entities" : {
        "symbols" : [],
        "urls" : [{
            "expanded_url" : "",
            "indices" : [122, 144],
            "display_url" : "…",
            "url" : ""
          }],
        "hashtags" : [{
            "text" : "Massachusetts",
            "indices" : [14, 28]
          }, {
            "text" : "unilateral",
            "indices" : [67, 78]
          }, {
            "text" : "DNRs",
            "indices" : [79, 84]
          }],
        "user_mentions" : []
      },
      "in_reply_to_status_id_str" : null,
      "id" : NumberLong("466053869173088258"),
      "source" : "web",
      "in_reply_to_user_id_str" : null,
      "favorited" : false,
      "in_reply_to_status_id" : null,
      "retweet_count" : 0,
      "created_at" : "Tue May 13 03:14:35 +0000 2014",
      "in_reply_to_user_id" : null,
      "favorite_count" : 0,
      "id_str" : "466053869173088258",
      "place" : null,
      "user" : {
        "location" : "Gainesville, VA",
        "default_profile" : false,
        "profile_background_tile" : true,
        "statuses_count" : 3978,
        "lang" : "en",
        "profile_link_color" : "E82012",
        "profile_banner_url" : "",
        "id" : 137925469,
        "following" : false,
        "protected" : false,
        "favourites_count" : 156,
        "profile_text_color" : "E0B392",
        "description" : "Saved by faith in Jesus and ministering at nursing homes with my wife, Germaine.  We pray for the abolition of abortion and all forms of slavery (trafficking).",
        "verified" : false,
        "contributors_enabled" : false,
        "profile_sidebar_border_color" : "FFFFFF",
        "name" : "Lawrence Sylvain",
        "profile_background_color" : "F9F4E1",
        "created_at" : "Wed Apr 28 04:24:00 +0000 2010",
        "is_translation_enabled" : false,
        "default_profile_image" : false,
        "followers_count" : 157,
        "profile_image_url_https" : "",
        "geo_enabled" : false,
        "profile_background_image_url" : "",
        "profile_background_image_url_https" : "",
        "follow_request_sent" : false,
        "entities" : {
          "description" : {
            "urls" : []
          },
          "url" : {
            "urls" : [{
                "expanded_url" : "",
                "indices" : [0, 22],
                "display_url" : "",
                "url" : ""
              }]
          }
        },
        "url" : "",
        "utc_offset" : -18000,
        "time_zone" : "Lima",
        "notifications" : false,
        "profile_use_background_image" : true,
        "friends_count" : 331,
        "profile_sidebar_fill_color" : "BE9E78",
        "screen_name" : "ljsylvain",
        "id_str" : "137925469",
        "profile_image_url" : "",
        "listed_count" : 0,
        "is_translator" : false
      },
      "coordinates" : null,
      "metadata" : {
        "result_type" : "recent",
        "iso_language_code" : "en"
      },
      "token" : {
        "CONSUMER_SECRET" : "4FsHhU2KuYsbowCJsuZ4RtsUq4rpLQQcQAGeXkIZqY",
        "ACCESS_TOKEN" : "2362487558-EflbK1NLJMjhAQnXQHkmRwMCwqBBlZ2Y0KRnydf",
        "CONSUMER_KEY" : "cmZaUbCeDpb9SWCwUlCNsA",
        "ACCESS_SECRET" : "gNvLl6n3gXwXBl7GZfHpmUMreqNn1OULz30SpT29jAYwy",
        "APP_ID" : "3"
      },
      "searchProfileId" : 905,
      "customerId" : 0,
      "schedularId" : 32446,
      "userId" : "395",
      "subject" : "Massachusetts General Hospital ",
      "context" : "Healthcare",
      "tagId" : 0,
      "domain" : "General",
      "uniqueId" : 933,
      "message" : "Stay far from #Massachusetts General Hospital. It secretly imposes #unilateral #DNRs against patient & family wishes.",
      "searchkeyword" : "Massachusetts General Hospital ",
      "connectortype" : "TWITTER"

    Sample document - Twitter_Processed

      "_id" : {
        "SpId" : 905,
        "Channel_Id" : 0,
        "ActivityId" : "137925469",
        "SchedularId" : 32446,
        "UniqueID" : 933
      },
      "Cust_Id" : " 0",
      "Domain_Id" : " General",
      "searchkeyword" : " Massachusetts General Hospital ",
      "Sentiment" : 0.42,
      "Intention_cause" : "None",
      "Intention_category" : "None",
      "Emotion_category" : "neutral",
      "AgeGroup" : "26-35",
      "Gender" : "M",
      "Location" : {
        "Country" : "United Kingdom",
        "Lat" : "54.7136300",
        "Long" : "-6.2142800",
        "City" : "Northern Ireland"
      },
      "message" : "Stay far from Massachusetts General Hospital. It secretly imposes unilateral DNRs against patient &amp family wishes. httpt.cohDuK96gqyU",
      "PostDate" : ISODate("2014-05-12T21:44:35Z"),
      "DetectedLanguage" : "English",
      "ProcessedText" : "%*Stay*% far from Massachusetts General Hospital . ==0.01:::It secretly imposes unilateralDNRs against %*patient*% &amp family wishes . ==1.25:::httpt.cohDuK96gqy U ==0:::",
      "IdeaCloudText" : " PS_stay PS_patient",
      "IdeaCloudTopicSpoken" : "  Massachusetts General unilateralDNRs patient family",
      "userid" : " ljsylvain",
      "followers" : 157,
      "like_count" : 0,
      "plusone_count" : 0

    After reading the documentation, how should I build an index (or several indexes) when:

  • The keys "searchProfileId" and "schedularId" are used in find/$match
  • The keys "text", "user" (which has several sub-documents), "searchProfileId", "schedularId", "uniqueId" and "id_str" are returned
  • The "text" field holds the tweet text, which is effectively random - does it make sense to include it in a compound index? If not, will building a text index help in any way (NO text search is done within the value of "text")?
  • Basically, I want to ensure that the queries are covered.


