Index: www/sites/all/modules/custom/np_scan/np_scan.analytics.inc =================================================================== --- www/sites/all/modules/custom/np_scan/np_scan.analytics.inc (revision 34222) +++ www/sites/all/modules/custom/np_scan/np_scan.analytics.inc (working copy) @@ -19,71 +19,6 @@ } /** - * Helper function to get the name of the velocity field. - * - * @param $timeframe time in hours 24, 48, 168,720 - * @param $collection_name the name of the mongo collection - * @return string, the name of the velocity field - */ -function _np_scan_analytics_get_velocity_field($timeframe, $collection_name) { - $velocity_field = ''; - - if ($collection_name == 'scanurl') { - switch ($timeframe) { - case 24: // day - $velocity_field = 'hours.velocity24'; - break; - case 168: //week - $velocity_field = 'days.velocity7'; - break; - case 720: // month - case 'all': - $velocity_field = 'days.velocity'; - break; - default: // biday - $velocity_field = 'hours.velocity'; - break; - } - } - elseif ($collection_name == 'scan') { - switch ($timeframe) { - case 24: // day - $velocity_field = 'hours.velocity'; - break; - case 168: //week - $velocity_field = 'days.velocity7'; - break; - case 720: // month - case 'all': - $velocity_field = 'days.velocity'; - break; - default: // biday - $velocity_field = 'hours.velocity48'; - break; - } - } - elseif (in_array($collection_name, array('hashtag', 'keyword', 'retweet'))) { - switch ($timeframe) { - case 24: // day - $velocity_field = 'hours.velocity'; - break; - case 168: //week - $velocity_field = 'days.velocity'; // no stats use month - break; - case 720: // month - case 'all': - $velocity_field = 'days.velocity'; - break; - default: // biday - $velocity_field = 'days.velocity'; // no stats use month - break; - } - } - - return $velocity_field; -} - -/** * Menu callback; property statistics. * * $page - which page are we displaying results for? 
@@ -114,6 +49,7 @@ switch ($page) { case 'views': + $velocity_field_2 = ($timeframe == 24) ? 'velocity.hours' : 'velocity.days'; $args = array_merge(array($scan_status), $client_ids ); scan_api_set_active_shard('misc'); $key = 'vid'; @@ -135,36 +71,15 @@ break; case 'links': - $velocity_field = _np_scan_analytics_get_velocity_field($timeframe, 'scanurl'); - $fields = array('scan_id' => 1); - $query = array( - 'scan.client_id' => array('$in' => $client_ids), - 'scan.status' => $scan_status, - ); - if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { - try { - $results = $cursor - ->find($query, $fields) - ->sort(array($velocity_field => -1)) - ->limit($limit) - ->timeout(scan_api_get_mongo_timeout()); - foreach ($results as $row) { - $scans[$row['scan_id']] = FALSE; - } - } - catch (MongoCursorTimeoutException $e) { - } - } - break; - default: - $velocity_field = _np_scan_analytics_get_velocity_field($timeframe, 'scan'); + $velocity_field = ($page == 'links') ? 'velocity.hours_urls' : 'velocity.hours_scan'; + $velocity_field_2 = 'velocity.days'; // @@@ V2 Maybe we should standardize on 48h across the board? 
$fields = array('scan_id' => 1); $query = array( 'scan.client_id' => array('$in' => $client_ids), 'scan.status' => $scan_status, ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan.client_id[], scan.status / velocity.hours_urls, velocity.hours_scan try { $results = $cursor ->find($query, $fields) @@ -203,24 +118,23 @@ // pour together hashtag and keyword velocities foreach (array('keyword', 'hashtag') as $collection_name) { if ($words[$collection_name]) { - $velocity_field = _np_scan_analytics_get_velocity_field($timeframe, $collection_name); - list($interval, $tmp) = explode('.', $velocity_field); - $fields = array('word' => 1, $velocity_field => 1); + list($tmp, $interval) = explode('.', $velocity_field_2); + $fields = array('word' => 1, $velocity_field_2 => 1); $query = array( 'scan_id' => intval($statistics->scan_id), 'word' => array('$in' => array_map('strtolower', $words[$collection_name])), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id, word[] / velocity.hours:-1, velocity.days:-1 try { $results = $cursor ->find($query, $fields) - ->sort(array($velocity_field => -1)) + ->sort(array($velocity_field_2 => -1)) ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { if ($collection_name == 'hashtag') { $row['word'] = '#' . $row['word']; }; - $word_velocities[$row['word']] = isset($row[$interval]['velocity']) ? round($row[$interval]['velocity'], VELOCITY_PRECISION) : 0; + $word_velocities[$row['word']] = isset($row['velocity'][$interval]) ? round($row['velocity'][$interval], VELOCITY_PRECISION) : 0; } } catch (MongoCursorTimeoutException $e) { @@ -248,60 +162,43 @@ 'timestamp' => strtotime('-1 day'), ); - // fill up velocity and difference + // fill up velocity and difference / stats. 
$statistics->velocity = 0; $statistics->difference = 0; - $fields = array('minutes.velocity' => 1, 'minutes.prev_velocity' => 1); - $query = array( - 'scan_id' => intval($statistics->scan_id), - ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { - try { - $results = $cursor - ->find($query, $fields) - ->timeout(scan_api_get_mongo_timeout()); - if ($results->hasNext()) { - $row = $results->getNext(); - $velocity = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; - $prev_velocity = isset($row['minutes']['prev_velocity']) ? $row['minutes']['prev_velocity'] * 12 : 0; - $statistics->velocity = $velocity; - $statistics->difference = $velocity - $prev_velocity; - } - } - catch (MongoCursorTimeoutException $e) { - } - } - - // Fill up the url uniq stats part. Initialize to 0. - $statistics->velocity = 0; $statistics->general = 0; $statistics->photo = 0; $statistics->video = 0; - $statistics->url_velocity = 0; - $fields = array( 'scan_id' => 1, - 'hours.velocity' => 1, - 'hours.general.velocity' => 1, - 'hours.photo.velocity' => 1, - 'hours.video.velocity' => 1, + // @@@ V2 Previous code disagrees on whether to use minutes or hours here. + //'velocity.minutes' => 1, + 'velocity.hours' => 1, + 'velocity.hours_general' => 1, + 'velocity.hours_photo' => 1, + 'velocity.hours_video' => 1, + 'velocity.hours_urls' => 1, + 'increasing' => 1, ); $query = array( 'scan_id' => intval($statistics->scan_id), ); - if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id / none try { $results = $cursor ->find($query, $fields) ->timeout(scan_api_get_mongo_timeout()); - foreach ($results as $row) { - $statistics->velocity = isset($row['hours']['velocity']) ? $row['hours']['velocity'] : 0; - $statistics->general = isset($row['hours']['general']['velocity']) ? $row['hours']['general']['velocity'] : 0; - $statistics->photo = isset($row['hours']['photo']['velocity']) ? 
$row['hours']['photo']['velocity'] : 0; - $statistics->video = isset($row['hours']['video']['velocity']) ? $row['hours']['video']['velocity'] : 0; - - $statistics->url_velocity = isset($row['hours']['velocity']) ? $row['hours']['velocity'] : 0; + if ($results->hasNext()) { + $row = $results->getNext(); + // @@@ V2 Previous code disagrees on whether to use minutes or hours here. + //$statistics->velocity = isset($row['velocity']['minutes']) ? $row['velocity']['minutes'] * 12 : 0; + $statistics->velocity = isset($row['velocity']['hours']) ? $row['velocity']['hours'] : 0; + $statistics->general = isset($row['velocity']['hours_general']) ? $row['velocity']['hours_general'] : 0; + $statistics->photo = isset($row['velocity']['hours_photo']) ? $row['velocity']['hours_photo'] : 0; + $statistics->video = isset($row['velocity']['hours_video']) ? $row['velocity']['hours_video'] : 0; + $statistics->url_velocity = isset($row['velocity']['hours_urls']) ? $row['velocity']['hours_urls'] : 0; + // @@@ V2 Teach everything about "increasing". + $statistics->difference = !empty($row['increasing']) ? 1 : -1; } } catch (MongoCursorTimeoutException $e) { @@ -365,15 +262,18 @@ $max = 0; $data = array(); - $order_field = ($order == 'trending') ? 'trending' : 'minutes.velocity'; + $order_field = ($order == 'trending') ?
'trending' : 'velocity.minutes'; $ordering_data = array(); foreach ($scan_ids as $scan_id) { - $fields = array('minutes.velocity' => 1, 'trending' => 1, 'word' => 1); + $fields = array( + 'velocity.minutes' => 1, + 'trending' => 1, + 'word' => 1, + ); $query = array( 'scan_id' => $scan_id, - 'word' => array('$exists' => TRUE), ); - if ($cursor = scan_api_get_mongo('scan_stats', $table_type)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($table_type)) { // V2r15 / keyword, hashtag / scan_id / trending:-1, velocity.minutes:-1 try { $results = $cursor ->find($query, $fields) @@ -382,11 +282,12 @@ ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { if (!isset($row['word'])) { + assert('Worker was speechless!'); // bug in workers. they write empty word records... bad continue; } - $velocity = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; + $velocity = isset($row['velocity']['minutes']) ? $row['velocity']['minutes'] * 12 : 0; $order_data = $order != 'trending' ? $velocity : $row['trending']; if (!isset($ordering_data[$row['word']]) || ($ordering_data[$row['word']] < $order_data)) { $data[$row['word']] = array( @@ -445,16 +346,16 @@ $query = array( 'scan_id' => array('$in' => $scan_ids), ); - if ($cursor = scan_api_get_mongo('scan_stats', $table_type)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($table_type)) { // V2r15 / keyword, hashtag / scan_id[] / velocity.minutes:-1 limit 1 try { $results = $cursor - ->find($query, array('minutes.velocity' => 1)) - ->sort(array('minutes.velocity' => -1)) + ->find($query, array('velocity.minutes' => 1)) + ->sort(array('velocity.minutes' => -1)) ->limit(1) ->timeout(scan_api_get_mongo_timeout()); if ($results->hasNext()) { $row = $results->getNext(); - $max = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; + $max = isset($row['velocity']['minutes']) ? 
$row['velocity']['minutes'] * 12 : 0; } } catch (MongoCursorTimeoutException $e) { @@ -462,15 +363,13 @@ } } else { - // MAX(ABS(trending) $max_trending = $min_trending = 0; $query = array( 'scan_id' => array('$in' => $scan_ids), - 'trending' => array('$ne' => -1000), ); // max trending - if ($cursor = scan_api_get_mongo('scan_stats', $table_type)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($table_type)) { // V2r15 / keyword, hashtag / scan_id[] / trending:-1, trending:1 limit 1 (two queries) try { $results = $cursor ->find($query, array('trending' => 1)) @@ -531,11 +430,10 @@ foreach (array('keyword', 'hashtag') as $collection_name) { $query = array( 'scan_id' => 0, - 'word' => array('$exists' => TRUE), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag - // max trending + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0 / trending:-1, velocity.minutes:-1 count 1 two queries try { + // max trending $results = $cursor ->find($query, array('trending' => 1)) ->sort(array('trending' => -1)) @@ -548,13 +446,13 @@ // max velocity $results = $cursor - ->find($query, array('minutes.velocity' => 1)) - ->sort(array('minutes.velocity' => -1)) + ->find($query, array('velocity.minutes' => 1)) + ->sort(array('velocity.minutes' => -1)) ->limit(1) ->timeout(scan_api_get_mongo_timeout()); if ($results->hasNext()) { $row = $results->getNext(); - $result[$collection_name]['velocity'] = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; + $result[$collection_name]['velocity'] = isset($row['velocity']['minutes']) ? $row['velocity']['minutes'] * 12 : 0; } } catch (MongoCursorTimeoutException $e) { @@ -575,7 +473,7 @@ /** * Helper function to get the pager's current page number. 
*/ -function _np_scan_analytics_get_page_number($db, $collection, $limit, $query = array(), $element = 0) { +function _np_scan_analytics_get_page_number($collection, $limit, $query = array(), $element = 0) { global $pager_page_array, $pager_total, $pager_total_items; // Initialize pager, see pager.inc. @@ -583,7 +481,7 @@ $page = isset($_GET['page']) ? $_GET['page'] : ''; $pager_page_array = explode(',', $page); $pager_total_items[$element] = 0; - if ($cursor = scan_api_get_mongo($db, $collection)) { // keyword, hashtag, (url -- disabled) + if ($cursor = scan_api_get_mongo($collection)) { // V2r15 / keyword, hashtag, (url -- disabled) / varies(_np_scan_analytics_get_page_number) / none (count) try { $pager_total_items[$element] = $cursor->find($query) ->timeout(scan_api_get_mongo_timeout()) @@ -605,13 +503,12 @@ $keyword_origin_access = user_access('access keyword origin'); $collection_name = $hashtag ? 'hashtag' : 'keyword'; - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0 / trending (paged query) $fields = array('scan_id' => 1, 'trending' => 1, 'word' => 1); $query = array( 'scan_id' => 0, - 'word' => array('$exists' => TRUE), ); - $pagenumber = $full ? _np_scan_analytics_get_page_number('scan_stats', $collection_name, $limit, $query) : 0; + $pagenumber = $full ? _np_scan_analytics_get_page_number($collection_name, $limit, $query) : 0; // V2r15 / hashtag, keyword / scan_id=0 / none (count) try { $results = $cursor ->find($query, $fields) @@ -659,26 +556,25 @@ $keyword_origin_access = user_access('access keyword origin'); $collection_name = $hashtag ? 
'hashtag' : 'keyword'; - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0 / velocity.minutes:-1 paged query $query = array( 'scan_id' => 0, - 'word' => array('$exists' => TRUE), ); - $pagenumber = $full ? _np_scan_analytics_get_page_number('scan_stats', $collection_name, $limit, $query) : 0; + $pagenumber = $full ? _np_scan_analytics_get_page_number($collection_name, $limit, $query) : 0; // V2r15 / keyword, hashtag / scan_id=0 / none (count) - $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'word' => 1); + $fields = array('velocity.minutes' => 1, 'word' => 1); try { $results = $cursor ->find($query, $fields) - ->sort(array('minutes.velocity' => -1)) + ->sort(array('velocity.minutes' => -1)) ->skip($pagenumber * $limit) ->limit($limit) ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { $tweets[$row['word']] = array( - 'scan_id' => $row['scan_id'], - 'velocity' => isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0, + 'scan_id' => 0, + 'velocity' => isset($row['velocity']['minutes']) ? 
$row['velocity']['minutes'] * 12 : 0, 'word' => $row['word'], ); } @@ -715,13 +611,17 @@ } if ($tweets) { - - $fields = array('word' => 1, 'hours' => 1, 'updated' => 1); $query = array( 'scan_id' => 0, 'word' => array('$in' => array_map('strtolower', $words)), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + $fields = array( + 'word' => 1, + 'hours' => 1, + 'updated' => 1, + ); + + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0, word[] / none try { $results = $cursor ->find($query, $fields) @@ -753,7 +653,7 @@ 'word' => array('$in' => $words), 'updated' => array('$gte' => $date), ); - $cursor = scan_api_get_mongo('urls', 'url') + $cursor = scan_api_get_mongo('urls', 'url') // !V2r15 not converted -- commented out code ->find($query, $fields) ->sort(array('updated' => -1)); $i = 0; @@ -872,22 +772,22 @@ //$pagenumber = $paged ? _np_scan_analytics_get_page_number('urls', 'url', $limit, array('scan_id' => array('$in' => $scan_ids), 'category' => $category)) : 0; $pagenumber = 0; //@todo: fixme foreach ($scan_ids as $scan_id) { - $fields = array('hours.velocity' => 1, 'url_id' => 1); + $fields = array('velocity.hours' => 1, 'url_id' => 1); $query = array( - 'scan_id' => $scan_id, - 'category' => $category, + 'scan_id' => intval($scan_id), + 'category' => intval($category), ); - if ($cursor = scan_api_get_mongo('urls', 'url')) { + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan_id, category / velocity.hours:-1 paged query try { $results = $cursor ->find($query, $fields) - ->sort(array('hours.velocity' => -1)) + ->sort(array('velocity.hours' => -1)) ->skip($pagenumber * $limit) ->limit($limit) ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { $url_ids[$row['url_id']] = $row['url_id']; - $velocity = isset($row['hours']['velocity']) ? $row['hours']['velocity'] : 0; + $velocity = isset($row['velocity']['hours']) ?
$row['velocity']['hours'] : 0; if (!isset($order_data[$row['url_id']])) { $order_data[$row['url_id']] = $velocity; } Index: www/sites/all/modules/custom/np_scan/np_scan.module =================================================================== --- www/sites/all/modules/custom/np_scan/np_scan.module (revision 34222) +++ www/sites/all/modules/custom/np_scan/np_scan.module (working copy) @@ -437,7 +437,6 @@ // Refuse to update to a lower vid. db_query('UPDATE {scan_settings} SET active_vid = %d WHERE nid = %d AND active_vid < %d', $scan->vid, $scan->nid, $scan->vid); if (db_affected_rows()) { - np_scan_denorm($scan->scan_id, 1, 'active'); watchdog('np_scan', 'Scan id %scan_id for revision %vid promoted to active on node %nid.', array('%scan_id' => $scan->scan_id, '%vid' => $scan->vid, '%nid' => $scan->nid)); // Get any non archived/deleted scans older than the one being promoted to... $result = db_query("SELECT scan_id, nid, vid, archived FROM {scan} WHERE nid = %d AND scan_id < %d AND archived = 0", $scan->nid, $scan->scan_id); @@ -967,9 +966,6 @@ 'auto_unyaml' => false, )); - // Mark scan as inactive in mongo. - np_scan_denorm($scan_id, 0, 'active'); - // insert matching marker into DB for deletion worker to double check db_query("INSERT INTO {scan_delete} (scan_id) VALUES (%d)", $scan_id); @@ -1913,15 +1909,14 @@ */ function np_scan_denorm($scan_id, $new_value, $key = 'status') { $collections = array('scan', 'keyword', 'hashtag', 'location', 'retweet', 'url', 'scanurl'); - if ($key == 'active' && $new_value == 0) { - // Don't bother propogating the active status fully for something that's about - // to get deleted by terminator. - $collections = array('scan', 'scanurl'); + if ($key == 'active') { + assert('Someone tried to denorm scan.active!'); + return FALSE; } $key = "scan.". 
$key; $set = array('$set' => array($key => intval($new_value))); foreach ($collections as $collection) { - if ($cursor = scan_api_get_mongo('statistics', $collection)) { // denorm + if ($cursor = scan_api_get_mongo($collection)) { // V2r15 / * / scan_id / none (update np_scan_denorm) $cursor ->update(array('scan_id' => intval($scan_id)), $set, array('multiple' => TRUE)); } Index: www/sites/all/modules/custom/np_scan/np_scan.archive.inc =================================================================== --- www/sites/all/modules/custom/np_scan/np_scan.archive.inc (revision 34222) +++ www/sites/all/modules/custom/np_scan/np_scan.archive.inc (working copy) @@ -8,6 +8,7 @@ * @return TRUE on success, FALSE if details could not be found for this scan in scan and node DB tables */ function _np_scan_snapshot($scan_id, $title) { + $scan_id = intval($scan_id); $scan = db_fetch_object(db_query_range('SELECT r.timestamp, s.* FROM {scan} s INNER JOIN {node_revisions} r USING(vid) WHERE scan_id = %d', $scan_id, 0, 1)); if (!empty($scan)) { @@ -21,9 +22,9 @@ $url_stats->photo = 0; $url_stats->video = 0; try { - if ($cursor = scan_api_get_mongo('scan_stats', 'scanurl')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id $query = array( - 'scan_id' => intval($scan_id), + 'scan_id' => $scan_id, ); $fields = array('days.general' => 1, 'days.photo' => 1, 'days.video' => 1, 'updated' => 1); $cursor = $cursor->find($query, $fields) @@ -36,9 +37,9 @@ $last_update = scan_api_bucket_index('scanurl', 'days', $result['updated']->sec); $sum = 0; foreach ($result['days'][$key] as $index => $value) { + // @@@ V2 This is completely wrong and probably kills kittens.
// ignore velocity values and keys which have not been updated recently enough - // if $index is int(0), it will return true for in_array($index, array('velocity', 'velocity7') - if ((!is_int($index) && in_array($index, array('velocity', 'velocity7'))) || ($index > $last_update)) { + if ($index > $last_update) { continue; } else { @@ -89,7 +90,7 @@ * */ function _np_scan_archive_get_urls_cursor($scan_id, $category) { - if ($cursor = scan_api_get_mongo('scan_stats', 'url')) { + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan_id, category / count:-1 (limit 1000) $query = array( 'scan_id' => intval($scan_id), 'category' => intval($category), @@ -370,12 +371,10 @@ * @param $snapshot_id */ function np_scan_archive_stats($scan, $snapshot_id) { - - $insert_params = array(); $rows = 0; $max = array('count' => 0, 'start_time' => '2009-01-01'); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id / none (single) $query = array( 'scan_id' => intval($scan->scan_id), ); @@ -387,27 +386,22 @@ $last_update_start = $result['updated']->sec; $last_update_index = scan_api_bucket_index('scan', 'days', $result['updated']->sec); foreach ($result['days'] as $index => $count) { - if (!is_int($index) && in_array($index, array('velocity', 'prev_velocity', 'velocity7'))) { - continue; + if ($index <= $last_update_index) { + // anything which is <= last-update-index = last updated - (last-updated-index - bucket index) * 86400 + $start_time = gmdate('Y-m-d', $last_update_start - (($last_update_index - $index) * 86400)); } else { - if ($index <= $last_update_index) { - // anything which is <= last-update-index = last updated - (last-updated-index - bucket index) * 86400 - $start_time = gmdate('Y-m-d', $last_update_start - (($last_update_index - $index) * 86400)); - } - else { - // @TODO decide whether to drop the older stats or not, index in mongo here is 0-29 - // anything > last-update-index is the 
same as above + another 30 days or so? ... or just ignore these as too old? - $start_time = gmdate('Y-m-d', $last_update_start - ((($last_update_index - $index) + 30) * 86400)); - } - $insert_params[] = $snapshot_id; - $insert_params[] = $start_time; - $insert_params[] = $count; - ++$rows; - if ($count > $max['count']) { - $max = array('count' => $count, 'start_time' => $start_time); // @TODO fill in the start time - } + // @TODO decide whether to drop the older stats or not, index in mongo here is 0-29 + // anything > last-update-index is the same as above + another 30 days or so? ... or just ignore these as too old? + $start_time = gmdate('Y-m-d', $last_update_start - ((($last_update_index - $index) + 30) * 86400)); } + $insert_params[] = $snapshot_id; + $insert_params[] = $start_time; + $insert_params[] = $count; + ++$rows; + if ($count > $max['count']) { + $max = array('count' => $count, 'start_time' => $start_time); // @TODO fill in the start time + } } } } Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats.module =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats.module (revision 34222) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats.module (working copy) @@ -1,101 +0,0 @@ - t('Scan statistics'), - 'description' => t('View scan, keyword, hashtag, and url statistics'), - 'access callback' => 'user_access', - 'access arguments' => array('view scan stats'), - 'page callback' => 'np_scan_stats_proto_view', - 'page arguments' => array(3, 4, 5), - 'type' => MENU_NORMAL_ITEM, - 'file' => 'np_scan_stats.proto.inc', - ); - - return $items; -} - -/** - * Implementation of hook_cron - */ -function np_scan_stats_cron() { - $mc = dmemcache_object('cache'); - // aggregated stat maintanence, compressed stat maintanence and fresh stat maintanence should not run together - while (!$mc->add('np_scan_stats_maintanence', 1, FALSE, 1800)) { - sleep(120); - } - $last_run = 
variable_get('np_scan_stats_cron_run', 0); - $time = time(); - // dont run more than once - if (date('H') != date('H', $last_run) || $lastrun + 3600 < $time) { - include_once drupal_get_path('module', 'np_scan_stats') . '/np_scan_stats.cron.inc'; - $last_hour = mktime(date('H') - 1, 0, 0); - // cleanup - $cleanup_times = array(); - // stats - // scan_statistics, keyword_statistics, hashtag_statistics, retweet_statistics, url_statistics, url_uniq_statistics, location_statistics - foreach (array('scan', 'keyword', 'hashtag', 'retweet', 'url', 'url_uniq', 'location') as $table) { - $start = time(); - $mongo_date = new MongoDate($last_hour - 7 * 86400); - scan_api_get_mongo('statistics', $table)->remove(array('updated' => array('$lte' => $mongo_date))); // scan, keyword, hashtag, retweet, url, url_uniq, location - $cleanup_times[] = $table . ' ' . (time() - $start); - } - scan_api_set_active_shard('misc'); - // urls - $start = time(); - db_query("DELETE su FROM {source_urls} su INNER JOIN {urls} u ON su.url_id = u.id WHERE u.last_occurrence < '%s'", date('Y-m-d H:i:s', $last_hour - 30 * 86400)); - $cleanup_times[] = 'source_urls ' . (time() - $start); - $start = time(); - db_query("DELETE su FROM {scan_urls} su INNER JOIN {urls} u ON su.url_id = u.id WHERE u.last_occurrence < '%s'", date('Y-m-d H:i:s', $last_hour - 30 * 86400)); - $cleanup_times[] = 'scan_urls ' . (time() - $start); - - $start = time(); - $mongo_date = new MongoDate($last_hour - 30 * 86400); - scan_api_get_mongo('statistics', 'url')->remove(array('updated' => array('$lte' => $mongo_date))); - $cleanup_times[] = 'url_statistics_all_time ' . (time() - $start); - - $start = time(); - db_query("DELETE se FROM {scan_embeds} se INNER JOIN {urls} u ON se.url_id = u.id WHERE u.last_occurrence < '%s'", date('Y-m-d H:i:s', $last_hour - 30 * 86400)); - $cleanup_times[] = 'scan_embeds ' . 
(time() - $start); - $start = time(); - db_query("DELETE si FROM {scan_images} si INNER JOIN {urls} u ON si.url_id = u.id WHERE u.last_occurrence < '%s'", date('Y-m-d H:i:s', $last_hour - 30 * 86400)); - $cleanup_times[] = 'scan_images ' . (time() - $start); - $start = time(); - // make sure we don't delete a url that is blocked, as they are blocked by ID and we still need the url table entry to get that blocked url - db_query("DELETE u FROM {urls} u LEFT JOIN {group_blocked_urls} gbu ON u.id = gbu.url_id WHERE last_occurrence < '%s' AND gbu.url_id IS NULL", date('Y-m-d H:i:s', $last_hour - 30 * 86400)); - $cleanup_times[] = 'urls ' . (time() - $start); - scan_api_set_active_shard(); - watchdog('cron', 'np_scan_stat cleanup times in seconds: ' . implode("\n", $cleanup_times)); - variable_set('np_scan_stats_cron_run', $time); - } - $mc->delete('np_scan_stats_maintanence'); - if (date('d', variable_get('np_scan_views_24_cleanup', 0)) != date('d')) { - variable_set('np_scan_views_24_cleanup', time()); - db_query('UPDATE {node} SET np_views_24 = 0 WHERE np_views_24 != 0'); - db_query('UPDATE {node} SET np_views_widget_24 = 0 WHERE np_views_widget_24 != 0'); - } -} - -function np_scan_stats_perm() { - return array('view scan stats'); -} - -function np_scan_stats_theme() { - return array( - 'mongo_query' => array('arguments' => array('collection' => NULL, 'find' => NULL, 'fields' => array(), 'sort' => array(), 'limit' => 0)), - 'mongo_json' => array('arguments' => array('json' => array())), - 'mongo_short_datetime' => array('arguments' => array('dt' => NULL)), - 'mongo_datetime' => array('arguments' => array('dt' => NULL)), - 'mongo_scan_details' => array('arguments' => array('record' => array())), - 'mongo_word_details' => array('arguments' => array('collection' => 'keyword', 'record' => array())), - 'mongo_location_details' => array('arguments' => array('record' => array())), - 'mongo_url_details' => array('arguments' => array('record' => array())), - 'twitter_word' => 
array('arguments' => array('collection' => NULL, 'word' => NULL)), - ); -} Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats_mail.module =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats_mail.module (revision 34222) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats_mail.module (working copy) @@ -56,19 +56,28 @@ function np_scan_stats_mail_cron() { $mail_params = array('time' => time(), 'keyword' => array(), 'hashtag' => array(),); $collections = array('keyword' => 15, 'hashtag' => 10); + $query = array( + 'scan_id' => 0, + ); + $fields = array( + 'word' => 1, + 'velocity.mintues' => 1, + 'velocity.hours' => 1, + 'trending' => 1, + ); foreach ($collections as $collection => $default_length) { - $cursor = scan_api_get_mongo('statistics', $collection); // keyword, hashtag - if ($cursor) { + if ($cursor = scan_api_get_mongo($collection)) { // V2r15 / keyword, hashtag / scan_id=0 / trending:-1 try { $result = $cursor - ->find(array('scan_id' => 0)) + ->find($query, $fields) ->sort(array('trending' => -1)) ->limit(variable_get('np_scan_stats_mail_' . $collection . '_list_length', $default_length)) ->timeout(scan_api_get_mongo_timeout()); foreach($result as $document) { $mail_params[$collection][] = array( 'word' => $document['word'], - 'velocity' => $document['minutes']['velocity']? $document['minutes']['velocity'] : $document['hours']['velocity'], + // @@@ V2 This fallback is kinda stupid. + 'velocity' => $document['velocity']['minutes'] ? 
$document['velocity']['minutes'] : $document['velocity']['hours'], 'trending' => $document['trending'], ); } Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats.cron.inc =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats.cron.inc (revision 34222) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats.cron.inc (working copy) @@ -1,264 +0,0 @@ - 1, 'biday' => 2,) as $timeslice => $multiplier) { - db_query(" - UPDATE {scan_statistics_aggregated} dst - INNER JOIN {scan_statistics} src ON dst.scan_id = src.scan_id AND src.start_time = '%s' - SET dst.velocity_" . $timeslice . " = IF(dst.velocity_" . $timeslice . " > src.count, dst.velocity_" . $timeslice . " - src.count, 0) - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour - $multiplier * 86400), $time + $i); - } - $runtimes[] = time() - $start; - } - watchdog('cron', 'np_scan_stat cron scan runtimes: ' . implode(', ', $runtimes) . ' (seconds)'); - if ($db_change) { - db_set_active(); - } -} - -/** - * Maintain keyword/hashtag aggregated statistics table - */ -function np_scan_stats_cron_word($table_type, $last_hour = 0) { - if ($table_type != 'keyword' && $table_type != 'hashtag') { - return; - } - $db_change = FALSE; - if (!$last_hour) { - $last_hour = mktime(date('H') - 1, 0, 0); - $db_change = TRUE; - db_set_active('scan'); - } - // keyword_statistics_aggregated, hashtag_statistics_aggregated - $max = db_result(db_query('SELECT COUNT(*) FROM {' . $table_type . '_statistics_aggregated}')); - $run_length = variable_get('np_scan_stats_runlength_' . $table_type, ($table_type == 'keyword' ? 
25000 : 100000)); - $runs = ceil($max / $run_length); - $time = time(); - $runtimes = array(); - for ($i = 0; $i < $runs; $i++) { - $start = time(); - // flag the rows that needs to be updated (cant use limit with joined tables in update) - // keyword_statistics_aggregated, hashtag_statistics_aggregated - db_query(" - UPDATE {" . $table_type . "_statistics_aggregated} - SET timestamp = %d - WHERE timestamp < %d - LIMIT %d - ", $time + $i, $time, $run_length); - // last hour + age - // keyword_statistics_aggregated, hashtag_statistics_aggregated, keyword_statistics, hashtag_statistics - db_query(" - UPDATE {" . $table_type . "_statistics_aggregated} dst - LEFT JOIN {" . $table_type . "_statistics} src ON dst.scan_id = src.scan_id AND dst.word = src.word AND src.start_time = '%s' - SET dst.velocity_recent = IFNULL(src.count, 0), dst.age = dst.age + 1 - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour), $time + $i); - // remove out of interval ones - foreach (array('6hour' => 6,) as $timeslice => $multiplier) { - // keyword_statistics_aggregated, hashtag_statistics_aggregated, keyword_statistics, hashtag_statistics - db_query(" - UPDATE {" . $table_type . "_statistics_aggregated} dst - INNER JOIN {" . $table_type . "_statistics} src ON dst.scan_id = src.scan_id AND src.word = dst.word AND src.start_time = '%s' - SET dst.velocity_" . $timeslice . " = IF(dst.velocity_" . $timeslice . " > src.count, dst.velocity_" . $timeslice . " - src.count, 0) - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour - $multiplier * 3600), $time + $i); - } - // trending - // velocity_6hour / (velocity_month / 30 days) - // keyword_statistics_aggregated, hashtag_statistics_aggregated - db_query(" - UPDATE {" . $table_type . 
"_statistics_aggregated} - SET trending = IF(velocity_6hour < 18 OR velocity_month = 0 OR age = 0, 0, (velocity_6hour/6) / ( velocity_month / IF(age > 744, 744, age) )) - WHERE timestamp = %d - ", $time + $i); - $runtimes[] = time() - $start; - } - watchdog('cron', 'np_scan_stat cron ' . $table_type . ' runtimes: ' . implode(', ', $runtimes) . ' (seconds)'); - if ($db_change) { - db_set_active(); - } -} - -/** - * Maintain location aggregated statistics table - */ -function np_scan_stats_cron_location($last_hour = 0) { - $db_change = FALSE; - if (!$last_hour) { - $last_hour = mktime(date('H') - 1, 0, 0); - $db_change = TRUE; - db_set_active('scan'); - } - // location_statistics_aggregated - $max = db_result(db_query('SELECT COUNT(*) FROM {location_statistics_aggregated}')); - $run_length = variable_get('np_scan_stats_runlength_location', 100000); - $runs = ceil($max / $run_length); - $time = time(); - $runtimes = array(); - for ($i = 0; $i < $runs; $i++) { - $start = time(); - // flag the rows that needs to be updated (cant use limit with joined tables in update) - // keyword_statistics_aggregated, hashtag_statistics_aggregated - db_query(" - UPDATE {location_statistics_aggregated} - SET timestamp = %d - WHERE timestamp < %d - LIMIT %d - ", $time + $i, $time, $run_length); - // last hour + age - // location_statistics_aggregated, location_statistics - db_query(" - UPDATE {location_statistics_aggregated} dst - LEFT JOIN {location_statistics} src ON dst.scan_id = src.scan_id AND dst.location_id = src.location_id AND src.start_time = '%s' - SET dst.velocity_recent = IFNULL(src.count, 0), dst.age = dst.age + 1 - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour), $time + $i); - // remove out of interval ones - foreach (array('6hour' => 6, 'biday' => 48,) as $timeslice => $multiplier) { - // location_statistics_aggregated, location_statistics - db_query(" - UPDATE {location_statistics_aggregated} dst - INNER JOIN {location_statistics} src ON dst.scan_id = 
src.scan_id AND src.location_id = dst.location_id AND src.start_time = '%s' - SET dst.velocity_" . $timeslice . " = IF(dst.velocity_" . $timeslice . " > src.count, dst.velocity_" . $timeslice . " - src.count, 0) - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour - $multiplier * 3600), $time + $i); - } - // trending - // velocity_6hour / (velocity_month / 30 days) - // location_statistics_aggregated - db_query(" - UPDATE {location_statistics_aggregated} - SET trending = IF(velocity_6hour < 18 OR velocity_month = 0 OR age = 0, 0, (velocity_6hour/6) / ( velocity_month / IF(age > 744, 744, age) )) - WHERE timestamp = %d - ", $time + $i); - $runtimes[] = time() - $start; - } - watchdog('cron', 'np_scan_stat cron location runtimes: ' . implode(', ', $runtimes) . ' (seconds)'); - if ($db_change) { - db_set_active(); - } -} - -/** - * Maintain url aggregated statistics table - */ -function np_scan_stats_cron_url($last_hour = 0) { - $db_change = FALSE; - if (!$last_hour) { - $last_hour = mktime(date('H') - 1, 0, 0); - $db_change = TRUE; - db_set_active('scan'); - } - $max = db_result(db_query('SELECT COUNT(*) FROM {url_statistics_aggregated}')); - $run_length = variable_get('np_scan_stats_runlength_url', 100000); - $runs = ceil($max / $run_length); - $time = time(); - $runtimes = array(); - for ($i = 0; $i < $runs; $i++) { - $start = time(); - // flag the rows that needs to be updated (cant use limit with joined tables in update) - db_query(" - UPDATE {url_statistics_aggregated} - SET timestamp = %d - WHERE timestamp < %d - LIMIT %d - ", $time + $i, $time, $run_length); - // remove out of interval ones - db_query(" - UPDATE {url_statistics_aggregated} dst - INNER JOIN {url_statistics} src ON dst.scan_id = src.scan_id AND src.url_id = dst.url_id AND src.start_time = '%s' - SET dst.velocity = IF(dst.velocity > src.count, dst.velocity - src.count, 0), dst.cleanup = IF(dst.velocity > src.count, 0, 1) - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour - 6 
* 3600), $time + $i); - $runtimes[] = time() - $start; - } - watchdog('cron', 'np_scan_stat cron url runtimes: ' . implode(', ', $runtimes) . ' (seconds)'); - // cleanup - db_query("DELETE FROM {url_statistics_aggregated} WHERE cleanup = 1"); - if ($db_change) { - db_set_active(); - } -} - -/** - * Maintain Uniq url aggregated statistics table - */ -function np_scan_stats_cron_url_uniq($last_hour = 0) { - $db_change = FALSE; - if (!$last_hour) { - $last_hour = mktime(date('H') - 1, 0, 0); - $db_change = TRUE; - db_set_active('scan'); - } - $max = db_result(db_query('SELECT COUNT(*) FROM {url_uniq_statistics_aggregated}')); - $run_length = variable_get('np_scan_stats_runlength_url_unique', 100000); - $runs = ceil($max / $run_length); - $time = time(); - $runtimes = array(); - for ($i = 0; $i < $runs; $i++) { - $start = time(); - // flag the rows that needs to be updated (cant use limit with joined tables in update) - db_query(" - UPDATE {url_uniq_statistics_aggregated} - SET timestamp = %d - WHERE timestamp < %d - LIMIT %d - ", $time + $i, $time, $run_length); - // remove out of interval ones - foreach (array('day' => 1, 'biday' => 2,) as $timeslice => $multiplier) { - db_query(" - UPDATE {url_uniq_statistics_aggregated} dst - INNER JOIN {url_uniq_statistics} src ON dst.scan_id = src.scan_id AND src.start_time = '%s' - SET dst.velocity_" . $timeslice . " = IF(dst.velocity_" . $timeslice . " > src.general + src.photo + src.video, dst.velocity_" . $timeslice . " - src.general - src.photo - src.video, 0), - dst.general_" . $timeslice . " = IF(dst.general_" . $timeslice . " > src.general, dst.general_" . $timeslice . " - src.general, 0), - dst.photo_" . $timeslice . " = IF(dst.photo_" . $timeslice . " > src.photo, dst.photo_" . $timeslice . " - src.photo, 0), - dst.video_" . $timeslice . " = IF(dst.video_" . $timeslice . " > src.video, dst.video_" . $timeslice . 
" - src.video, 0) - WHERE dst.timestamp = %d - ", date('Y-m-d H:i:s', $last_hour - $multiplier * 86400), $time + $i); - } - $runtimes[] = time() - $start; - } - watchdog('cron', 'np_scan_stat cron uniq url runtimes: ' . implode(', ', $runtimes) . ' (seconds)'); - if ($db_change) { - db_set_active(); - } -} Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.css =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.css (revision 34222) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.css (working copy) @@ -1,30 +0,0 @@ -#proto-page th.title { width: 99% } -#proto-page th.word { min-width: 100px; max-width: 100px } -#proto-page th.location { min-width: 99%; max-width: 100px } -#proto-page th.url-link { min-width: 200px; max-width: 200px } -#proto-page th.url-title { min-width: 50% } -#proto-page th.id { min-width: 75px; max-width: 75px } -#proto-page th.updated { min-width: 100px; max-width: 100px } -#proto-page th.created { min-width: 50px; max-width: 50px } -#proto-page th.trending { min-width: 75px; max-width: 75px } -#proto-page th.velocity { min-width: 75px; max-width: 75px } -#proto-page th.prev_velocity { min-width: 75px; max-width: 75px } -#proto-page th.general { min-width: 75px; max-width: 75px } -#proto-page th.video { min-width: 75px; max-width: 75px } -#proto-page th.photo { min-width: 75px; max-width: 75px } -#proto-page th.category { min-width: 75px; max-width: 75px } -#proto-page th.count { min-width: 75px; max-width: 75px } -#proto-page th.source { min-width: 75px; max-width: 75px } -#proto-page h3 { padding: 10px 0 0 0; text-align: center} -#proto-page h4 { text-align: center } -#proto-page hr { padding: 10px 0 10px 0; clear: both } -#proto-page div#toc { line-height: 100% } -#proto-page div#toc > div.item-list > ul > li { float: left } -#proto-page div#option-form { clear: both } -#proto-page div#option-form form { margin-bottom: 0 } 
-#proto-page div#option-form div, #proto-page div#option-form > div, #proto-page div#option-form label { display: inline } -#proto-page div#help { clear: both; padding: 5px 0 0 0; line-height: 100%; font-size: 90% } -#proto-page div.scan-list { float: right; clear: both } -#proto-page div.scan-list-keyword, -#proto-page div.scan-list-hashtag, -#proto-page div.scan-list-retweet { float: right; clear: none } Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats.info =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats.info (revision 34222) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats.info (working copy) @@ -1,6 +0,0 @@ -; $Id: $ -name = Np Scan Stats -description = Scan Stats Cron / Admin functions -dependencies[] = scan_api -package = NP -core = 6.x Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.inc =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.inc (revision 34222) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.inc (working copy) @@ -1,985 +0,0 @@ - array()); - if (isset($_GET['limit'])) { - $options['query']['limit'] = $_GET['limit']; - } - if (isset($_GET['fields'])) { - $options['query']['fields'] = $_GET['fields']; - } - - $current_url = 'admin/reports/scan_stats/'; - $toc = array( - array( - 'data' => l('Scans', $current_url . 'scan', array_merge($options, array('fragment' => 'scan'))), - 'children' => array( - l('Most active online scans', $current_url . 'scan', array_merge($options, array('fragment' => 'scan_online'))), - l('Most active offline scans', $current_url . 'scan', array_merge($options, array('fragment' => 'scan_offline'))), - l('Most active scans by link', $current_url . 'scan', array_merge($options, array('fragment' => 'scan_bylink'))), - l('Top trending scans', $current_url . 
'scan', array_merge($options, array('fragment' => 'scan_trending'))), - ) - ), - array( - 'data' => l('Keywords', $current_url . 'keyword', array_merge($options, array('fragment' => 'keyword'))), - 'children' => array( - l('Most mentioned keywords', $current_url . 'keyword', array_merge($options, array('fragment' => 'keyword_velocity'))), - l('Top trending keywords', $current_url . 'keyword', array_merge($options, array('fragment' => 'keyword_trending'))), - ), - ), - array( - 'data' => l('Hashtags', $current_url . 'hashtag', array_merge($options, array('fragment' => 'hashtag'))), - 'children' => array( - l('Most mentioned hashtags', $current_url . 'hashtag', array_merge($options, array('fragment' => 'hashtag_velocity'))), - l('Top trending hashtags', $current_url . 'hashtag', array_merge($options, array('fragment' => 'hashtag_trending'))), - ), - ), -/* array( - 'data' => l('Retweets', $current_url . 'retweet', array_merge($options, array('fragment' => 'retweet'))), - 'children' => array( - l('Most mentioned retweets', $current_url . 'retweet', array_merge($options, array('fragment' => 'retweet_velocity'))), - l('Top trending retweets', $current_url . 'retweet', array_merge($options, array('fragment' => 'retweet_trending'))), - ), - ), */ - array( - 'data' => l('Links', $current_url . 'link', array_merge($options, array('fragment' => 'link'))), - 'children' => array( - l('Top links', $current_url . 'link', array_merge($options, array('fragment' => 'link_general'))), - l('Top photos', $current_url . 'link', array_merge($options, array('fragment' => 'link_photo'))), - l('Top videos', $current_url . 'link', array_merge($options, array('fragment' => 'link_video'))), - l('Top links by count', $current_url . 'link', array_merge($options, array('fragment' => 'link_count_general'))), - l('Top photos by count', $current_url . 'link', array_merge($options, array('fragment' => 'link_count_photo'))), - l('Top videos by count', $current_url . 
'link', array_merge($options, array('fragment' => 'link_count_video'))), - ), - ), - array( - 'data' => l('Locations', $current_url . 'location', array_merge($options, array('fragment' => 'location'))), - 'children' => array( - l('Most used locations', $current_url . 'location', array_merge($options, array('fragment' => 'location_velocity'))), - l('Top trending locations', $current_url . 'location', array_merge($options, array('fragment' => 'location_trending'))), - ), - ), - ); - $out = '
' . theme('item_list', $toc) . '
'; - - if (!empty($heading)) { - $out .= '
' . drupal_get_form('np_scan_stats_proto_options_form') . '
'; - } - - $help = array(); - $help[] = t('Trending measures current momentum usng the normalized ratio of the current velocity (5 minute increments) to the long term velocity (30 day).'); - $help[] .= t('See Mongo Schema for more help.', array('@wiki-url' => url('https://apps.d2.nowpublic.com/trac/wiki/Scan/MongoSchema', array('absolute' => TRUE)))); - $out .= '
' . theme('item_list', $help) . '
'; - - $query_active_and_online = array('scan_id' => array('$gt' => 0), 'scan.status' => 0, 'scan.active' => 1); - $query_no_scan = array('scan_id' => 0); - $query_scan = array('scan_id' => array('$gt' => 0)); - - if ($heading == 'scan') { - if (empty($detail)) { - $out .= '

' . t('Scans') . '

'; - $out .= '

' . t('Most active online scans') . '

'; - $out .= _np_scan_stats_scan_top_velocity($query_active_and_online); - $out .= '

' . t('Most active offline scans') . '

'; - $out .= _np_scan_stats_scan_top_velocity(array('scan.status' => 1, 'scan.active' => 1)); - $out .= '

' . t('Most active scans by link') . '

'; - $out .= _np_scan_stats_top_unique_links($query_active_and_online); - $out .= '

' . t('Top trending scans') . '

'; - $out .= _np_scan_stats_word_top_sort('scan', array('trending' => -1), $query_scan); - } - else { - $out .= '
'; - $out .= _np_scan_stats_scan_details_header($detail); - $out .= _np_scan_stats_scan_title($detail, false); - $out .= _np_scan_stats_scan_details($detail); - } - } - else if ($heading == 'keyword') { - if (empty($detail)) { - $out .= '

' . t('Keywords') . '

'; - $out .= '

' . t('Most mentioned keywords') . '

'; - $out .= _np_scan_stats_word_top_velocity('keyword', $query_no_scan); - $out .= '

' . t('Top trending keywords') . '

'; - $out .= _np_scan_stats_word_top_sort('keyword', array('trending' => -1), $query_no_scan); - } - else { - $out .= '
'; - $out .= _np_scan_stats_word_details_header($heading, $detail); - $out .= _np_scan_stats_scan_title($scan_id); - $out .= '

' . t('Keyword: %word', array('%word' => $detail)) . '

'; - $out .= _np_scan_stats_word_details('keyword', $detail, $scan_id); - } - } - else if ($heading == 'hashtag') { - if (empty($detail)) { - $out .= '

' . t('Hashtag') . '

'; - $out .= '

' . t('Most mentioned hashtags') . '

'; - $out .= _np_scan_stats_word_top_velocity('hashtag', $query_no_scan); - $out .= '

' . t('Top trending hashtags') . '

'; - $out .= _np_scan_stats_word_top_sort('hashtag', array('trending' => -1), $query_no_scan); - } - else { - $out .= '
'; - $out .= _np_scan_stats_word_details_header($heading, $detail); - $out .= _np_scan_stats_scan_title($scan_id); - $out .= '

' . t('Hashtag: #%word', array('%word' => $detail)) . '

'; - $out .= _np_scan_stats_word_details('hashtag', $detail, $scan_id); - } - } - else if ($heading == 'retweet') { -/* if (empty($detail)) { - $out .= '

' . t('Retweet') . '

'; - $out .= '

' . t('Most mentioned retweets') . '

'; - $out .= _np_scan_stats_word_top_velocity('retweet', $query_no_scan); - $out .= '

' . t('Top trending retweets') . '

'; - $out .= _np_scan_stats_word_top_sort('retweet', array('trending' => -1), $query_no_scan); - } - else { */ - $out .= '
'; - $out .= _np_scan_stats_word_details_header($heading, $detail); - $out .= _np_scan_stats_scan_title($scan_id); - $out .= '

' . t('Retweet: %word', array('%word' => $detail)) . '

'; - $out .= _np_scan_stats_word_details('retweet', $detail, $scan_id); -// } - } - else if ($heading == 'location') { - if (empty($detail)) { - $out .= '

' . t('Locations') . '

'; - $out .= '

' . t('Most used locations') . '

'; - $out .= _np_scan_stats_word_top_velocity('location', $query_no_scan); - $out .= '

' . t('Top trending locations') . '

'; - $out .= _np_scan_stats_word_top_sort('location', array('trending' => -1), $query_no_scan); - } - else { - $name = db_result(db_query("SELECT name FROM {geonames} WHERE geonameid = %d", $detail)); - $out .= '
'; - $out .= _np_scan_stats_word_details_header($heading, intval($detail)); - $out .= _np_scan_stats_scan_title($scan_id); - $out .= '

' . t('Location: %location_id: %title', array('%location_id' => $detail, '%title' => $name)) . '

'; - $out .= _np_scan_stats_location_details($detail, $scan_id); - } - } - else if ($heading == 'link') { - $out .= '

' . t('Links') . '

'; - - $categories = array( - "general" => 1, - "photo" => 2, - "video" => 3, - ); - foreach ($categories as $category_name => $category_id) { - $out .= '

' . t('Top %category links by velocity', array('%category' => $category_name)) . '

'; - $out .= _np_scan_stats_top_links(array('scan_id' => 0, 'category' => $category_id), array('hours.velocity' => -1)); - $out .= '

' . t('Top %category links by count', array('%category' => $category_name)) . '

'; - $out .= _np_scan_stats_top_links(array('scan_id' => 0, 'category' => $category_id), array('count' => -1)); - } - } - else if ($heading == 'url' && !empty($detail)) { - $title = db_result(db_query("SELECT title FROM {urls} WHERE id = %d", $detail)); - $out .= '
'; -// $out .= _np_scan_stats_word_details_header($heading, intval($detail)); - $out .= _np_scan_stats_scan_title($scan_id); - $out .= '

' . t('Url: %url_id: %title', array('%url_id' => $detail, '%title' => $title)) . '

'; - $out .= _np_scan_stats_url_details($detail, $scan_id); - } - - db_set_active(); - return '
' . $out . '
'; -} - -function _np_scan_stats_scan_title($scan_id, $link = true) { - if ($scan_id) { - $title = db_result(db_query("SELECT title FROM {scan} INNER JOIN {node_revisions} using(vid) WHERE scan_id = %d", $scan_id)); - $out = t('Scan: %scan_id: %title', array('%scan_id' => $scan_id, '%title' => $title)); - return '

' . ($link ? l($out, 'admin/reports/scan_stats/scan/' . $scan_id, array('html' => 1)) : $out) . '

'; - } -} - -function _np_scan_stats_scan_top_velocity($query = array()) { - $output = array(); - if (!($cursor = scan_api_get_mongo('scan_stats', 'scan'))) { - _np_scan_stats_mongo_error(); - } - else { - $buckets = array( - 'hour' => array('bucket' => 'minutes', 'velocity' => 'velocity', 'prev_velocity' => 1), - 'day' => array('bucket' => 'hours', 'velocity' => 'velocity'), - 'biday' => array('bucket' => 'hours', 'velocity' => 'velocity48', 'prev_velocity' => 1), - 'week' => array('bucket' => 'days', 'velocity' => 'velocity7'), - 'month' => array('bucket' => 'days', 'velocity' => 'velocity', 'prev_velocity' => 1), - ); - $caption = array(); - foreach ($buckets as $timeslice => $mongo) { - $output[$timeslice] = array(); - $bucket = $mongo['bucket']; - $velocity_field = $bucket . '.' . $mongo['velocity']; - $fields = array('scan_id' => 1, $velocity_field => 1, 'trending' => 1, 'updated' => 1, 'created' => 1); - if (isset($mongo['prev_velocity'])) { - $fields[$bucket . '.prev_velocity'] = 1; - } - $bucket_query = $query; - $bucket_query[$velocity_field] = array('$gt' => 0); - $hint = array($velocity_field => -1); - $sort = array($velocity_field => -1); - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - $caption[$timeslice] = theme('mongo_query', 'scan', $bucket_query, $fields, $sort, $limit); - try { - $results = $cursor->find($bucket_query, $fields) - ->sort($sort) - ->limit($limit) - ->hint($hint) - ->timeout(scan_api_get_mongo_timeout()) - ; - foreach ($results as $row) { - $title = db_result(db_query("SELECT title FROM {scan} INNER JOIN {node_revisions} using(vid) WHERE scan_id = %d", $row['scan_id'])); - $output[$timeslice][] = array( - l($row['scan_id'], 'admin/reports/scan_stats/scan/'. $row['scan_id']), - theme('mongo_short_datetime', $row['created']), - theme('mongo_datetime', $row['updated']), - $title ? l($title, 'admin/reports/scan_stats/scan/' . $row['scan_id']) : '-', - isset($row['trending']) ? 
round($row['trending'], STATS_TRENDING_PRECISION) : '-', - round($row[$bucket][$mongo['velocity']], STATS_VELOCITY_PRECISION), - isset($mongo['prev_velocity']) ? round($row[$bucket]['prev_velocity'], STATS_VELOCITY_PRECISION) : '-', - ); - } - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - } - - $headers = array( - array('data' => t('id'), 'class' => 'id'), - array('data' => t('Created'), 'class' => 'created'), - array('data' => t('Updated'), 'class' => 'updated'), - array('data' => t('Title'), 'class' => 'title'), - array('data' => t('Trending'), 'class' => 'trending'), - array('data' => t('Velocity'), 'class' => 'velocity'), - array('data' => t('Prev'), 'class' => 'prev_velocity'), - ); - - $out = ''; - $out .= '

' . t('Last 60 Minutes') . '

'; - $out .= theme('table', $headers, $output['hour'], array(), $caption['hour']); - - $out .= '

' . t('Last 24 Hours') . '

'; - $out .= theme('table', $headers, $output['day'], array(), $caption['day']); - - $out .= '

' . t('Last 48 Hours') . '

'; - $out .= theme('table', $headers, $output['biday'], array(), $caption['biday']); - - $out .= '

' . t('Last 7 Days') . '

'; - $out .= theme('table', $headers, $output['week'], array(), $caption['week']); - - $out .= '

' . t('Last 30 Days') . '

'; - $out .= theme('table', $headers, $output['month'], array(), $caption['month']); - - return $out; -} - -function _np_scan_stats_top_unique_links($query = array()) { - if (!($cursor = scan_api_get_mongo('scan_stats', 'scanurl'))) { - _np_scan_stats_mongo_error(); - } - else { - // most active scans by number of unique links - $buckets = array( - 'day' => array('bucket' => 'hours', 'velocity' => 'velocity24'), - 'biday' => array('bucket' => 'hours', 'velocity' => 'velocity'), - 'week' => array('bucket' => 'days', 'velocity' => 'velocity7'), - 'month' => array('bucket' => 'days', 'velocity' => 'velocity'), - ); - $output = array(); - $caption = array(); - foreach ($buckets as $timeslice => $mongo) { - $output[$timeslice] = array(); - $bucket = $mongo['bucket']; - $velocity_field = $bucket . '.' . $mongo['velocity']; - $fields = array( - 'scan_id' => 1, - $velocity_field => 1, - 'updated' => 1, - 'created' => 1, - $bucket . '.general.' . $mongo['velocity'] => 1, - $bucket . '.video.' . $mongo['velocity'] => 1, - $bucket . '.photo.' . $mongo['velocity'] => 1, - ); - $bucket_query = $query; - $bucket_query[$velocity_field] = array('$gt' => 0); - $sort = array($velocity_field => -1); - $hint = array('scan_id' => 1, $velocity_field => -1); - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - $caption[$timeslice] = theme('mongo_query', 'scanurl', $bucket_query, $fields, $sort, $limit); - try { - $results = $cursor - ->find($bucket_query, $fields) - ->sort($sort) - ->limit($limit) - ->hint($hint) - ->timeout(scan_api_get_mongo_timeout()) - ; - foreach ($results as $row) { - $title = db_result(db_query("SELECT title FROM {scan} INNER JOIN {node_revisions} using(vid) WHERE scan_id = %d", $row['scan_id'])); - $output[$timeslice][] = array( - l($row['scan_id'], 'admin/reports/scan_stats/scan/' . $row['scan_id']), - theme('mongo_short_datetime', $row['created']), - theme('mongo_datetime', $row['updated']), - $title ? l($title, 'admin/reports/scan_stats/scan/' . 
$row['scan_id']) : '-', - round($row[$bucket][$mongo['velocity']], STATS_VELOCITY_PRECISION), - isset($row[$bucket]['general'][$mongo['velocity']]) ? round($row[$bucket]['general'][$mongo['velocity']], STATS_VELOCITY_PRECISION) : '-', - isset($row[$bucket]['video'][$mongo['velocity']]) ? round($row[$bucket]['video'][$mongo['velocity']], STATS_VELOCITY_PRECISION) : '-', - isset($row[$bucket]['photo'][$mongo['velocity']]) ? round($row[$bucket]['photo'][$mongo['velocity']], STATS_VELOCITY_PRECISION) : '-', - ); - } - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - return; - } - } - } - - $headers = array( - array('data' => t('id'), 'class' => 'id'), - array('data' => t('Created'), 'class' => 'created'), - array('data' => t('Updated'), 'class' => 'updated'), - array('data' => t('Title'), 'class' => 'title'), - array('data' => t('Velocity'), 'class' => 'velocity'), - array('data' => t('General'), 'class' => 'general'), - array('data' => t('Video'), 'class' => 'video'), - array('data' => t('Photo'), 'class' => 'photo'), - ); - - $out = ''; - $out .= '

' . t('Last 24 Hours') . '

'; - $out .= theme('table', $headers, $output['day'], array(), $caption['day']); - - $out .= '

' . t('Last 48 Hours') . '

'; - $out .= theme('table', $headers, $output['biday'], array(), $caption['biday']); - - $out .= '

' . t('Last 7 Days') . '

'; - $out .= theme('table', $headers, $output['week'], array(), $caption['week']); - - $out .= '

' . t('Last 30 Days') . '

'; - $out .= theme('table', $headers, $output['month'], array(), $caption['month']); - - return $out; -} - -function _np_scan_stats_top_links($query = array(), $sort = array()) { - $output = array(); - if (!($cursor = scan_api_get_mongo('scan_stats', 'url'))) { - _np_scan_stats_mongo_error(); - $caption = theme('mongo_query', 'url', array(), array(), array(), LIMIT); - } - else { - $fields = array( - 'category' => 1, - 'url_id' => 1, - 'hours.velocity' => 1, - 'count' => 1, - 'updated' => 1, - 'created' => 1, - ); - if (isset($sort['hours.velocity'])) { - $query['hours.velocity'] = array('$gt' => 0); - $hint = array('scan_id' => 1, 'category' => 1, 'hours.velocity' => -1); - } - else { - $hint = array('scan_id' => 1, 'category' => 1, 'count' => -1); - } - // Add category to query so we can use the index - if (!isset($query['category'])) { - $query['category'] = array('$in' => array(1, 2, 3)); - } - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - try { - $results = $cursor->find($query, $fields) - ->sort($sort) - ->limit($limit) - ->hint($hint) - ->timeout(scan_api_get_mongo_timeout()) - ; - $caption = theme('mongo_query', 'url', $query, $fields, $sort, $limit); - foreach ($results as $row) { - db_set_active('misc'); - $url = db_fetch_array(db_query(" - SELECT u.title, u.resolved, - si.thumb_path, - se.domain, se.video_id - FROM {urls} u - LEFT JOIN {scan_images} si ON si.url_id = u.id - LEFT JOIN {scan_embeds} se ON se.url_id = u.id - WHERE u.id = %d - ", $row['url_id'])); - $title = empty($url['title']) ? '-' : $url['title']; - if ($row['category'] == 2 && !empty($url['thumb_path'])) { - $path = basename($url['thumb_path']); - $full_path = 'http://media.scan.nowpublic.com/'. substr($path, 0, 1) . '/' . substr($path, 1, 1) . substr($path, 2, 1) . '/'. 
$path; - $title = theme('image', $full_path, $title, $title, NULL, FALSE); - } - elseif ($row['category'] == 3 && !empty($url['video_id'])) { - $title = _api_embed_code($url['domain'], $url['video_id'], 280, 200); - } - if (!empty($url['resolved'])) { - $url['host'] = parse_url($url['resolved'], PHP_URL_HOST); - } - db_set_active(); - $output[] = array( - l($row['url_id'], 'admin/reports/scan_stats/url/' . $row['url_id']), - theme('mongo_short_datetime', $row['created']), - theme('mongo_datetime', $row['updated']), - $title, - empty($url['resolved']) ? '-' : l($url['host'], $url['resolved']), - round($row['hours']['velocity'], STATS_VELOCITY_PRECISION), - $row['count'], - ); - } - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - - $headers = array( - array('data' => t('Id'), 'class' => 'id'), - array('data' => t('Created'), 'class' => 'created'), - array('data' => t('Updated'), 'class' => 'updated'), - array('data' => t('Title'), 'class' => 'url-title'), - array('data' => t('Link'), 'class' => 'url-link'), - array('data' => t('Velocity'), 'class' => 'velocity'), - array('data' => t('Count'), 'class' => 'count'), - ); - - return theme('table', $headers, $output, array(), $caption); -} - -function _np_scan_stats_scan_details($scan_id) { - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { - $query = array('scan_id' => intval($scan_id)); - try { - $results = $cursor - ->find($query) - ->timeout(scan_api_get_mongo_timeout()) - ; - return theme('mongo_scan_details', _np_scan_stats_get_record('scan', $results)); - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - else { - _np_scan_stats_mongo_error(); - } -} - -function _np_scan_stats_word_details_header($collection, $value) { - if (!($cursor = scan_api_get_mongo('scan_stats', $collection))) { // keyword, hashtag, keyword, location, (url -- disabled) - _np_scan_stats_mongo_error(); - return; - } - $collections = array( - 
'keyword' => 'word', - 'hashtag' => 'word', - 'retweet' => 'word', - 'location' => 'location_id', - 'url' => 'url_id', - ); - $query = array($collections[$collection] => $value, 'scan_id' => array('$gt' => 0)); - $hint = array('scan_id' => 1, $collections[$collection] => 1, 'hours.velocity' => -1); - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - try { - $results = $cursor - ->find($query, array('scan_id' => 1, 'hours.velocity' => 1)) - ->sort(array('hours.velocity' => -1)) - ->hint($hint) - ->limit($limit) - ->timeout(scan_api_get_mongo_timeout()) - ; - $rows = array(); - foreach ($results as $row) { - $title = db_result(db_query("SELECT title FROM {scan} INNER JOIN {node_revisions} using(vid) WHERE scan_id = %d", $row['scan_id'])); - if (strlen($title) > 40) { - $title = substr($title, 0, 40) . "..."; - } - $rows[] = round($row['hours']['velocity'], STATS_VELOCITY_PRECISION) . ': ' . l($title ? $title : $row['scan_id'], 'admin/reports/scan_stats/scan/' . $row['scan_id']); - } - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - if ($rows) { - return '
' . theme('item_list', $rows, 'scans', 'ol') . '
'; - } -} - -function _np_scan_stats_scan_details_header($scan_id) { - $query = array('scan_id' => intval($scan_id)); - $collections = array( - 'keyword' => 'word', - 'hashtag' => 'word', - 'retweet' => 'word', - 'location' => 'location_id', - 'url' => 'url_id', - ); - $toc = array(); - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - foreach ($collections as $collection => $field) { - if ($cursor = scan_api_get_mongo('scan_stats', $collection)) { // keyword, hashtag, retweet, location, url - try { - $results = $cursor - ->find($query, array($field => 1, 'hours.velocity' => 1)) - ->sort(array('hours.velocity' => -1)) - ->limit($limit) - ->timeout(scan_api_get_mongo_timeout()) - ; - $rows = array(); - foreach ($results as $row) { - $title = $row[$field]; - if ($collection == 'location') { - if ($name = db_result(db_query("SELECT name FROM {geonames} WHERE geonameid = %d", $row[$field]))) { - $title = $name . ' (' . $row[$field] . ')'; - } - } - else if ($collection == 'url') { - if ($title = db_result(db_query("SELECT title FROM {urls} WHERE id = %d", $row[$field]))) { - $title = preg_replace('/^YouTube\s+-\s+/', '', $title); - if (strlen($title) > 40) { - $title = substr($title, 0, 40) . "..."; - } - $title .= ' (' . $row[$field] . ')'; - } - } - $rows[] = round($row['hours']['velocity'], STATS_VELOCITY_PRECISION) . ': ' . l($title, 'admin/reports/scan_stats/'. $collection . '/' . $row[$field] . '/' . $scan_id); - } - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - else { - _np_scan_stats_mongo_error(); - } - if ($rows) { - return '
' . theme('item_list', $rows, $collection . 's', 'ol') . '
'; - } - } -} - -function _np_scan_stats_word_top_velocity($collection, $query = array()) { - $output = array('hour' => array(), 'day' => array(), 'month' => array()); - $key = $collection == 'location' ? 'location_id' : 'word'; - - if (!($cursor = scan_api_get_mongo('scan_stats', $collection))) { // keyword, hashtag, (retweet -- disabled), location - _np_scan_stats_mongo_error(); - $caption_title = theme('mongo_query', $collection, array(), array(), array(), LIMIT); - $caption = array('hour' => $caption_title, 'day' => $caption_title, 'month' => $caption_title); - } - else { - $buckets = array( - 'hour' => array('bucket' => 'minutes', 'velocity' => 'velocity'), - 'day' => array('bucket' => 'hours', 'velocity' => 'velocity'), - 'month' => array('bucket' => 'days', 'velocity' => 'velocity'), - ); - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - foreach ($buckets as $timeslice => $mongo) { - $bucket = $mongo['bucket']; - $velocity_field = $bucket . '.' . $mongo['velocity']; - $bucket_query = $query; - $bucket_query[$velocity_field] = array('$gt' => 0); - $hint = array('scan_id' => 1, $velocity_field => -1); - $sort = array($velocity_field => -1); - $fields = array($velocity_field => 1, 'trending' => 1, 'created' => 1, 'updated' => 1, $bucket . '.prev_velocity', $key => 1); - $caption[$timeslice] = theme('mongo_query', $collection, $bucket_query, $fields, $sort, $limit); - try { - $results = $cursor - ->find($bucket_query, $fields) - ->sort($sort) - ->limit($limit) - ->hint($hint) - ->timeout(scan_api_get_mongo_timeout()) - ; - foreach ($results as $row) { - if ($collection == 'location') { - $name = db_result(db_query("SELECT name FROM {geonames} WHERE geonameid = %d", $row[$key])); - $key_link = l($row[$key], 'admin/reports/scan_stats/location/' . $row[$key]); - $key_value = empty($name) ? $key_link : l($name, 'admin/reports/scan_stats/location/' . $row[$key]) . ' (' . $key_link . 
')'; - } - else { - $key_value = theme('twitter_word', $collection, $row[$key]); - } - $output_row = array( - theme('mongo_short_datetime', $row['created']), - theme('mongo_datetime', $row['updated']), - $key_value, - isset($row['trending']) ? round($row['trending'], STATS_TRENDING_PRECISION) : '-', - round($row[$bucket][$mongo['velocity']], STATS_VELOCITY_PRECISION), - isset($mongo['prev_velocity']) ? round($row[$bucket]['prev_velocity'], STATS_VELOCITY_PRECISION) : '-', - ); - if ($collection != 'location') { - $whitelist = db_result(db_query("SELECT IFNULL(type, -1) as whitelist FROM {keyword_whitelist} WHERE word = '%s'", $row['word'])); - $output_row[] = $whitelist == -1 ? "NER" : (($whitelist == 5) ? "whitelist-user" : "whitelist"); - } - $output[$timeslice][] = $output_row; - } - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - } - - $headers = array( - array('data' => t('Created'), 'class' => 'created'), - array('data' => t('Updated'), 'class' => 'updated'), - array('data' => $collection == 'location' ? t('Location') : t('Word'), 'class' => $collection == 'location' ? 'location' : 'word'), - array('data' => t('Trending'), 'class' => 'trending'), - array('data' => t('Velocity'), 'class' => 'velocity'), - array('data' => t('Prev'), 'class' => 'prev_velocity'), - ); - if ($collection != 'location') { - $headers[] = array('data' => t('Source'), 'class' => 'source'); - } - - $out = ''; - $out .= '

' . t('Last 60 Minutes') . '

'; - $out .= theme('table', $headers, $output['hour'], array(), $caption['hour']); - - $out .= '

' . t('Last 24 Hours') . '

'; - $out .= theme('table', $headers, $output['day'], array(), $caption['day']); - - $out .= '

' . t('Last 30 Days') . '

'; - $out .= theme('table', $headers, $output['month'], array(), $caption['month']); - - return $out; -} - -function _np_scan_stats_word_top_sort($collection, $sort, $query = array()) { - $output = array(); - $keys = array('location' => 'location_id', 'scan' => 'scan_id'); - $key = isset($keys[$collection]) ? $keys[$collection] : 'word'; - $sort_keys = array_keys($sort); - $sort_key = $sort_keys[0]; - - if (!($cursor = scan_api_get_mongo('scan_stats', $collection))) { // scan, keyword, hashtag, (retweet -- disabled), location - _np_scan_stats_mongo_error(); - $caption = theme('mongo_query', $collection, array(), array(), array(), LIMIT); - } - else { - $fields = array('trending' => 1, 'created' => 1, 'updated' => 1, 'minutes.velocity' => 1, 'minutes.prev_velocity' => 1, 'hours.velocity' => 1, 'hours.prev_velocity' => 1, 'days.velocity' => 1, 'days.prev_velocity' => 1, $key => 1); - $hint = ($collection == 'scan') ? array() : array('scan_id' => 1); - $hint = array_merge($hint, $sort); - $limit = isset($_GET['limit']) ? $_GET['limit'] : LIMIT; - $caption = theme('mongo_query', $collection, $query, $fields, $sort, $limit); - try { - $results = $cursor - ->find($query, $fields) - ->sort($sort) - ->limit($limit) - ->hint($hint) - ->timeout(scan_api_get_mongo_timeout()) - ; - foreach ($results as $row) { - if ($collection == 'location') { - $name = db_result(db_query("SELECT name FROM {geonames} WHERE geonameid = %d", $row[$key])); - $path = 'admin/reports/scan_stats/location/' . $row[$key]; - $key_path = l($row[$key], $path); - $key_value = empty($name) ? $key_path : l($name, $path) . ' (' . $key_path . ')'; - } - else if ($collection == 'scan') { - $title = db_result(db_query("SELECT title FROM {scan} INNER JOIN {node_revisions} using(vid) WHERE scan_id = %d", $row[$key])); - $key_value = l($title, 'admin/reports/scan_stats/scan/' . 
$row[$key]); - } - else { - $key_value = theme('twitter_word', $collection, $row[$key]); - } - $output_row = array( - theme('mongo_short_datetime', $row['created']), - theme('mongo_datetime', $row['updated']), - $key_value, - isset($row['trending']) ? round($row['trending'], STATS_TRENDING_PRECISION) : '-', - isset($row['minutes']['velocity']) ? round($row['minutes']['velocity'], STATS_VELOCITY_PRECISION) : '-', - isset($row['minutes']['prev_velocity']) ? round($row['minutes']['prev_velocity'], STATS_VELOCITY_PRECISION) : '-', - isset($row['hours']['velocity']) ? round($row['hours']['velocity'], STATS_VELOCITY_PRECISION) : '-', - isset($row['hours']['prev_velocity']) ? round($row['hours']['prev_velocity'], STATS_VELOCITY_PRECISION) : '-', - isset($row['days']['velocity']) ? round($row['days']['velocity'], STATS_VELOCITY_PRECISION) : '-', - isset($row['days']['prev_velocity']) ? round($row['days']['prev_velocity'], STATS_VELOCITY_PRECISION) : '-', - ); - if ($key == 'word') { - $whitelist = db_result(db_query("SELECT IFNULL(type, -1) as whitelist FROM {keyword_whitelist} WHERE word = '%s'", $row['word'])); - $output_row[] = $whitelist == -1 ? "NER" : (($whitelist == 5) ? "whitelist-user" : "whitelist"); - } - if (floatval($row[$sort_key]) > 0) { - $output[] = $output_row; - } - } - } - catch (MongoCursorTimeoutException $e) { - } - } - - $headers = array( - array('data' => t('Created'), 'class' => 'created'), - array('data' => t('Updated'), 'class' => 'updated'), - array('data' => $collection == 'location' ? t('Location') : t('Word'), 'class' => $collection == 'location' ? 
'location' : 'word'), - array('data' => t('Trending'), 'class' => 'trending'), - array('data' => t('Velocity hr'), 'class' => 'velocity'), - array('data' => t('Prev hr'), 'class' => 'prev_velocity'), - array('data' => t('Velocity day'), 'class' => 'velocity'), - array('data' => t('Prev day'), 'class' => 'prev_velocity'), - array('data' => t('Velocity mon'), 'class' => 'velocity'), - array('data' => t('Prev mon'), 'class' => 'prev_velocity'), - ); - if ($key == 'word') { - $headers[] = array('data' => t('Source'), 'class' => 'source'); - } - - return theme('table', $headers, $output, array(), $caption); -} - -function _np_scan_stats_word_details($collection, $word, $scan_id) { - if ($cursor = scan_api_get_mongo('scan_stats', $collection)) { // keyword, hashtag, retweet - $query = array('scan_id' => empty($scan_id) ? 0 : intval($scan_id), 'word' => $word); - try { - $results = $cursor - ->find($query) - ->timeout(scan_api_get_mongo_timeout()) - ; - return theme('mongo_word_details', $collection, _np_scan_stats_get_record($collection, $results)); - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - else { - _np_scan_stats_mongo_error(); - } -} - -function _np_scan_stats_location_details($location_id, $scan_id) { - if ($cursor = scan_api_get_mongo('scan_stats', 'location')) { - $query = array('scan_id' => empty($scan_id) ? 0 : intval($scan_id), 'location_id' => intval($location_id)); - try { - $results = $cursor - ->find($query) - ->timeout(scan_api_get_mongo_timeout()) - ; - return theme('mongo_location_details', _np_scan_stats_get_record('location', $results)); - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - else { - _np_scan_stats_mongo_error(); - } -} - -function _np_scan_stats_url_details($url_id, $scan_id) { - if ($cursor = scan_api_get_mongo('scan_stats', 'url')) { - $query = array('scan_id' => empty($scan_id) ? 
0 : intval($scan_id), 'url_id' => intval($url_id)); - $hint = array('scan_id' => 1, 'url_id' => 1, 'hours.velocity' => -1); - try { - $results = $cursor - ->find($query) - ->hint($hint) - ->timeout(scan_api_get_mongo_timeout()) - ; - return theme('mongo_url_details', _np_scan_stats_get_record('url', $results)); - } - catch (MongoCursorTimeoutException $e) { - _np_scan_stats_mongo_timeout_error(); - } - } - else { - _np_scan_stats_mongo_error(); - } -} - -function _np_scan_stats_get_record($collection, $cursor) { - $record = $cursor->getNext(); - if (!$record) { - return array('error' => t('can not read collection')); - } - $updated = $record['updated']->sec; - foreach (array('minutes', 'hours', 'days') as $type) { - if (isset($record[$type])) { - $index = scan_api_bucket_index($collection, $type, $updated); - if (isset($record[$type][$index])) { - $record[$type][$index] .= ' (*' . t('updated') . ')'; - } - else { -// drupal_set_message(t('current %type index of %index missing', array('%type' => $type, '%index' => $index))); - } - // Validate that the velocities. - $size = scan_api_interval_count($collection, $type); - $velocity = 0; - foreach ($record[$type] as $key => $value) { - if (is_numeric($key)) { - $velocity += $value; - } - } - $field = 'velocity'; - if ($collection == 'scan' && $type == 'hours') { - $field .= $size; - } - if (intval($record[$type][$field] * $size + 0.0001 /*account for rounding error*/) != $velocity) { - drupal_set_message(t('%type %field is %old, but should be %new', array('%type' => $type, '%field' => $field, '%old' => round($record[$type]['velocity'], 4), '%new' => round($velocity / $size, 4))), 'error'); - } - } - } - foreach (array('updated', 'created') as $field) { - if (isset($record[$field])) { - $dt = $record[$field]; - $record[$field] = (array) $dt; - $record[$field]['sec'] .= ' (*' . theme('mongo_datetime', $dt) . 
')'; - } - } - if (isset($record['scan'])) { - foreach (array('client_id', 'active', 'status') as $key) { - if (!isset($record['scan'][$key])) { - drupal_set_message(t('scan.%key missing', array('%key' => $key)), 'error'); - } - } - } - return $record; -} - -function np_scan_stats_proto_options_form($form_state) { - $form = array(); - $form['limit'] = array( - '#type' => 'select', - '#title' => t('Limit'), - '#default_value' => isset($_GET['limit']) ? $_GET['limit'] : LIMIT, - '#options' => drupal_map_assoc(array(5, 10, 15, 20, 30, 40, 50, 60, 100, 200, 300, 400, 500)), - ); - $form['fields'] = array( - '#type' => 'checkbox', - '#title' => t('Show Fields in JSON'), - '#default_value' => isset($_GET['fields']) ? $_GET['fields'] : 0, - ); - $form['submit'] = array( - '#type' => 'submit', - '#value' => t('Change'), - ); - return $form; -} - -function np_scan_stats_proto_options_form_submit($form, &$form_state) { - $options = array(); - if ($form_state['values']['limit'] != LIMIT) { - $options['limit'] = $form_state['values']['limit']; - } - if ($form_state['values']['fields'] == 1) { - $options['fields'] = $form_state['values']['fields']; - } - if (count($options)) { - drupal_goto($_GET['q'], $options); - } -} - -function theme_mongo_query($collection, $query, $fields, $sort, $limit) { - $out = 'mongo> db.' . $collection; - $out .= '.find(' . theme('mongo_json', $query); - if (count($fields) > 0 && isset($_GET['fields'])) { - $out .= ',' . theme('mongo_json', $fields); - } - $out .= ')'; - if (count($sort) > 0) { - $out .= '.sort(' . theme('mongo_json', $sort) . ')'; - } - if ($limit) { - $out .= '.limit(' . $limit . ')'; - } - $out .= ".timeout(" . scan_api_get_mongo_timeout() . 
")"; - return $out; -} - -function theme_mongo_json($json) { - return str_replace(array(',', ':'), array(', ', ': '), json_encode($json)); -} - -function theme_mongo_short_datetime($dt) { - return date(SHORT_FORMAT, $dt->sec); -} - -function theme_mongo_datetime($dt) { - if (empty($dt)) { - return '-'; - } - static $now; - if (!isset($now)) { - $now = time(); - } - if ($now <= $dt->sec + 60) { - return t('last minute'); - } - $ago = $now - $dt->sec; - if ($ago <= 3600) { - return t('%mins minutes', array('%mins' => round($ago / 60, $ago <= 300 ? 1 : 0))); - } - return date(MEDIUM_FORMAT, $dt->sec); -} - -// @TODO: Here's a quick implementation for writing detail records, -// but would be nice to see some real theming. - -function theme_mongo_scan_details($record) { - return '
' . _mongo_record_to_string($record) . '
'; -} - -function theme_mongo_word_details($collection, $record) { - $output = ''; - if (isset($record['word'])) { - $output .= theme('twitter_word', $collection, $record['word']); - } - $output .= '
' . _mongo_record_to_string($record) . '
'; - return $output; -} - -function theme_mongo_location_details($record) { - return '
' . _mongo_record_to_string($record) . '
'; -} - -function theme_mongo_url_details($record) { - return '
' . _mongo_record_to_string($record) . '
'; -} - -function _np_scan_stats_mongo_timeout_error() { - static $once; - if (!isset($once)) { - drupal_set_message(t('Mongo timed out, use "mongo> db.currentOp()" to find the long running query'), 'error'); - $once = TRUE; - } -} - -function _np_scan_stats_mongo_error() { - static $once; - if (!isset($once)) { - drupal_set_message(t('Mongo server is probably down.'), 'error'); - $once = TRUE; - } -} - -function theme_twitter_word($collection, $word) { - $output = ''; - if ($collection == 'keyword' || $collection == 'hashtag') { - $output .= l('T', 'http://twitter.com/#search?q="' . $word . '"') . ' '; - } - $pre_word = $collection == 'hashtag' ? '#' : ''; - $output .= l($pre_word . $word, 'admin/reports/scan_stats/' . $collection . '/' . $word); - return $output; -} - -function _mongo_record_to_string($record) { - return print_r($record, 1); -} Index: www/sites/all/modules/custom/np_potpourri/np_potpourri.pages.inc =================================================================== --- www/sites/all/modules/custom/np_potpourri/np_potpourri.pages.inc (revision 34222) +++ www/sites/all/modules/custom/np_potpourri/np_potpourri.pages.inc (working copy) @@ -247,8 +247,17 @@ $placeholders = array_fill(0, count($add), "'%s'"); // $add comes from a form submission. data size assumed to always be sane ( < 1000) so we are not splitting this into chunked operation. 
- scan_api_get_mongo('statistics', 'keyword')->remove(array('word' => array('$in' => $add))); - scan_api_get_mongo('statistics', 'hashtag')->remove(array('word' => array('$in' => $add))); + $query = array('word' => array('$in' => $add)); + try { + if ($cursor = scan_api_get_mongo('keyword')) { // V2r15 / keyword / word[] / none (remove query) + $cursor->remove($query); + } + if ($cursor = scan_api_get_mongo('hashtag')) { // V2r15 / hashtag / word[] / none (remove query) + $cursor->remove($query); + } + } + catch (MongoCursorTimeoutException $e) { + } } if (!empty($remove)) { // remove from the blacklist Index: www/sites/all/modules/custom/np_scan_import/np_scan_import.module =================================================================== --- www/sites/all/modules/custom/np_scan_import/np_scan_import.module (revision 34222) +++ www/sites/all/modules/custom/np_scan_import/np_scan_import.module (working copy) @@ -99,14 +99,18 @@ scan_api_set_active_shard(); if (isset($nodes)) { // fill up scan stat - $result = scan_api_get_mongo('statistics', 'scan') - ->find( - array('scan_id' => array('$in' => $scan_ids)), - array('scan_id'=> 1, 'minutes.velocity' => 1) - ); - while($result->hasNext()) { - $row = $result->getNext(); - $nodes[$row['scan_id']]->velocity = $row['minutes']['velocity']; + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id[] / none + try { + $query = array('scan_id' => array('$in' => $scan_ids)); + $fields = array('scan_id' => 1, 'velocity.minutes' => 1); + $result = $cursor->find($fields, $result) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($result as $row) { + $nodes[$row['scan_id']]->velocity = $row['velocity']['minutes']; + } + } + catch (MongoCursorTimeoutException $e) { + } } $notify = FALSE; Index: view/sites/all/modules/scan_api/scan_api.module =================================================================== --- view/sites/all/modules/scan_api/scan_api.module (revision 34222) +++ 
view/sites/all/modules/scan_api/scan_api.module (working copy) @@ -753,7 +753,7 @@ function _scan_top_keywords(&$velocity, &$trending, $type, $scan_ids, $order, $count, $related, $related_keywords, $related_hashtags) { $interval = ($type == 1) ? 'minutes' : 'hours'; $multiplier = ($type == 1) ? 12 : 1; - $velocity_field = $interval . '.velocity'; + $velocity_field = 'velocity.' . $interval; foreach (array('keyword', 'hashtag') as $collection_name) { if ($related) { $words = ($collection_name == 'keyword') ? $related_keywords : $related_hashtags; @@ -764,7 +764,7 @@ if ($scan_ids && (!$related || ($related && isset($query['word'])))) { $order_field = ($order == 'velocity') ? $velocity_field : 'trending'; - $cursor = scan_api_get_mongo('scan_stats', $collection_name); // keyword, hashtag + $cursor = scan_api_get_mongo($collection_name); // V2r15 / keyword, hashtag / scan_id[], word[] / velocity.minutes, velocity.hours, trending if (!$cursor) { return; } @@ -779,6 +779,7 @@ ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { if (!isset($row['word'])) { + assert('Worker bug: keyword / hashtag is speechless!'); // bug in workers. they write empty word records... bad continue; } @@ -786,7 +787,7 @@ $row['word'] = '#' . $row['word']; } if ($order == 'velocity') { - $order_data = isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] : 0; + $order_data = isset($row['velocity'][$interval]) ? $row['velocity'][$interval] : 0; } else { $order_data = $row['trending']; @@ -794,7 +795,7 @@ if (!isset($ordering_data[$row['word']]) || ($ordering_data[$row['word']] < $order_data)) { $ordering_data[$row['word']] = $order_data; $data[$row['word']] = array( - 'velocity' => isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] * $multiplier: 0, + 'velocity' => isset($row['velocity'][$interval]) ? 
$row['velocity'][$interval] * $multiplier: 0, 'trending' => $row['trending'], 'word' => $row['word'], ); @@ -837,19 +838,146 @@ * @param $type * int, 1 - fresh stats, 2 - stats based on 6 hr timeslice */ -function _scan_keyword_velocity($scan_id, $keywords, $is_hashtag, $multirow, $type) { +function _scan_keyword_velocity($scan_id, $words) { + if (!is_array($words)) { + $words = array($words); + } + $words = array_map('strtolower', $words); + + $criteria['hashtag'] = array(); + $criteria['keyword'] = array(); + + foreach ($words as $k) { + if (substr($k, 0, 1) == '#') { + $criteria['hashtag'][] = substr($k, 1); + } + else { + $criteria['keyword'][] = $k; + } + + $args = array( + 'collection' => '', // placeholder + 'key' => 'word', + 'query' => array( + 'scan_id' => $scan_id, + 'word' => array(), // placeholder + ), + 'fields' => array( + 'word' => 1, + 'velocity.minutes' => 1, + 'velocity.hours' => 1, + 'trending' => 1, + ), + 'empty' => array( + 'word' => '', + 'velocity' => 0, + 'velocity.hours' => 0, + 'trending' => 0, + ), + 'emptykeys' => array(), // placeholder + 'flatten' => TRUE, + 'zeromap' => array( + 'velocity.hours' => 'velocity.minutes', + ), + 'remap' => array( + 'velocity.minutes' => 'velocity', + ), + ); + + $results = array('hashtags' => array(), 'keywords' => array()); + foreach (array('keyword, hashtag') as $collection) { + $query['collection'] = $collection; + $args['query']['word'] = (count($criteria[$collection]) > 1) ? array('$in' => $criteria[$collection]) : $criteria[$collection][0]; + $args['emptykeys'] = $criteria[$collection]; + $results[$collection] = scan_api_mongo_doquery($args); // V2r15 / keyword, hashtag / scan_id, word[] / none + } + foreach ($results['hashtag'] as $k => $v) { + $results['hashtag'][$k]['word'] = '#' . $results['hashtag'][$k]['word']; + } + foreach ($results['hashtag'] as $k => $v) { + $results['hashtag']['#' . 
$k] =& $results['hashtag'][$k]; + unset($results['hashtag'][$k]); + } + + + if (!empty($keywords)) { + $query['collection'] = 'keyword'; + $args['query']['word'] = (count($keywords) > 1) ? array('$in' => $keywords) : $keywords[0]; + $args['emptykeys'] = $keywords; + $data = scan_api_mongo_doquery($args); // V2r15 / keyword / scan_id, word[] / none + $results['keywords'] = $data; + } + if (!empty($hashtags)) { + $query['collection'] = 'hashtag'; + $args['query']['word'] = (count($hashtags) > 1) ? array('$in' => $hashtags) : $hashtags[0]; + $data = scan_api_mongo_doquery($args); // V2r15 / hashtag / scan_id, word[] / none + foreach ($data as $key => $value) { + $results['keywords'][$key] = $value + $results['hashtags']['#' . $key] = $value; + } + } + + $collection_name = $hashtag ? 'hashtag' : 'keyword'; + + + $data = scan_api_mongo_doquery(array( // V2r15 / keyword, hashtag / scan_id[2], word / none + 'collection' => $collection_name, + 'key' => 'scan_id', + 'query' => $query, + 'fields' => $fields, + 'empty' => $return, + 'emptykeys' => $scan_id ? $query['scan_id']['$in'] : array(0), + 'flatten' => TRUE, + )); + + $return = $data[0]; + // Fall back to hours if minutes is 0 (i.e. twitter might be down?). @@@ V2 Should we just go with minutes? + $return['velocity'] = $return['velocity.minutes'] ? $return['velocity.minutes'] : $return['velocity.hours']; + + if ($scan_id) { + $scan = $data[$scan_id]; + $scan['velocity'] = $scan['velocity.minutes'] ? $scan['velocity.minutes'] : $scan['velocity.hours']; + if ($scan['velocity'] > $return['velocity']) { + // Use per scan stats if it is better than global stats. + // Scans can be higher than global because searchapi mentions do not end up in global stats. + $return = $scan; + } + } + + unset($return['velocity.minutes']); + unset($return['velocity.hours']); + + + + + + + + + + + + + + $result = array(); $interval = ($type == 1) ? 'minutes' : 'hours'; $multiplier = ($type == 1) ? 12 : 1; - $velocity_field = $interval . 
'.velocity'; + $velocity_field = 'velocity.' . $interval; $collection_name = $is_hashtag ? 'hashtag' : 'keyword'; - $fields = array($velocity_field => 1, 'trending' => 1, 'word' => 1); + $fields = array( + 'velocity.minutes' + + + $velocity_field => 1, 'word' => 1); $query = array( 'scan_id' => intval($scan_id), 'word' => array('$in' => array_map('strtolower', $keywords)), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + + + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id, word[] / velocity.minutes, velocity.hours try { $cursor = $cursor ->find($query, $fields) @@ -860,13 +988,13 @@ if ($is_hashtag) { $row['word'] = '#' . $row['word']; } - $result[$row['word']] = isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] * $multiplier : 0; + $result[$row['word']] = isset($row['velocity'][$interval]) ? $row['velocity'][$interval] * $multiplier : 0; } } elseif ($cursor->hasNext()) { $result = $cursor->getNext(); - if (isset($result[$interval]['velocity'])) { - $result['velocity'] = $result[$interval]['velocity'] * $multiplier; + if (isset($result['velocity'][$interval])) { + $result['velocity'] = $result['velocity'][$interval] * $multiplier; } } } @@ -887,32 +1015,50 @@ $hashtag = TRUE; $keyword = substr($keyword, 1); } - // get global stat - $return = _scan_keyword_velocity(0, array($keyword), $hashtag, FALSE, 1); - // if twitter have gone sleeping fall back to 6 hr timeslice - if (!$return || !$return['velocity']) { - $return = _scan_keyword_velocity(0, array($keyword), $hashtag, FALSE, 2); - } - // get per scan stat if we have scan_id (searchapi mentions do not get into global stat - // making global stat too low for what we actually show... - $per_scan = array(); + $collection_name = $hashtag ? 
'hashtag' : 'keyword'; + $fields = array( + 'scan_id' => 1, + 'velocity.minutes' => 1, + 'velocity.hours' => 1, + 'trending' => 1, + ); + $query = array( + 'scan_id' => $scan_id ? array('$in' => array(0, $scan_id)) : 0, + 'word' => strtolower($keyword), + ); + $empty = array( + 'velocity.minutes' => 0, + 'velocity.hours' => 0, + 'trending' => 0, + ); + + $data = scan_api_mongo_doquery(array( // V2r15 / keyword, hashtag / scan_id[2], word / none + 'collection' => $collection_name, + 'key' => 'scan_id', + 'query' => $query, + 'fields' => $fields, + 'empty' => $empty, + 'emptykeys' => $scan_id ? $query['scan_id']['$in'] : array(0), + 'flatten' => TRUE, + )); + + $return = $data[0]; + // Fall back to hours if minutes is 0 (i.e. twitter might be down?). @@@ V2 Should we just go with minutes? + $return['velocity'] = $return['velocity.minutes'] ? $return['velocity.minutes'] : $return['velocity.hours']; + if ($scan_id) { - $per_scan = _scan_keyword_velocity($scan_id, array($keyword), $hashtag, FALSE, 1); - if (!$per_scan || !$per_scan['velocity']) { - $per_scan = _scan_keyword_velocity($scan_id, array($keyword), $hashtag, FALSE, 2); + $scan = $data[$scan_id]; + $scan['velocity'] = $scan['velocity.minutes'] ? $scan['velocity.minutes'] : $scan['velocity.hours']; + if ($scan['velocity'] > $return['velocity']) { + // Use per scan stats if it is better than global stats. + // Scans can be higher than global because searchapi mentions do not end up in global stats. 
+ $return = $scan; } - if (!$return && $per_scan) { - $return = $per_scan; - } } - // if we have per scan stat as well then return the one that's higher - if ($per_scan && $per_scan['velocity'] > $return['velocity']) { - $return = $per_scan; - } - // if not found in DB - if (!$return) { - $return = array('velocity' => 0, 'trending' => 0); - } + + unset($return['velocity.minutes']); + unset($return['velocity.hours']); + print _scan_api_format($return, $format); } @@ -983,22 +1129,20 @@ 'velocity' => 0, 'difference' => 0, ); - $fields = array('minutes.velocity' => 1, 'minutes.prev_velocity' => 1); + $fields = array('velocity.minutes_scan' => 1, 'increasing' => 1); $query = array( 'scan_id' => intval($scan_id), ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id / none (single) try { $cursor = $cursor ->find($query, $fields) ->timeout(scan_api_get_mongo_timeout()); if ($cursor->hasNext()) { $row = $cursor->getNext(); - $velocity = isset($row['minutes']['velocity']) ? round(12 * $row['minutes']['velocity']) : 0; - $prev_velocity = isset($row['minutes']['prev_velocity']) ? round(12 * $row['minutes']['prev_velocity']) : 0; $return = array( - 'velocity' => $velocity, - 'difference' => $velocity - $prev_velocity, + 'velocity' => isset($row['velocity']['minutes_scan']) ? round(12 * $row['velocity']['minutes_scan']) : 0, + 'difference' => $row['increasing'] ? 1 : -1, ); } } @@ -1011,7 +1155,7 @@ /** * Determine a scan's number of uniq links */ -function scan_scan_uniq_links() { +function scan_scan_uniq_links() { // @@@ V2 performance? 
// $scan_id, $format extract(_scan_get_args()); $return = array( @@ -1025,7 +1169,8 @@ SCAN_CATEGORY_PHOTOS => 'photo', SCAN_CATEGORY_VIDEOS => 'video', ); - if ($cursor = scan_api_get_mongo('urls', 'url')) { + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan_id, category / none (count) + // @@@V2 performance Switch to single query grouped by category? foreach ($categories as $category => $return_key) { $query = array( 'scan_id' => intval($scan_id), @@ -1104,7 +1249,7 @@ 'category' => $category, ); $range = $blocked_url_ids ? sizeof($blocked_url_ids) : 0; - if ($cursor = scan_api_get_mongo('urls', 'url')) { + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan_id[], category / count:-1 (query is paged!) try { $cursor = $cursor ->find($query, $fields) @@ -1140,7 +1285,7 @@ 'scan_id' => array('$in' => $scan_ids), 'category' => $category, ); - if ($cursor = scan_api_get_mongo('urls', 'url')) { + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan_id[], category, (url_id[]) / none (count) try { $count = $cursor ->find($query) @@ -1447,6 +1592,69 @@ } /** + * Get bucket data for rendering a sparkline, etc. + */ +function scan_get_bucket_data($collection, $time_type, $key = 'scan_id', $values, $query = array(), $when = FALSE) { + if (!$when) { + $when = time(); + } + $interval_size_ = scan_api_interval_size($collection, $time_type); + $interval_count_ = scan_api_interval_count($collection, $time_type); + $bucket_cycle_time = $interval_size_ * $interval_count_; + $index = scan_api_bucket_index($collection, $time_type, $when); + + if (is_array($values)) { + $query[$key] = array('$in' => $values); + } + else { + $query[$key] = $values; + $values = array($values); + } + + // Initialize the output array. 
+ $data = array(); + foreach ($values as $v) { + $data[$v] = array( + 'size' => $interval_count_, + 'offset' => $index, +// 'cutoff' => 0, // @@@ Point where we run out of data +// 'range' => 0, // @@@ Hours in range + 'data' => array_fill(0, $interval_count_, 0), + ); + } + + $fields = array( + $key => 1, + 'created' => 1, + 'updated' => 1, + $time_type => 1, + ); + if ($cursor = scan_api_get_mongo($collection)) { // V2r15 / * / scan_id[] / none (sparkline data analyzer) + try { + $cursor = $cursor->find($query, $fields) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + // If this scan hasn't been touched for more than a cycle, continue. + // We already initialized everything to 0 above. + if ($when > ($bucket_cycle_time + $row['updated'])) { + continue; + } + + for ($i = 0; $i < $interval_count_; $i++) { + // OK, so we need to walk forwards on one array while walking backwards and wrapping around on another. + // Adding $interval_count_ to $index on the right is done because PHP's modulus handles negatives in the + // equally-correct-but-not-as-useful-as-the-other-way-around fashion. + $data[$row['scan_id']]['data'][$i] = $row[$time_type][($interval_count_ + $index - $i) % $interval_count_]; + } + } + } + catch (MongoCursorTimeoutException $e) { + } + } + return $data; +} + +/** * Produce list of buckets in order from newest to oldest * scan buckets do not maintain that order, bellow current undext we have new items, above we have old * which makes it not easy to manage. here's API for that. 
@@ -1490,42 +1698,22 @@ return $bucket; } function scan_stats_velocity() { + $when = time(); // $scan_id,, $count, $max_age, $format extract(_scan_get_args()); + $scan_id = intval($scan_id); - $epoch = gmdate('c', 0); - $return = array_fill(0, $count, array('count' => 0, 'start_time' => $epoch, 'last_occurence' => $epoch + 3600)); - - if ($cursor = scan_api_get_mongo('statistics', 'scan')) { - $fields = array('hours' => 1, 'updated' => 1); - $query = array( - 'scan_id' => intval($scan_id), + $data = scan_get_bucket_data('scan', 'hours', 'scan_id', $scan_id, array(), $when); + $return = array(); + foreach ($data[$scan_id]['data'] as $k => $v) { + $return[$k] = array( + 'count' => $v, + //@@@V2 This is horribly inefficient, data transfer wise. Would be much better to + // pass a single time and have the JS code do offsets from that itself. + 'start_time' => gmdate('c', ($when - $when % 3600) - ($k * 3600)), ); - try { - $cursor = $cursor - ->find($query) - ->timeout(scan_api_get_mongo_timeout()); + } - if ($cursor->hasNext()) { - $row = $cursor->getNext(); - $updated = explode(' ', (string)$row['updated']); - $bucket = scan_api_reorder_scan_time_buckets($row['hours'], 'scan', 'hours', $updated[1]); - - $now = mktime(date('H'), 0, 0 ); - foreach($bucket as $index => $value ) { - if ( $count == $index ) { - break; - } - $return[$index] = array( - 'count' => $value, - 'start_time' => gmdate('c', $now - ($index * 60 * 60)), - ); - } - } - } - catch (MongoCursorTimeoutException $e) { - } - } print _scan_api_format($return, $format); } @@ -1565,17 +1753,17 @@ // fill up the stats if ($return) { $scan_ids = array_keys($return); - $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'hours.velocity'); + $fields = array('scan_id' => 1, 'velocity.minutes_scan' => 1); $query = array( 'scan_id' => array('$in' => $scan_ids), ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id[] / none try { 
$cursor = $cursor ->find($query, $fields) ->timeout(scan_api_get_mongo_timeout()); foreach ($cursor as $row) { - $return[$row['scan_id']]['velocity'] = isset($row['minutes']['velocity']) ? round(12 * $row['minutes']['velocity']) : 0; + $return[$row['scan_id']]['velocity'] = isset($row['velocity']['minutes_scan']) ? round(12 * $row['velocity']['minutes_scan']) : 0; } } catch (MongoCursorTimeoutException $e) { @@ -1586,25 +1774,33 @@ } else { // get a basic data structure in ordered form - $fields = array('scan_id' => 1, 'minutes.velocity' => 1); + $fields = array( + 'scan_id' => 1, + 'velocity.minutes_scan' => 1, + 'velocity.hours_general' => 1, // @@@V2 was days.general.velocity + 'velocity.hours_photo' => 1, //@@@V2 was days.photo.velocity + 'velocity.hours_video' => 1, //@@@V2 was days.video.velocity + 'velocity.hours_urls' => 1, //@@@V2 was days.velocity + ); $query = array( - 'scan.client_id' => intval($client_id), 'scan.status' => 1, 'scan.active' => 1, + 'scan.client_id' => intval($client_id), 'scan.status' => 1 ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan.client_id, scan.status / velocity.minutes_scan:-1 try { $cursor = $cursor ->find($query, $fields) - ->sort(array('minutes.velocity' => -1)) + ->sort(array('velocity.minutes_scan' => -1)) ->limit($count) ->timeout(scan_api_get_mongo_timeout()); foreach ($cursor as $row) { $return[$row['scan_id']] = array( 'scan_id' => $row['scan_id'], - 'velocity' => isset($row['minutes']['velocity']) ? round(12 * $row['minutes']['velocity']) : 0, - 'general' => 0, - 'photo' => 0, - 'video' => 0, - 'summary' => 0, + // @@@ x12 multiplier is less accurate than summing the minutes.scan array. + 'velocity' => isset($row['velocity']['minutes_scan']) ? round(12 * $row['velocity']['minutes_scan']) : 0, + 'general' => isset($row['velocity']['hours_general']) ? 
round($row['velocity']['hours_general']) : 0, + 'photo' => isset($row['velocity']['hours_photo']) ? round($row['velocity']['hours_photo']) : 0, + 'video' => isset($row['velocity']['hours_video']) ? round($row['velocity']['hours_video']) : 0, + 'summary' => isset($row['velocity']['hours_urls']) ? round($row['velocity']['hours_urls']) : 0, ); } } @@ -1630,34 +1826,9 @@ } } } - // common query for both cases: if we have data we need to fill up url uniq stats if ($return) { - $fields = array( - 'scan_id' => 1, - 'days.velocity' => 1, - 'days.general.velocity' => 1, - 'days.photo.velocity' => 1, - 'days.video.velocity' => 1, - ); - $query = array( - 'scan_id' => array('$in' => $scan_ids), - ); - if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { - try { - $cursor = $cursor - ->find($query, $fields) - ->timeout(scan_api_get_mongo_timeout()); - foreach ($cursor as $row) { - $return[$row['scan_id']]['summary'] = round($row['days']['velocity'] * 30); - $return[$row['scan_id']]['general'] = round($row['days']['general']['velocity'] * 30); - $return[$row['scan_id']]['photo'] = round($row['days']['photo']['velocity'] * 30); - $return[$row['scan_id']]['video'] = round($row['days']['video']['velocity'] * 30); - } - $return = array_filter($return, '_scan_api_filter_nid'); - } - catch (MongoCursorTimeoutException $e) { - } - } + // Filter out scans not backed by nodes. 
+ $return = array_filter($return, '_scan_api_filter_nid'); } scan_api_set_active_shard(); print _scan_api_format(array_values($return), $format); @@ -1752,33 +1923,15 @@ if ($matches) { $scan_ids = array_keys($matches); // fill up the scan stats part - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { - $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'minutes.prev_velocity' => 1); - $query = array( - 'scan_id' => array('$in' => $scan_ids), - ); - try { - $cursor = $cursor - ->find($query, $fields) - ->timeout(scan_api_get_mongo_timeout()); - foreach ($cursor as $row) { - $velocity = isset($row['minutes']['velocity']) ? round(12 * $row['minutes']['velocity']) : 0; - $prev_velocity = isset($row['minutes']['prev_velocity']) ? round(12 * $row['minutes']['prev_velocity']) : 0; - $data[$row['scan_id']]['velocity'] = $velocity; - $data[$row['scan_id']]['difference'] = $velocity - $prev_velocity; - } - } - catch (MongoCursorTimeoutException $e) { - } - } - // fill up the url uniq stats part - if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { + if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { // V2r15 / scan / scan_id[] / none $fields = array( 'scan_id' => 1, - 'days.velocity' => 1, - 'days.general.velocity' => 1, - 'days.photo.velocity' => 1, - 'days.video.velocity' => 1, + 'velocity.minutes_scan' => 1, + 'velocity.increasing' => 1, + 'velocity.hours_general' => 1, // @@@V2 was days.general.velocity + 'velocity.hours_photo' => 1, //@@@V2 was days.photo.velocity + 'velocity.hours_video' => 1, //@@@V2 was days.video.velocity + 'velocity.hours_urls' => 1, //@@@V2 was days.velocity ); $query = array( 'scan_id' => array('$in' => $scan_ids), @@ -1788,10 +1941,13 @@ ->find($query, $fields) ->timeout(scan_api_get_mongo_timeout()); foreach ($cursor as $row) { - $data[$row['scan_id']]['uniq_links_summary'] = round($row['days']['velocity'] * 30); - $data[$row['scan_id']]['uniq_links_general'] = round($row['days']['general']['velocity'] * 30); - 
$data[$row['scan_id']]['uniq_links_photo'] = round($row['days']['photo']['velocity'] * 30); - $data[$row['scan_id']]['uniq_links_video'] = round($row['days']['video']['velocity'] * 30); + $data[$row['scan_id']]['velocity'] = isset($row['velocity']['minutes_scan']) ? round(12 * $row['velocity']['minutes_scan']) : 0; + //@@@V2 This is really a boolean now -- requires theme / js change to fix. + $data[$row['scan_id']]['difference'] = isset($row['velocity']['increasing']) ? $row['velocity']['increasing'] : 0; + $data[$row['scan_id']]['uniq_links_summary'] = round($row['velocity']['hours_urls']); + $data[$row['scan_id']]['uniq_links_general'] = round($row['velocity']['hours_general']); + $data[$row['scan_id']]['uniq_links_photo'] = round($row['velocity']['hours_photo']); + $data[$row['scan_id']]['uniq_links_video'] = round($row['velocity']['hours_video']); } } catch (MongoCursorTimeoutException $e) { @@ -2499,20 +2655,30 @@ * @return * A mongoCollection. */ -function scan_api_get_mongo($db_name, $collection_name) { +function scan_api_get_mongo($collection_name, $shard_key = FALSE) { static $connections; - if (!isset($connections[$db_name])) { + static $mongo_dbs; + if (!isset($mongo_dbs)) { $mongo_dbs = variable_get('mongo_dbs', array()); - $mongo_db = $mongo_dbs[isset($mongo_dbs[$db_name]) ? $db_name : 'default']; + } + if (!$shard_key) { + $shard_key = $collection_name; + } + // Normalize shard key so we can reuse connections often. 
+ if (!isset($mongo_dbs[$shard_key])) { + $shard_key = 'default'; + } + if (!isset($connections[$shard_key])) { + $mongo_db = $mongo_dbs[$shard_key]; try { $mongo = new mongo($mongo_db['host']); - $connections[$db_name] = $mongo->selectDB($mongo_db['db']); + $connections[$shard_key] = $mongo->selectDB($mongo_db['db']); } catch (MongoConnectionException $e) { return; } } - return $connections[$db_name]->selectCollection($collection_name); + return $connections[$shard_key]->selectCollection($collection_name); } /** @@ -2530,3 +2696,165 @@ return $timeout; } +/** + * One stop shop for getting a blob of stuff from mongo. + */ +function scan_api_mongo_doquery($args) { + //$collection + $key = FALSE; + //$query + $fields = array(); + $sort = NULL; + $limit = 100; + $empty = array(); + $emptykeys = array(); + $flatten = FALSE; + $zeromap = array(); + $remap = array(); + $stripmongoid = TRUE; + extract($args); + + if (!$key) { + // No sense in reading multiple values if this will be single-result. + $limit = 1; + } + + $return = array(); + if (!empty($emptykeys)) { + foreach ($emptykeys as $k) { + $return[$k] = array(); + } + } + + if ($cursor = scan_api_get_mongo($collection)) { // V2r15 metaquery + try { + $cursor = $cursor->find($query, $fields) + ->limit($limit); + if (isset($sort)) { + $cursor->sort($sort); + } + $query = $cursor->timeout(scan_api_get_mongo_timeout()); + if (!$key) { + if ($query->hasNext()) { + // Switch to findOne() instead of find()? + $return[0] = $query->getNext(); + } + else { + // No result. + return FALSE; + } + } + else if (strpos($key, '.') !== FALSE) { + $keyparts = explode('.', $key); + foreach ($query as $row) { + $r =& $row; + foreach ($keyparts as $part) { + $r =& $r[$part]; + } + $rowkey = $r; + unset($r); + $return[$rowkey] = $row; + } + } + else { + foreach ($query as $row) { + // Assumes data from mongo is consistent. 
+ $return[$row[$key]] = $row; + } + } + if ($stripmongoid) { + foreach ($return as $k => $v) { + unset($return[$k]['_id']); + } + } + if ($flatten) { + // Single level flattening. Doing it like this because recursion sucks. + // Not gonna bother with more than one dot for now. + foreach ($return as $k => $v) { + if (is_array($v)) { + foreach ($return[$k] as $kk => $vv) { + if (is_array($vv)) { + foreach ($return[$k][$kk] as $kkk => $vvv) { + $return[$k]["$kk.$kkk"] =& $return[$k][$kk][$kkk]; + } + unset($return[$k][$kk]); + } + } + } + } + } + // Do a single level initialization of defaults. + if (!empty($empty)) { + foreach ($return as $k => $v) { + foreach ($empty as $kk => $vv) { + if (!isset($return[$k][$kk])) { + $return[$k][$kk] = $vv; + } + } + } + } + // Do zero mapping for fallbacks. + // Note: This only applies if every entry in that field is 0. + if (!empty($zeromap)) { + foreach ($zeromap as $src => $dst) { + $fallback = TRUE; + foreach ($return as $k => $v) { + if ($return[$k][$src]) { + $fallback = FALSE; + break; + } + } + foreach ($return as $k => $v) { + if ($fallback) { + $return[$k][$dst] =& $return[$k][$src]; + } + // Always unset src, even if not falling back. + unset($return[$k][$src]); + } + } + } + // Perform output remapping to adapt the result array on behalf + // of the caller. + if (!empty($remap)) { + foreach ($remap as $src => $dst) { + foreach ($return as $k => $v) { + $return[$k][$dst] =& $return[$k][$src]; + unset($return[$k][$src]); + } + } + } + if (!$key) { + return $return[0]; + } + return $return; + } + catch (MongoCursorTimeoutException $e) { + } + } +} + + +/** + * Set query result keys. 
+ */ +function scan_api_mongo_keyresult($query, $key) { + $return = array(); + if (strpos($key, '.') !== FALSE) { + $keyparts = explode('.', $key); + foreach ($query as $row) { + $r =& $row; + foreach ($keyparts as $part) { + $r =& $r[$part]; + } + $rowkey = $r; + unset($r); + $return[$rowkey] = $row; + } + } + else { + foreach ($query as $row) { + $return[$row[$key]] = $row; + } + } + return $return; +}