Index: www/sites/all/modules/custom/np_scan/np_scan.analytics.inc =================================================================== --- www/sites/all/modules/custom/np_scan/np_scan.analytics.inc (revision 34236) +++ www/sites/all/modules/custom/np_scan/np_scan.analytics.inc (working copy) @@ -19,71 +19,6 @@ } /** - * Helper function to get the name of the velocity field. - * - * @param $timeframe time in hours 24, 48, 168,720 - * @param $collection_name the name of the mongo collection - * @return string, the name of the velocity field - */ -function _np_scan_analytics_get_velocity_field($timeframe, $collection_name) { - $velocity_field = ''; - - if ($collection_name == 'scanurl') { - switch ($timeframe) { - case 24: // day - $velocity_field = 'hours.velocity24'; - break; - case 168: //week - $velocity_field = 'days.velocity7'; - break; - case 720: // month - case 'all': - $velocity_field = 'days.velocity'; - break; - default: // biday - $velocity_field = 'hours.velocity'; - break; - } - } - elseif ($collection_name == 'scan') { - switch ($timeframe) { - case 24: // day - $velocity_field = 'hours.velocity'; - break; - case 168: //week - $velocity_field = 'days.velocity7'; - break; - case 720: // month - case 'all': - $velocity_field = 'days.velocity'; - break; - default: // biday - $velocity_field = 'hours.velocity48'; - break; - } - } - elseif (in_array($collection_name, array('hashtag', 'keyword', 'retweet'))) { - switch ($timeframe) { - case 24: // day - $velocity_field = 'hours.velocity'; - break; - case 168: //week - $velocity_field = 'days.velocity'; // no stats use month - break; - case 720: // month - case 'all': - $velocity_field = 'days.velocity'; - break; - default: // biday - $velocity_field = 'days.velocity'; // no stats use month - break; - } - } - - return $velocity_field; -} - -/** * Menu callback; property statistics. * * $page - which page are we displaying results for? 
@@ -104,7 +39,7 @@ $scans = array(); // this variable gonna be handy later $key = 'scan_id'; - + if ($page == 'views') { $timeframe = !empty($_GET['timeframe']) ? $_GET['timeframe'] : 'all'; } @@ -114,6 +49,7 @@ switch ($page) { case 'views': + $velocity_field_2 = ($timeframe == 24) ? 'velocity.hours' : 'velocity.days'; $args = array_merge(array($scan_status), $client_ids ); scan_api_set_active_shard('misc'); $key = 'vid'; @@ -135,36 +71,15 @@ break; case 'links': - $velocity_field = _np_scan_analytics_get_velocity_field($timeframe, 'scanurl'); - $fields = array('scan_id' => 1); - $query = array( - 'scan.client_id' => array('$in' => $client_ids), - 'scan.status' => $scan_status, - ); - if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { - try { - $results = $cursor - ->find($query, $fields) - ->sort(array($velocity_field => -1)) - ->limit($limit) - ->timeout(scan_api_get_mongo_timeout()); - foreach ($results as $row) { - $scans[$row['scan_id']] = FALSE; - } - } - catch (MongoCursorTimeoutException $e) { - } - } - break; - default: - $velocity_field = _np_scan_analytics_get_velocity_field($timeframe, 'scan'); + $velocity_field = ($page == 'links') ? 'velocity.hours_urls' : 'velocity.hours_scan'; + $velocity_field_2 = 'velocity.days'; // @@@ V2 Maybe we should standardize on 48h across the board? 
$fields = array('scan_id' => 1); $query = array( 'scan.client_id' => array('$in' => $client_ids), 'scan.status' => $scan_status, ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan.client_id[], scan.status / velocity.hours_urls, velocity.hours_scan try { $results = $cursor ->find($query, $fields) @@ -203,24 +118,23 @@ // pour together hashtag and keyword velocities foreach (array('keyword', 'hashtag') as $collection_name) { if ($words[$collection_name]) { - $velocity_field = _np_scan_analytics_get_velocity_field($timeframe, $collection_name); - list($interval, $tmp) = explode('.', $velocity_field); - $fields = array('word' => 1, $velocity_field => 1); + list($tmp, $interval) = explode('.', $velocity_field_2); + $fields = array('word' => 1, $velocity_field_2 => 1); $query = array( 'scan_id' => intval($statistics->scan_id), 'word' => array('$in' => array_map('strtolower', $words[$collection_name])), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id, word[] / velocity.hours:-1, velocity.days:-1 try { $results = $cursor ->find($query, $fields) - ->sort(array($velocity_field => -1)) + ->sort(array($velocity_field_2 => -1)) ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { if ($collection_name == 'hashtag') { $row['word'] = '#' . $row['word']; }; - $word_velocities[$row['word']] = isset($row[$interval]['velocity']) ? round($row[$interval]['velocity'], VELOCITY_PRECISION) : 0; + $word_velocities[$row['word']] = isset($row['velocity'][$interval]) ? round($row['velocity'][$interval], VELOCITY_PRECISION) : 0; } } catch (MongoCursorTimeoutException $e) { @@ -248,60 +162,43 @@ 'timestamp' => strtotime('-1 day'), ); - // fill up velocity and difference + // fill up velocity and difference / stats. 
$statistics->velocity = 0; $statistics->difference = 0; - $fields = array('minutes.velocity' => 1, 'minutes.prev_velocity' => 1); - $query = array( - 'scan_id' => intval($statistics->scan_id), - ); - if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { - try { - $results = $cursor - ->find($query, $fields) - ->timeout(scan_api_get_mongo_timeout()); - if ($results->hasNext()) { - $row = $results->getNext(); - $velocity = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; - $prev_velocity = isset($row['minutes']['prev_velocity']) ? $row['minutes']['prev_velocity'] * 12 : 0; - $statistics->velocity = $velocity; - $statistics->difference = $velocity - $prev_velocity; - } - } - catch (MongoCursorTimeoutException $e) { - } - } - - // Fill up the url uniq stats part. Initialize to 0. - $statistics->velocity = 0; $statistics->general = 0; $statistics->photo = 0; $statistics->video = 0; - $statistics->url_velocity = 0; - $fields = array( 'scan_id' => 1, - 'hours.velocity' => 1, - 'hours.general.velocity' => 1, - 'hours.photo.velocity' => 1, - 'hours.video.velocity' => 1, + // @@@ V2 Previous code disagrees on whether to use minutes or hours here. + //'velocity.minutes' => 1, + 'velocity.hours_scan' => 1, + 'velocity.hours_general' => 1, + 'velocity.hours_photo' => 1, + 'velocity.hours_video' => 1, + 'velocity.hours_urls' => 1, + 'increasing' => 1, ); $query = array( 'scan_id' => intval($statistics->scan_id), ); - if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { + if ($cursor = scan_api_get_mongo('scan')) { // V2r15 / scan / scan_id / none try { $results = $cursor ->find($query, $fields) ->timeout(scan_api_get_mongo_timeout()); - foreach ($results as $row) { - $statistics->velocity = isset($row['hours']['velocity']) ? $row['hours']['velocity'] : 0; - $statistics->general = isset($row['hours']['general']['velocity']) ? $row['hours']['general']['velocity'] : 0; - $statistics->photo = isset($row['hours']['photo']['velocity']) ? 
$row['hours']['photo']['velocity'] : 0; - $statistics->video = isset($row['hours']['video']['velocity']) ? $row['hours']['video']['velocity'] : 0; - - $statistics->url_velocity = isset($row['hours']['velocity']) ? $row['hours']['velocity'] : 0; + if ($results->hasNext()) { + $row = $results->getNext(); + // @@@ V2 Previous code disagrees on whether to use minutes or hours here. + //$statistics->velocity = isset($row['velocity']['minutes']) ? $row['velocity']['minutes'] * 12 : 0; + $statistics->velocity = isset($row['velocity']['hours_scan']) ? $row['velocity']['hours_scan'] : 0; + $statistics->general = isset($row['velocity']['hours_general']) ? $row['velocity']['hours_general'] : 0; + $statistics->photo = isset($row['velocity']['hours_photo']) ? $row['velocity']['hours_photo'] : 0; + $statistics->video = isset($row['velocity']['hours_video']) ? $row['velocity']['hours_video'] : 0; + $statistics->url_velocity = isset($row['velocity']['hours_urls']) ? $row['velocity']['hours_urls'] : 0; + // @@@ V2 Teach everything about "increasing". + $statistics->difference = $row['increasing'] ? 1 : -1; } } catch (MongoCursorTimeoutException $e) { @@ -365,15 +262,18 @@ $max = 0; $data = array(); - $order_field = ($order == 'trending') ? 'trending' : 'minutes.velocity'; + $order_field = ($order == 'trending') ? 
'trending' : 'velocity.minutes'; $ordering_data = array(); foreach ($scan_ids as $scan_id) { - $fields = array('minutes.velocity' => 1, 'trending' => 1, 'word' => 1); + $fields = array( + 'velocity.minutes' => 1, + 'trending' => 1, + 'word' => 1, + ); $query = array( 'scan_id' => $scan_id, - 'word' => array('$exists' => TRUE), ); - if ($cursor = scan_api_get_mongo('scan_stats', $table_type)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($table_type)) { // V2r15 / keyword, hashtag / scan_id / trending:-1, velocity.minutes:-1 try { $results = $cursor ->find($query, $fields) @@ -382,11 +282,12 @@ ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { if (!isset($row['word'])) { + assert('Worker was speechless!'); // bug in workers. they write empty word records... bad continue; } - $velocity = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; + $velocity = isset($row['velocity']['minutes']) ? $row['velocity']['minutes'] * 12 : 0; $order_data = $order != 'trending' ? $velocity : $row['trending']; if (!isset($ordering_data[$row['word']]) || ($ordering_data[$row['word']] < $order_data)) { $data[$row['word']] = array( @@ -445,16 +346,16 @@ $query = array( 'scan_id' => array('$in' => $scan_ids), ); - if ($cursor = scan_api_get_mongo('scan_stats', $table_type)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($table_type)) { // V2r15 / keyword, hashtag / scan_id[] / velocity.minutes:-1 limit 1 try { $results = $cursor - ->find($query, array('minutes.velocity' => 1)) - ->sort(array('minutes.velocity' => -1)) + ->find($query, array('velocity.minutes' => 1)) + ->sort(array('velocity.minutes' => -1)) ->limit(1) ->timeout(scan_api_get_mongo_timeout()); if ($results->hasNext()) { $row = $results->getNext(); - $max = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; + $max = isset($row['velocity']['minutes']) ? 
$row['velocity']['minutes'] * 12 : 0; } } catch (MongoCursorTimeoutException $e) { @@ -462,15 +363,13 @@ } } else { - // MAX(ABS(trending) $max_trending = $min_trending = 0; $query = array( 'scan_id' => array('$in' => $scan_ids), - 'trending' => array('$ne' => -1000), ); // max trending - if ($cursor = scan_api_get_mongo('scan_stats', $table_type)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($table_type)) { // V2r15 / keyword, hashtag / scan_id[] / trending:-1, trending:1 limit 1 (two queries) try { $results = $cursor ->find($query, array('trending' => 1)) @@ -531,11 +430,10 @@ foreach (array('keyword', 'hashtag') as $collection_name) { $query = array( 'scan_id' => 0, - 'word' => array('$exists' => TRUE), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag - // max trending + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0 / trending:-1, velocity.minutes:-1 count 1 two queries try { + // max trending $results = $cursor ->find($query, array('trending' => 1)) ->sort(array('trending' => -1)) @@ -548,13 +446,13 @@ // max velocity $results = $cursor - ->find($query, array('minutes.velocity' => 1)) - ->sort(array('minutes.velocity' => -1)) + ->find($query, array('velocity.minutes' => 1)) + ->sort(array('velocity.minutes' => -1)) ->limit(1) ->timeout(scan_api_get_mongo_timeout()); if ($results->hasNext()) { $row = $results->getNext(); - $result[$collection_name]['velocity'] = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0; + $result[$collection_name]['velocity'] = isset($row['velocity']['minutes']) ? $row['velocity']['minutes'] * 12 : 0; } } catch (MongoCursorTimeoutException $e) { @@ -575,7 +473,7 @@ /** * Helper function to get the pager's current page number. 
*/ -function _np_scan_analytics_get_page_number($db, $collection, $limit, $query = array(), $element = 0) { +function _np_scan_analytics_get_page_number($collection, $limit, $query = array(), $element = 0) { global $pager_page_array, $pager_total, $pager_total_items; // Initialize pager, see pager.inc. @@ -583,7 +481,7 @@ $page = isset($_GET['page']) ? $_GET['page'] : ''; $pager_page_array = explode(',', $page); $pager_total_items[$element] = 0; - if ($cursor = scan_api_get_mongo($db, $collection)) { // keyword, hashtag, (url -- disabled) + if ($cursor = scan_api_get_mongo($collection)) { // V2r15 / keyword, hashtag, (url -- disabled) / varies(_np_scan_analytics_get_page_number) / none (count) try { $pager_total_items[$element] = $cursor->find($query) ->timeout(scan_api_get_mongo_timeout()) @@ -605,13 +503,12 @@ $keyword_origin_access = user_access('access keyword origin'); $collection_name = $hashtag ? 'hashtag' : 'keyword'; - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0 / trending (paged query) $fields = array('scan_id' => 1, 'trending' => 1, 'word' => 1); $query = array( 'scan_id' => 0, - 'word' => array('$exists' => TRUE), ); - $pagenumber = $full ? _np_scan_analytics_get_page_number('scan_stats', $collection_name, $limit, $query) : 0; + $pagenumber = $full ? _np_scan_analytics_get_page_number($collection_name, $limit, $query) : 0; // V2r15 / hashtag, keyword / scan_id=0 / none (count) try { $results = $cursor ->find($query, $fields) @@ -659,26 +556,25 @@ $keyword_origin_access = user_access('access keyword origin'); $collection_name = $hashtag ? 
'hashtag' : 'keyword'; - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0 / velocity.minutes:-1 paged query $query = array( 'scan_id' => 0, - 'word' => array('$exists' => TRUE), ); - $pagenumber = $full ? _np_scan_analytics_get_page_number('scan_stats', $collection_name, $limit, $query) : 0; + $pagenumber = $full ? _np_scan_analytics_get_page_number($collection_name, $limit, $query) : 0; // V2r15 / keyword, hashtag / scan_id=0 / none (count) - $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'word' => 1); + $fields = array('velocity.minutes' => 1, 'word' => 1); try { $results = $cursor ->find($query, $fields) - ->sort(array('minutes.velocity' => -1)) + ->sort(array('velocity.minutes' => -1)) ->skip($pagenumber * $limit) ->limit($limit) ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { $tweets[$row['word']] = array( - 'scan_id' => $row['scan_id'], - 'velocity' => isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] * 12 : 0, + 'scan_id' => 0, + 'velocity' => isset($row['velocity']['minutes']) ? 
$row['velocity']['minutes'] * 12 : 0, 'word' => $row['word'], ); } @@ -715,13 +611,17 @@ } if ($tweets) { - - $fields = array('word' => 1, 'hours' => 1, 'updated' => 1); $query = array( 'scan_id' => 0, 'word' => array('$in' => array_map('strtolower', $words)), ); - if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + $fields = array( + 'word' => 1, + 'hours' => 1, + 'updated' => 1, + ); + + if ($cursor = scan_api_get_mongo($collection_name)) { // V2r15 / keyword, hashtag / scan_id=0, word[] / none try { $results = $cursor ->find($query, $fields) @@ -753,7 +653,7 @@ 'word' => array('$in' => $words), 'updated' => array('$gte' => $date), ); - $cursor = scan_api_get_mongo('urls', 'url') + $cursor = scan_api_get_mongo('urls', 'url') // !V2r15 not converted -- commented out code ->find($query, $fields) ->sort(array('updated' => -1)); $i = 0; @@ -872,22 +772,22 @@ //$pagenumber = $paged ? _np_scan_analytics_get_page_number('urls', 'url', $limit, array('scan_id' => array('$in' => $scan_ids), 'category' => $category)) : 0; $pagenumber = 0; //@todo: fixme foreach ($scan_ids as $scan_id) { - $fields = array('hours.velocity' => 1, 'url_id' => 1); + $fields = array('velocity' => 1, 'url_id' => 1); $query = array( - 'scan_id' => $scan_id, - 'category' => $category, + 'scan_id' => intval($scan_id), + 'category' => intval($category), ); - if ($cursor = scan_api_get_mongo('urls', 'url')) { + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan_id, category / velocity:-1 paged query try { $results = $cursor ->find($query, $fields) - ->sort(array('hours.velocity' => -1)) + ->sort(array('velocity' => -1)) ->skip($pagenumber * $limit) ->limit($limit) ->timeout(scan_api_get_mongo_timeout()); foreach ($results as $row) { $url_ids[$row['url_id']] = $row['url_id']; - $velocity = isset($row['hours']['velocity']) ? 
$row['velocity'] : 0; if (!isset($order_data[$row['url_id']])) { $order_data[$row['url_id']] = $velocity; } @@ -904,6 +804,44 @@ } /** + * FASTER Helper function to get url statistics for different categories. Runs a single query on the 'url' collection for documents whose scan.client_id is one of $group_ids and whose category equals $category, sorted by velocity descending, and collects at most $limit distinct url_ids. Returns array($url_ids, $order_data) where $url_ids maps url_id => url_id and $order_data maps url_id => velocity (0 when the field is absent). A MongoCursorTimeoutException is swallowed and whatever was collected so far is returned. + */ +function _np_scan_analytics_group_category_velocity($group_ids, $category, $limit) { + $url_ids = array(); // @@@ Use PHP better. This can be done with a single array + $order_data = array(); // assuming the callers know how to preserve a sorted array. + $fields = array( + 'velocity' => 1, + 'url_id' => 1, + ); + $query = array( + 'scan.client_id' => array('$in' => array_values(array_map('intval', $group_ids))), + 'category' => intval($category), + ); + if ($cursor = scan_api_get_mongo('url')) { // V2r15 / url / scan.client_id[], category / velocity:-1 limited query + try { + $results = $cursor->find($query, $fields) + ->sort(array('velocity' => -1)) + // Set limit large enough so that we can generally satisfy the $limit requested. + // This is relatively "cheap". + ->limit(max(150, intval($limit))) + ->timeout(scan_api_get_mongo_timeout()); + $found = 0; + while ($found < $limit && $results->hasNext()) { + $curr = $results->getNext(); + if (!isset($url_ids[$curr['url_id']])) { + $found++; + $url_ids[$curr['url_id']] = $curr['url_id']; + $order_data[$curr['url_id']] = isset($curr['velocity']) ? $curr['velocity'] : 0; + } + } + } + catch (MongoCursorTimeoutException $e) { + } + } + return array($url_ids, $order_data); +} + +/** * param $page - one of 'links', 'images', 'videos' or NULL for all results. * param $twitter - if TRUE then find popular links across all of twitter, * otherwise just find results within the current users' groups. @@ -917,37 +855,22 @@ $videos_list = array(); $scan_ids = array(0); + $group_ids = array(); if ($page_type == 'property') { - // when we have several scan_ids we need to do some magic, otherwise we - // force the sql to do filesort to order the resultset and - // temporary table to handle the distinct clause since several scans can - // have the same url with diff velocity, so... 
- // we will fetch in these cases a diff resultset for each scan id and comb them together ourselves - // killing paging $paged = FALSE; - $scan_ids = array(-1); if ($GLOBALS['user']->og_groups) { - scan_api_set_active_shard('misc'); - $rs = db_query(" - SELECT s.scan_id - FROM {og_ancestry} o - INNER JOIN {scan_settings} ss on ss.nid = o.nid - INNER JOIN {scan} s on s.vid = ss.active_vid - WHERE o.group_nid IN (" . implode(',', array_fill(0, count($GLOBALS['user']->og_groups), '%d')) . ") ORDER BY o.nid DESC LIMIT 100", array_keys($GLOBALS['user']->og_groups)); - $scan_ids = array(); - while ($row = db_fetch_object($rs)) { - $scan_ids[] = intval($row->scan_id); - } - scan_api_set_active_shard(); - if (!$scan_ids) { - $scan_ids = array(-1); - } + $group_ids = array_keys($GLOBALS['user']->og_groups); } } // TOP LINKS if ($page == 'links' || is_null($page)) { - list($url_ids, $order_data) = _np_scan_analytics_get_url_statistics_all_time($scan_ids, 1, $paged, $limit); + if (!empty($group_ids)) { + list($url_ids, $order_data) = _np_scan_analytics_group_category_velocity($group_ids, 1, $limit); + } + else { + list($url_ids, $order_data) = _np_scan_analytics_get_url_statistics_all_time($scan_ids, 1, $paged, $limit); + } $data = array(); if ($url_ids) { @@ -993,8 +916,14 @@ // IMAGES if ($page == 'images' || is_null($page)) { $limit_override = is_null($page) ? 12 : $limit; - list($url_ids, $order_data) = _np_scan_analytics_get_url_statistics_all_time($scan_ids, 2, $paged, $limit_override); + if (!empty($group_ids)) { + list($url_ids, $order_data) = _np_scan_analytics_group_category_velocity($group_ids, 2, $limit_override); + } + else { + list($url_ids, $order_data) = _np_scan_analytics_get_url_statistics_all_time($scan_ids, 2, $paged, $limit_override); + } + $data = array(); if ($url_ids) { $url_id_placeholders = implode(',', array_fill(0, count($url_ids), '%d')); @@ -1041,7 +970,12 @@ // VIDEOS if ($page == 'videos' || is_null($page)) { $limit_override = is_null($page) ? 
4 : $limit; - list($url_ids, $order_data) = _np_scan_analytics_get_url_statistics_all_time($scan_ids, 3, $paged, $limit_override); + if (!empty($group_ids)) { + list($url_ids, $order_data) = _np_scan_analytics_group_category_velocity($group_ids, 3, $limit_override); + } + else { + list($url_ids, $order_data) = _np_scan_analytics_get_url_statistics_all_time($scan_ids, 3, $paged, $limit_override); + } $data = array(); if ($url_ids) {