Index: scan_api.module =================================================================== --- scan_api.module (revision 34200) +++ scan_api.module (working copy) @@ -291,8 +291,8 @@ // $scan_id,, $count, $max_age, $format extract(_scan_get_args()); // Now get items. + $items_needed = $count; $return = array(); - $items = array(); $params = array(); $filters = _scan_api_fetch_blocking_data('user', array($scan_id)); @@ -312,13 +312,21 @@ array_push($params, $scan_id, $max_age); scan_api_set_active_shard('posts'); if ($total = db_result(db_query("SELECT COUNT(*) FROM {scan_items} si " . $extra_join . " WHERE si.scan_id = %d AND si.created >= DATE_SUB(NOW(), INTERVAL %d SECOND)" . $extra_where, $params))) { - $result = db_query_range("SELECT si.item_id FROM {scan_items} si " . $extra_join . " WHERE si.scan_id = %d AND si.created >= DATE_SUB(NOW(), INTERVAL %d SECOND)" . $extra_where . " ORDER BY si.created DESC", $params, $offset, $count); - while ($item = db_fetch_array($result)) { - $items[] = $item['item_id']; - } + do { + $items = array(); + scan_api_set_active_shard('posts'); + $result = db_query_range("SELECT si.item_id FROM {scan_items} si " . $extra_join . " WHERE si.scan_id = %d AND si.created >= DATE_SUB(NOW(), INTERVAL %d SECOND)" . $extra_where . 
" ORDER BY si.created DESC", $params, $offset, $count); + while ($item = db_fetch_array($result)) { + $items[] = $item['item_id']; + } + scan_api_set_active_shard(); + $offset += $count; + $count = $count*2; + $return_items = _scan_api_items_helper($items, $scan_id); + $return = array_merge($return, $return_items); + } while (count($return) < $items_needed && !empty($items)); } - $return = _scan_api_items_helper($items, $scan_id); - + $return = array_slice($return, 0, $items_needed); scan_api_set_active_shard(); print _scan_api_format(array('items' => $total, 'data' => $return), $format); } @@ -502,27 +510,35 @@ function scan_get_preview_items() { // $scan_id,, $count, $max_age, $format extract(_scan_get_args()); + $needed_items = $count; + $result = array(); // Now get items. scan_api_set_active_shard('misc'); $sphinx = _scan_get_sphinx_client($offset, $count, $max_age); $query = _scan_get_sphinx_query($and_terms, $or_terms, $exclude_terms, $sphinx); - if (!empty($loc_includes) || !empty($loc_excludes)) { - $sphinx_result = _scan_api_sphinx_location_search($loc_includes, $loc_excludes, $query, $offset, $count, $max_age); - } - else { - if (!$sphinx_result = $sphinx->Query($query, 'main')) { - sleep(1); - $sphinx_result = $sphinx->Query($query, 'main'); + do { + if (!empty($loc_includes) || !empty($loc_excludes)) { + $sphinx_result = _scan_api_sphinx_location_search($loc_includes, $loc_excludes, $query, $offset, $count, $max_age); } - } - $ids = array(); - if (!empty($sphinx_result) && !empty($sphinx_result['matches'])) { - foreach($sphinx_result['matches'] as $id => $match) { - $ids[] = (int)$id; + else { + if (!$sphinx_result = $sphinx->Query($query, 'main')) { + sleep(1); + $sphinx_result = $sphinx->Query($query, 'main'); + } } - } - $items = _scan_api_items_helper($ids, $scan_id, FALSE); + $ids = array(); + if (!empty($sphinx_result) && !empty($sphinx_result['matches'])) { + foreach($sphinx_result['matches'] as $id => $match) { + $ids[] = (int)$id; + } + } + 
$items = _scan_api_items_helper($ids, $scan_id, FALSE); + $result = array_merge($items, $result); + $offset += $count; + $count = $count*2; + $sphinx->setLimits((int)$offset, (int)$count, variable_get('np_scan_api_sphinx_maxmatches', 5000)); + } while (!empty($sphinx_result) && count($result) < $needed_items); if (empty($sphinx_result)) { print _scan_api_error($sphinx->GetLastError() . ' :: ' . $sphinx->GetLastWarning(), $format); @@ -530,14 +546,16 @@ } scan_api_set_active_shard(); - print _scan_api_format(array('items' => $sphinx_result['total_found'], 'data' => $items), $format); + $result = array_slice($result, 0, $needed_items); + print _scan_api_format(array('items' => $sphinx_result['total_found'], 'data' => $result), $format); } function scan_top_items() { - // $scan_id,, $count, $max_age, $format + // $scan_id, $count, $max_age, $format extract(_scan_get_args()); // Now get items. - $items = array(); + $return = array(); + $items_needed = $count; $params = array(); $filters = _scan_api_fetch_blocking_data('user', array($scan_id)); @@ -557,13 +575,22 @@ array_push($params, $scan_id, $max_age); scan_api_set_active_shard('posts'); if ($total = db_result(db_query("SELECT COUNT(*) FROM {scan_items} si " . $extra_join . " WHERE si.scan_id = %d AND si.created >= DATE_SUB(NOW(), INTERVAL %d SECOND)" . $extra_where, $params))) { - $result = db_query_range("SELECT si.item_id FROM {scan_items} si " . $extra_join . " WHERE si.scan_id = %d AND si.created >= DATE_SUB(NOW(), INTERVAL %d SECOND)" . $extra_where . " ORDER BY si.recommend DESC, si.created DESC", $params, $offset, $count); - while ($item = db_fetch_array($result)) { - $items[] = $item['item_id']; - } + do { + $items = array(); + scan_api_set_active_shard('posts'); + $result = db_query_range("SELECT si.item_id FROM {scan_items} si " . $extra_join . " WHERE si.scan_id = %d AND si.created >= DATE_SUB(NOW(), INTERVAL %d SECOND)" . $extra_where . 
" ORDER BY si.recommend DESC, si.created DESC", $params, $offset, $count); + while ($item = db_fetch_array($result)) { + $items[] = $item['item_id']; + } + scan_api_set_active_shard(); + $offset += $count; + $count = $count*2; + $return_items = _scan_api_items_helper($items, $scan_id); + $return = array_merge($return, $return_items); + } while (count($return) < $items_needed && !empty($items)); } scan_api_set_active_shard(); - $return = _scan_api_items_helper($items, $scan_id); + $return = array_slice($return, 0, $items_needed); print _scan_api_format(array('items' => $total, 'data' => $return), $format); } @@ -578,7 +605,7 @@ } scan_api_set_active_shard('misc'); $settings = db_fetch_object(db_query(' - SELECT ss.content_filter, ss.replace_word, ss.use_globals, gs.content_filter AS global_filter, gs.replace_word AS global_replace + SELECT ss.content_filter, ss.replace_word, ss.use_globals, gs.content_filter AS global_filter, gs.replace_word AS global_replace,gs.filter_preferences FROM {scan} s INNER JOIN {scan_settings} ss ON ss.nid = s.nid INNER JOIN {og_ancestry} og ON s.nid = og.nid @@ -620,9 +647,16 @@ while ($o = db_fetch_array($result)) { $o['recommend'] = isset($recommendations[$o['item_id']]) ? $recommendations[$o['item_id']] : 0; $o['created'] = gmdate('c', strtotime($o['created'])); + $new_body = $o['body']; if ($filters) { - $o['body'] = str_replace($filters, $replace, $o['body']); + $new_body = str_replace($filters, $replace, $o['body']); } + if ($new_body != $o['body'] && $settings->filter_preferences == 1) { + continue; + } + else { + $o['body'] = $new_body; + } // @TODO: XML formatting. 
if ($pre_ordered) { $return[$o['item_id']] = $o; @@ -648,7 +682,7 @@ $empty_result = FALSE; $related_keywords = array(); $related_hashtags = array(); - $scan_ids = array($scan_id); + $scan_ids = array(intval($scan_id)); if ($related) { scan_api_set_active_shard('misc', 'scan'); // look for scans that has the words @@ -718,6 +752,7 @@ */ function _scan_top_keywords(&$velocity, &$trending, $type, $scan_ids, $order, $count, $related, $related_keywords, $related_hashtags) { $interval = ($type == 1) ? 'minutes' : 'hours'; + $multiplier = ($type == 1) ? 12 : 1; $velocity_field = $interval . '.velocity'; foreach (array('keyword', 'hashtag') as $collection_name) { if ($related) { @@ -729,37 +764,45 @@ if ($scan_ids && (!$related || ($related && isset($query['word'])))) { $order_field = ($order == 'velocity') ? $velocity_field : 'trending'; - $collection = scan_api_get_mongo('scan_stats', $collection_name); // keyword, hashtag + $cursor = scan_api_get_mongo('scan_stats', $collection_name); // keyword, hashtag + if (!$cursor) { + return; + } $data = array(); foreach ($scan_ids as $scan_id) { $fields = array($velocity_field => 1, 'trending' => 1, 'word' => 1); $query['scan_id'] = intval($scan_id); - $cursor = $collection->find($query, $fields) - ->sort(array($order_field => -1)) - ->limit($count); - foreach ($cursor as $row) { - if (!isset($row['word'])) { - // bug in workers. they write empty word records... bad - continue; + try { + $results = $cursor->find($query, $fields) + ->sort(array($order_field => -1)) + ->limit($count) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($results as $row) { + if (!isset($row['word'])) { + // bug in workers. they write empty word records... bad + continue; + } + if ($collection_name == 'hashtag') { + $row['word'] = '#' . $row['word']; + } + if ($order == 'velocity') { + $order_data = isset($row[$interval]['velocity']) ? 
$row[$interval]['velocity'] : 0; + } + else { + $order_data = $row['trending']; + } + if (!isset($ordering_data[$row['word']]) || ($ordering_data[$row['word']] < $order_data)) { + $ordering_data[$row['word']] = $order_data; + $data[$row['word']] = array( + 'velocity' => isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] * $multiplier: 0, + 'trending' => $row['trending'], + 'word' => $row['word'], + ); + } } - if ($collection_name == 'hashtag') { - $row['word'] = '#' . $row['word']; - } - if ($order == 'velocity') { - $order_data = isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] : 0; - } - else { - $order_data = $row['trending']; - } - if (!isset($ordering_data[$row['word']]) || ($ordering_data[$row['word']] < $order_data)) { - $ordering_data[$row['word']] = $order_data; - $data[$row['word']] = array( - 'velocity' => isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] : 0, - 'trending' => $row['trending'], - 'word' => $row['word'], - ); - } } + catch (MongoCursorTimeoutException $e) { + } } if ($data) { arsort($ordering_data); @@ -798,28 +841,36 @@ $result = array(); $interval = ($type == 1) ? 'minutes' : 'hours'; + $multiplier = ($type == 1) ? 12 : 1; $velocity_field = $interval . '.velocity'; - $collection = $is_hashtag ? 'hashtag' : 'keyword'; + $collection_name = $is_hashtag ? 'hashtag' : 'keyword'; $fields = array($velocity_field => 1, 'trending' => 1, 'word' => 1); $query = array( 'scan_id' => intval($scan_id), 'word' => array('$in' => array_map('strtolower', $keywords)), ); - $cursor = scan_api_get_mongo('scan_stats', $collection) // keyword, hashtag - ->find($query, $fields) - ->sort(array($velocity_field => -1)); - if ($multirow) { - foreach ($cursor as $row) { - if ($is_hashtag) { - $row['word'] = '#' . 
$row['word']; + if ($cursor = scan_api_get_mongo('scan_stats', $collection_name)) { // keyword, hashtag + try { + $cursor = $cursor + ->find($query, $fields) + ->sort(array($velocity_field => -1)) + ->timeout(scan_api_get_mongo_timeout()); + if ($multirow) { + foreach ($cursor as $row) { + if ($is_hashtag) { + $row['word'] = '#' . $row['word']; + } + $result[$row['word']] = isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] * $multiplier : 0; + } } - $result[$row['word']] = isset($row[$interval]['velocity']) ? $row[$interval]['velocity'] : 0; + elseif ($cursor->hasNext()) { + $result = $cursor->getNext(); + if (isset($result[$interval]['velocity'])) { + $result['velocity'] = $result[$interval]['velocity'] * $multiplier; + } + } } - } - elseif ($cursor->hasNext()) { - $result = $cursor->getNext(); - if (isset($result[$interval]['velocity'])) { - $result['velocity'] = $result[$interval]['velocity']; + catch (MongoCursorTimeoutException $e) { } } return $result; @@ -936,18 +987,24 @@ $query = array( 'scan_id' => intval($scan_id), ); - $cursor = scan_api_get_mongo('scan_stats', 'scan') - ->find($query, $fields); - if ($cursor->hasNext()) { - $row = $cursor->getNext(); - $velocity = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] : 0; - $prev_velocity = isset($row['minutes']['prev_velocity']) ? $row['minutes']['prev_velocity'] : 0; - $return = array( - 'velocity' => $velocity, - 'difference' => $velocity - $prev_velocity, - ); + if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + try { + $cursor = $cursor + ->find($query, $fields) + ->timeout(scan_api_get_mongo_timeout()); + if ($cursor->hasNext()) { + $row = $cursor->getNext(); + $velocity = isset($row['minutes']['velocity']) ? round(12 * $row['minutes']['velocity']) : 0; + $prev_velocity = isset($row['minutes']['prev_velocity']) ? 
round(12 * $row['minutes']['prev_velocity']) : 0; + $return = array( + 'velocity' => $velocity, + 'difference' => $velocity - $prev_velocity, + ); + } + } + catch (MongoCursorTimeoutException $e) { + } } - print _scan_api_format($return, $format); } @@ -963,28 +1020,29 @@ 'photo' => 0, 'video' => 0, ); - - $fields = array( - 'days.velocity' => 1, - 'days.general.velocity' => 1, - 'days.photo.velocity' => 1, - 'days.video.velocity' => 1, + $categories = array( + SCAN_CATEGORY_URLS => 'general', + SCAN_CATEGORY_PHOTOS => 'photo', + SCAN_CATEGORY_VIDEOS => 'video', ); - $query = array( - 'scan_id' => intval($scan_id), - ); - $cursor = scan_api_get_mongo('urls', 'scanurl') - ->find($query, $fields); - if ($cursor->hasNext()) { - $row = $cursor->getNext(); - $return = array( - 'summary' => $row['days']['velocity'], - 'general' => $row['days']['general']['velocity'], - 'photo' => $row['days']['photo']['velocity'], - 'video' => $row['days']['video']['velocity'], - ); + if ($cursor = scan_api_get_mongo('urls', 'url')) { + foreach ($categories as $category => $return_key) { + $query = array( + 'scan_id' => intval($scan_id), + 'category' => $category, + ); + try { + $count = $cursor + ->find($query) + ->timeout(scan_api_get_mongo_timeout()) + ->count(); + $return[$return_key] = $count; + $return['summary'] += $count; + } + catch (MongoCursorTimeoutException $e) { + } + } } - print _scan_api_format($return, $format); } @@ -997,6 +1055,14 @@ if ($xcache) { $cid = "group_blocked_urls:$client_id"; $blocked_url_ids = xcache_get($cid); + // Fix blocked url's in cache that were previously saved as strings. + // We can remove this code once we restart xcache after 12/11/09. 
+ if ($blocked_url_ids && $blocked_url_ids[0] !== intval($blocked_url_ids[0])) { + foreach ($blocked_url_ids as $key => $id) { + $blocked_url_ids[$key] = intval($id); + } + xcache_set($cid, $blocked_url_ids); + } } if (!is_array($blocked_url_ids)) { $blocked_url_ids = array(); @@ -1007,7 +1073,7 @@ WHERE group_id = %d ", $client_id); while ($row = db_fetch_array($result)) { - $blocked_url_ids[] = $row['url_id']; + $blocked_url_ids[] = intval($row['url_id']); } scan_api_set_active_shard(); if ($xcache && $blocked_url_ids) { @@ -1021,9 +1087,13 @@ /** * Helper function to get ordered url statistics for different categories. */ -function _scan_api_get_url_statistics_all_time($client_id, $scan_ids, $category) { - $return = array(); - $blocked_url_ids = _scan_api_get_blocked_urls($client_id); +function _scan_api_get_url_statistics_all_time($client_id, $scan_ids, $category, $skip, $limit, $previous = array()) { + $return = array( + 'full' => array(), + 'url_ids' => array(), + 'scan_ids' => array(), + ); + $blocked_url_ids = array_merge(_scan_api_get_blocked_urls($client_id), $previous); if ($scan_ids && $category) { // Though scan_id is needed for $scantag_only, it's not efficient to check @@ -1033,24 +1103,66 @@ 'scan_id' => array('$in' => $scan_ids), 'category' => $category, ); - if ( $blocked_url_ids ) { - $query['url_id'] = array('$nin' => $blocked_url_ids); + $range = $blocked_url_ids ? sizeof($blocked_url_ids) : 0; + if ($cursor = scan_api_get_mongo('urls', 'url')) { + try { + $cursor = $cursor + ->find($query, $fields) + ->sort(array('count' => -1)) + ->skip($skip) + ->limit($limit + $range) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + if (!in_array($row['url_id'], $blocked_url_ids)) { + $return['full'][$row['url_id']] = $row; + $return['url_ids'][] = $row['url_id']; + // The index is used to make the array unique. 
+ $return['scan_ids'][$row['scan_id']] = $row['scan_id']; + if (count($return['url_ids']) >= $limit) { + break; + } + } + } + } + catch (MongoCursorTimeoutException $e) { + } } - $cursor = scan_api_get_mongo('urls', 'url') - ->find($query, $fields) - ->sort(array('count' => -1)) - ->limit(1000); - foreach ($cursor as $row) { - $return['full'][$row['url_id']] = $row; - $return['url_ids'][] = $row['url_id']; - // The index is used to make the array unique. - $return['scan_ids'][$row['scan_id']] = $row['scan_id']; - } } return $return; } /** + * Helper function to count url statistics for different categories. + */ +function _scan_api_get_url_statistics_all_time_count($client_id, $scan_ids, $category) { + if ($scan_ids && $category) { + $query = array( + 'scan_id' => array('$in' => $scan_ids), + 'category' => $category, + ); + if ($cursor = scan_api_get_mongo('urls', 'url')) { + try { + $count = $cursor + ->find($query) + ->timeout(scan_api_get_mongo_timeout()) + ->count(); + if ($blocked_url_ids = _scan_api_get_blocked_urls($client_id)) { + $query['url_id'] = array('$in' => $blocked_url_ids); + $count -= $cursor + ->find($query) + ->timeout(scan_api_get_mongo_timeout()) + ->count(); + } + return $count; + } + catch (MongoCursorTimeoutException $e) { + } + } + } + return 0; +} + +/** * These lines of code were used in 3 places. make a function for it. */ function _scan_api_correct_scan_ids_client_id($scan_id, &$client_id, &$scan_ids) { @@ -1067,205 +1179,154 @@ $client_id = _scan_get_client_id($scan_id); } } + /** - * All time most mentioned general urls for a given scan + * All time most mentioned general urls/photos/videos for a given scan. */ -function scan_top_urls() { - // $scan_id,, $count, $max_age, $format +function scan_get_links($category) { + // $scan_id, $count, $max_age, $format extract(_scan_get_args()); + $scan_ids = array(intval($scan_id)); // Now get keywords. 
$return = array(); - $scan_ids = array(intval($scan_id)); + $ret = array(); // Populate client_id and scan_ids as necessary _scan_api_correct_scan_ids_client_id($scan_id, $client_id, $scan_ids); - $result = _scan_api_get_url_statistics_all_time($client_id, $scan_ids, SCAN_CATEGORY_URLS); - if ($result) { - $return = $result['full']; - $url_ids = $result['url_ids']; - $scan_id_results = $result['scan_ids']; - $args = $url_ids; - $url_id_placeholders = implode(',', array_fill(0, count($url_ids), '%d')); - $filters = _scan_api_fetch_blocking_data('url', $scan_id_results); - $extra_join = ''; - if ($scantag_only) { - $scan_id_placeholders = implode(',', array_fill(0, count($scan_id_results), '%d')); - $args = array_merge($scan_id_results, $args); - $extra_join = "INNER JOIN {scan_urls} su USE INDEX(approved_join_idx) ON (su.url_id = u.id AND su.scan_id IN ($scan_id_placeholders) AND su.approved > 0)"; + // Limit the number of url statistics we get. + // When filters are applied, we don't know exactly how many we need, + // because we don't know how many will be filtered out. + // But we make a "best" initial guess, and then increase the number if needed. + $skip = 0; + $loop = 0; + $blacklist_size = 0; + $previous = array(); + while (count($ret) < $offset + $count && $loop < 8) { + // On each loop, get a few more. + $limit = pow(2, $loop) * $count; + // If we're on the first offset, get a few more mongo records initially, + // so that we can catch up to the right record. 
+ if (count($ret) == 0 && $offset > 0) { + $limit += $offset; } - scan_api_set_active_shard('misc'); - $total = db_result(db_query(" - SELECT COUNT(*) - FROM {urls} u - $extra_join - WHERE u.id IN ($url_id_placeholders)", $args)); - $rs = db_query(" - SELECT u.resolved as url, u.title, u.id - FROM {urls} u - $extra_join - WHERE u.id IN ($url_id_placeholders)", $args); - $blacklist_size = 0; - while ($row = db_fetch_array($rs)) { - $blacklisted = FALSE; - foreach ($filters as $filter) { - if (strpos($row['url'], $filter) !== FALSE) { - $blacklisted = TRUE; - $blacklist_size++; + // Get the results from Mongo. + $result = _scan_api_get_url_statistics_all_time($client_id, $scan_ids, $category, $skip, $limit, $previous); + if ($result && $result['url_ids']) { + $return = $result['full']; + $url_ids = $result['url_ids']; + $scan_id_results = $result['scan_ids']; + $previous = array_merge($previous, $result['scan_ids']); + $args = $url_ids; + $url_id_placeholders = implode(',', array_fill(0, count($url_ids), '%d')); + $filters = _scan_api_fetch_blocking_data('url', $scan_id_results); + $extra_join = ''; + if ($scantag_only) { + $scan_id_placeholders = implode(',', array_fill(0, count($scan_id_results), '%d')); + $args = array_merge($scan_id_results, $args); + $extra_join = "INNER JOIN {scan_urls} su USE INDEX(approved_join_idx) ON (su.url_id = u.id AND su.scan_id IN ($scan_id_placeholders) AND su.approved > 0)"; + } + + scan_api_set_active_shard('misc'); + switch ($category) { + case SCAN_CATEGORY_URLS: + $result = db_query_range(" + SELECT u.resolved as url, u.title, u.id + FROM {urls} u + $extra_join + WHERE u.id IN (" . $url_id_placeholders . ") + ", $args, 0, $limit); break; - } + + case SCAN_CATEGORY_PHOTOS: + $result = db_query_range(" + SELECT u.id, u.resolved as url, u.title, si.original_path, si.thumb_path + FROM {urls} u + $extra_join + INNER JOIN {scan_images} si ON si.url_id = u.id + WHERE si.thumb_path != '' AND u.id IN (" . $url_id_placeholders . 
") + ", $args, 0, $limit); + break; + + case SCAN_CATEGORY_VIDEOS: + $result = db_query_range(" + SELECT u.id, u.resolved as url, u.title, se.domain, se.video_id + FROM {urls} u + $extra_join + INNER JOIN {scan_embeds} se ON se.url_id = u.id + WHERE u.id IN (" . $url_id_placeholders . ") + ", $args, 0, $limit); + break; } - if (!$blacklisted) { - if (empty($row['title'])) { - $row['title'] = $row['url']; + while ($row = db_fetch_array($result)) { + $blacklisted = FALSE; + foreach ($filters as $filter) { + if (strpos($row['url'], $filter) !== FALSE) { + $blacklisted = TRUE; + $blacklist_size++; + unset($return[$row['id']]); + break; + } } - $return[$row['id']] += $row; + + if (!$blacklisted) { + // Fix some values and add them to the return array. + switch ($category) { + case SCAN_CATEGORY_URLS: + if (empty($row['title'])) { + $row['title'] = $row['url']; + } + break; + case SCAN_CATEGORY_PHOTOS: + foreach (array('original_path', 'thumb_path') as $key) { + $path = basename($row[$key]); + $row[$key] = variable_get('np_scan_media_server', 'http://media.scan.nowpublic.com/') . substr($path, 0, 1) . '/' . substr($path, 1, 1) . substr($path, 2, 1) . '/'. $path; + } + break; + case SCAN_CATEGORY_VIDEOS: + $row['embed_code'] = _api_embed_code($row['domain'], $row['video_id']); + break; + } + $return[$row['id']] += $row; + } } + $return = array_filter($return, '_scan_api_filter_urls'); + + scan_api_set_active_shard(); } - scan_api_set_active_shard(); + else { + // If mongo returns nothing, then that's all we have. + break; + } + $ret += $return; + $skip += $limit; + $loop ++; + } - $return = array_filter($return, '_scan_api_filter_urls'); - $return = array_slice($return, $offset, $count); - print _scan_api_format(array('items' => min($total - $blacklist_size, 1000), 'data' => $return), $format); + + $total = $ret ? _scan_api_get_url_statistics_all_time_count($client_id, $scan_ids, $category) : 0; + // The limit must be applied to the return array. 
+ $ret = array_slice($ret, $offset, $count); + print _scan_api_format(array('items' => min($total - $blacklist_size, 1000), 'data' => $ret), $format); } /** + * All time most mentioned general urls for a given scan + */ +function scan_top_urls() { + scan_get_links(SCAN_CATEGORY_URLS); +} +/** * All time most mentioned photos for a given scan */ function scan_get_photos() { - // $scan_id,, $count, $max_age, $format - extract(_scan_get_args()); - // Now get keywords. - $return = array(); - - $scan_ids = array(intval($scan_id)); - - // Populate client_id and scan_ids as necessary - _scan_api_correct_scan_ids_client_id($scan_id, $client_id, $scan_ids); - - $result = _scan_api_get_url_statistics_all_time($client_id, $scan_ids, SCAN_CATEGORY_PHOTOS); - if ($result) { - $return = $result['full']; - $url_ids = $result['url_ids']; - $scan_id_results = $result['scan_ids']; - - $args = $url_ids; - $url_id_placeholders = implode(',', array_fill(0, count($url_ids), '%d')); - $filters = _scan_api_fetch_blocking_data('url', $scan_id_results); - $extra_join = ''; - - if ($scantag_only) { - $scan_id_placeholders = implode(',', array_fill(0, count($scan_id_results), '%d')); - $args = array_merge($scan_id_results, $args); - $extra_join = "INNER JOIN {scan_urls} su USE INDEX(approved_join_idx) ON (su.url_id = u.id AND su.scan_id IN ($scan_id_placeholders) AND su.approved > 0)"; - } - - scan_api_set_active_shard('misc'); - $total = db_result(db_query(" - SELECT COUNT(*) - FROM {urls} u - $extra_join - INNER JOIN {scan_images} si ON si.url_id = u.id - WHERE u.id IN (" . $url_id_placeholders .") - ", $args)); - $result = db_query(" - SELECT u.id, u.resolved as url, u.title, si.original_path, si.thumb_path - FROM {urls} u - $extra_join - INNER JOIN {scan_images} si ON si.url_id = u.id - WHERE si.thumb_path != '' AND u.id IN (" . $url_id_placeholders . 
") - ", $args); - $blacklist_size = 0; - while ($row = db_fetch_array($result)) { - $blacklisted = FALSE; - foreach ($filters as $filter) { - if (strpos($row['url'], $filter) !== FALSE) { - $blacklisted = TRUE; - $blacklist_size++; - break; - } - } - if (!$blacklisted) { - foreach (array('original_path', 'thumb_path') as $key) { - $path = basename($row[$key]); - $row[$key] = 'http://www.scan.nowpublic.com/media/'. substr($path, 0, 1) . '/' . substr($path, 1, 1) . substr($path, 2, 1) . '/'. $path; - } - $return[$row['id']] += $row; - } - } - scan_api_set_active_shard(); - } - $return = array_filter($return, '_scan_api_filter_urls'); - $return = array_slice($return, $offset, $count); - print _scan_api_format(array('items' => min($total - $blacklist_size, 1000), 'data' => $return), $format); + scan_get_links(SCAN_CATEGORY_PHOTOS); } /** * All time most mentioned videos for a given scan */ function scan_get_videos() { - // $scan_id,, $count, $max_age, $format - extract(_scan_get_args()); - // Now get keywords. 
- $return = array(); - - $scan_ids = array(intval($scan_id)); - - // Populate client_id and scan_ids as necessary - _scan_api_correct_scan_ids_client_id($scan_id, $client_id, $scan_ids); - - $result = _scan_api_get_url_statistics_all_time($client_id, $scan_ids, SCAN_CATEGORY_VIDEOS); - if ($result) { - $return = $result['full']; - $url_ids = $result['url_ids']; - $scan_id_results = $result['scan_ids']; - - $args = $url_ids; - $url_id_placeholders = implode(',', array_fill(0, count($url_ids), '%d')); - $filters = _scan_api_fetch_blocking_data('url', $scan_id_results); - $extra_join = ''; - - if ($scantag_only) { - $scan_id_placeholders = implode(',', array_fill(0, count($scan_id_results), '%d')); - $args = array_merge($scan_id_results, $args); - $extra_join = "INNER JOIN {scan_urls} su USE INDEX(approved_join_idx) ON (su.url_id = u.id AND su.scan_id IN ($scan_id_placeholders) AND su.approved > 0)"; - } - - scan_api_set_active_shard('misc'); - $total = db_result(db_query(" - SELECT COUNT(*) - FROM {urls} u - $extra_join - INNER JOIN {scan_embeds} se ON se.url_id = u.id - WHERE u.id IN (" . $url_id_placeholders .") - ", $args)); - $rs = db_query(" - SELECT u.id, u.resolved as url, u.title, se.domain, se.video_id - FROM {urls} u - $extra_join - INNER JOIN {scan_embeds} se ON se.url_id = u.id - WHERE u.id IN (" . 
$url_id_placeholders .") - ", $args); - $blacklist_size = 0; - while ($row = db_fetch_array($rs)) { - $blacklisted = FALSE; - foreach ($filters as $filter) { - if (strpos($row['url'], $filter) !== FALSE) { - $blacklisted = TRUE; - $blacklist_size++; - unset($return[$row['id']]); - break; - } - } - if (!$blacklisted) { - $row['embed_code'] = _api_embed_code($row['domain'], $row['video_id']); - $return[$row['id']] += $row; - } - } - scan_api_set_active_shard(); - } - $return = array_filter($return, '_scan_api_filter_urls'); - $return = array_slice($return, $offset, $count); - print _scan_api_format(array('items' => min($total - $blacklist_size, 1000), 'data' => $return), $format); + scan_get_links(SCAN_CATEGORY_VIDEOS); } function scan_top_users() { @@ -1322,7 +1383,7 @@ $interval_size_ = scan_api_interval_size($collection, $time_type); $interval_count_ = scan_api_interval_count($collection, $time_type); - + // This is a direct port of the C++ code. // int i = when % (interval_size_ * interval_count_); @@ -1435,28 +1496,35 @@ $epoch = gmdate('c', 0); $return = array_fill(0, $count, array('count' => 0, 'start_time' => $epoch, 'last_occurence' => $epoch + 3600)); - $fields = array('hours' => 1, 'updated' => 1); - $query = array( - 'scan_id' => intval($scan_id), - ); - $cursor = scan_api_get_mongo('statistics', 'scan') - ->find($query); + if ($cursor = scan_api_get_mongo('statistics', 'scan')) { + $fields = array('hours' => 1, 'updated' => 1); + $query = array( + 'scan_id' => intval($scan_id), + ); + try { + $cursor = $cursor + ->find($query) + ->timeout(scan_api_get_mongo_timeout()); - if ($cursor->hasNext()) { - $row = $cursor->getNext(); - $updated = explode(' ', (string)$row['updated']); - $bucket = scan_api_reorder_scan_time_buckets($row['hours'], 'scan', 'hours', $updated[1]); + if ($cursor->hasNext()) { + $row = $cursor->getNext(); + $updated = explode(' ', (string)$row['updated']); + $bucket = scan_api_reorder_scan_time_buckets($row['hours'], 'scan', 'hours', 
$updated[1]); - $now = mktime(date('H'), 0, 0 ); - foreach($bucket as $index => $value ) { - if ( $count == $index ) { - break; + $now = mktime(date('H'), 0, 0 ); + foreach($bucket as $index => $value ) { + if ( $count == $index ) { + break; + } + $return[$index] = array( + 'count' => $value, + 'start_time' => gmdate('c', $now - ($index * 60 * 60)), + ); + } } - $return[$index] = array( - 'count' => $value, - 'start_time' => gmdate('c', $now - ($index * 60 * 60)), - ); } + catch (MongoCursorTimeoutException $e) { + } } print _scan_api_format($return, $format); } @@ -1497,14 +1565,21 @@ // fill up the stats if ($return) { $scan_ids = array_keys($return); - $fields = array('scan_id' => 1, 'minutes.velocity' => 1); + $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'hours.velocity'); $query = array( 'scan_id' => array('$in' => $scan_ids), ); - $cursor = scan_api_get_mongo('scan_stats', 'scan') - ->find($query, $fields); - foreach ($cursor as $row) { - $data[$row['scan_id']]['velocity'] = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] : 0; + if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + try { + $cursor = $cursor + ->find($query, $fields) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + $return[$row['scan_id']]['velocity'] = isset($row['minutes']['velocity']) ? 
round(12 * $row['minutes']['velocity']) : 0; + } + } + catch (MongoCursorTimeoutException $e) { + } } } } @@ -1513,60 +1588,75 @@ // get a basic data structure in ordered form $fields = array('scan_id' => 1, 'minutes.velocity' => 1); $query = array( - 'category' => $client_id, 'status' => 1, + 'scan.client_id' => intval($client_id), 'scan.status' => 1, 'scan.active' => 1, ); - $cursor = scan_api_get_mongo('scan_stats', 'scan') - ->find($query, $fields) - ->sort(array('minutes.velocity' => -1)) - ->limit($count); - foreach ($cursor as $row) { - $return[$row['scan_id']] = array( - 'scan_id' => $row['scan_id'], - 'velocity' => isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] : 0, - 'general' => 0, - 'photo' => 0, - 'video' => 0, - 'summary' => 0, - ); - } - if ($return) { - $scan_id_placeholders = implode(',', array_fill(0, count($return), '%d')); - $scan_ids = array_keys($return); - // fill up with basic info - scan_api_set_active_shard('misc'); - $rs = db_query(" - SELECT s.scan_id, s.nid, s.and_words, s.or_words, nr.title, n.created, n.np_views_page as pageviews, n.recommend - FROM {scan} s - INNER JOIN {scan_settings} ss ON s.vid = ss.active_vid - INNER JOIN {node_revisions} nr ON s.vid = nr.vid - INNER JOIN {node} n ON s.nid = n.nid - WHERE s.scan_id IN (" . $scan_id_placeholders . ") - ", $scan_ids); - while ($row = db_fetch_array($rs)) { - $row['created'] = gmdate('c', $row['created']); - $return[$row['scan_id']] += $row; + if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + try { + $cursor = $cursor + ->find($query, $fields) + ->sort(array('minutes.velocity' => -1)) + ->limit($count) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + $return[$row['scan_id']] = array( + 'scan_id' => $row['scan_id'], + 'velocity' => isset($row['minutes']['velocity']) ? 
round(12 * $row['minutes']['velocity']) : 0, + 'general' => 0, + 'photo' => 0, + 'video' => 0, + 'summary' => 0, + ); + } } + catch (MongoCursorTimeoutException $e) { + } + if ($return) { + $scan_id_placeholders = implode(',', array_fill(0, count($return), '%d')); + $scan_ids = array_keys($return); + // fill up with basic info + scan_api_set_active_shard('misc'); + $rs = db_query(" + SELECT s.scan_id, s.nid, s.and_words, s.or_words, nr.title, n.created, n.np_views_page as pageviews, n.recommend + FROM {scan} s + INNER JOIN {scan_settings} ss ON s.vid = ss.active_vid + INNER JOIN {node_revisions} nr ON s.vid = nr.vid + INNER JOIN {node} n ON s.nid = n.nid + WHERE s.scan_id IN (" . $scan_id_placeholders . ") + ", $scan_ids); + while ($row = db_fetch_array($rs)) { + $row['created'] = gmdate('c', $row['created']); + $return[$row['scan_id']] += $row; + } + } } } // common query for both cases: if we have data we need to fill up url uniq stats if ($return) { $fields = array( 'scan_id' => 1, - 'hours.velocity24' => 1, - 'hours.general.velocity24' => 1, - 'hours.photo.velocity24' => 1, - 'hours.video.velocity24' => 1, + 'days.velocity' => 1, + 'days.general.velocity' => 1, + 'days.photo.velocity' => 1, + 'days.video.velocity' => 1, ); $query = array( 'scan_id' => array('$in' => $scan_ids), ); - $cursor = scan_api_get_mongo('urls', 'scanurl') - ->find($query, $fields); - foreach ($cursor as $row) { - $return[$row['scan_id']]['summary'] = $row['hours']['velocity24']; - $return[$row['scan_id']]['general'] = $row['hours']['general']['velocity24']; - $return[$row['scan_id']]['photo'] = $row['hours']['photo']['velocity24']; - $return[$row['scan_id']]['video'] = $row['hours']['video']['velocity24']; + if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { + try { + $cursor = $cursor + ->find($query, $fields) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + $return[$row['scan_id']]['summary'] = round($row['days']['velocity'] * 30); + 
$return[$row['scan_id']]['general'] = round($row['days']['general']['velocity'] * 30); + $return[$row['scan_id']]['photo'] = round($row['days']['photo']['velocity'] * 30); + $return[$row['scan_id']]['video'] = round($row['days']['video']['velocity'] * 30); + } + $return = array_filter($return, '_scan_api_filter_nid'); + } + catch (MongoCursorTimeoutException $e) { + } } } scan_api_set_active_shard(); @@ -1607,7 +1697,7 @@ $params = array_unique($params); } if (empty($params)) { - $params = _scan_get_client_scan_ids(array(intval($client_id))); + $params = _scan_get_client_scan_ids(array(intval($client_id)), TRUE); } else { // We are using related scans based on tids. @@ -1618,8 +1708,6 @@ $total = 0; if (($related || !empty($using_tids)) && !empty($params)) { $scan_id_placeholders = implode(',', array_fill(0, count($params), '%d')); - $current_hour = date('Y-m-d H:00:00'); - $minute = date('i'); // look for scans that has the words scan_api_set_active_shard('misc'); $rs = db_query(" @@ -1638,9 +1726,8 @@ $keywords = array_map('strtolower', array_filter(array_map('trim', array_merge(explode(',', $row->and_words), explode(',', $row->or_words))))); // We are using tids or have common keywords. if (!empty($using_tids) || $common = array_intersect($related, $keywords)) { - $matches[$row->nid] = !empty($using_tids) ? array() : $common; - $velocity = ($row->velocity_recent + $row->current) * 60 / ($minute + 60); - $data[$row->nid] = array( + $matches[$row->scan_id] = !empty($using_tids) ? 
array() : $common; + $data[$row->scan_id] = array( 'nid' => $row->nid, 'title' => $row->title, 'velocity' => 0, @@ -1662,40 +1749,53 @@ $matches = array_slice($matches, $offset, $count, TRUE); } } - if ($matches) { $scan_ids = array_keys($matches); // fill up the scan stats part - $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'minutes.prev_velocity' => 1); - $query = array( - 'scan_id' => array('$in' => $scan_ids), - ); - $cursor = scan_api_get_mongo('scan_stats', 'scan') - ->find($query, $fields); - foreach ($cursor as $row) { - $velocity = isset($row['minutes']['velocity']) ? $row['minutes']['velocity'] : 0; - $prev_velocity = isset($row['minutes']['prev_velocity']) ? $row['minutes']['prev_velocity'] : 0; - $data[$row['scan_id']]['velocity'] = $velocity; - $data[$row['scan_id']]['difference'] = $velocity - $prev_velocity; + if ($cursor = scan_api_get_mongo('scan_stats', 'scan')) { + $fields = array('scan_id' => 1, 'minutes.velocity' => 1, 'minutes.prev_velocity' => 1); + $query = array( + 'scan_id' => array('$in' => $scan_ids), + ); + try { + $cursor = $cursor + ->find($query, $fields) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + $velocity = isset($row['minutes']['velocity']) ? round(12 * $row['minutes']['velocity']) : 0; + $prev_velocity = isset($row['minutes']['prev_velocity']) ? 
round(12 * $row['minutes']['prev_velocity']) : 0; + $data[$row['scan_id']]['velocity'] = $velocity; + $data[$row['scan_id']]['difference'] = $velocity - $prev_velocity; + } + } + catch (MongoCursorTimeoutException $e) { + } } // fill up the url uniq stats part - $fields = array( - 'scan_id' => 1, - 'hours.velocity24' => 1, - 'hours.general.velocity24' => 1, - 'hours.photo.velocity24' => 1, - 'hours.video.velocity24' => 1, - ); - $query = array( - 'scan_id' => array('$in' => $scan_ids), - ); - $cursor = scan_api_get_mongo('urls', 'scanurl') - ->find($query, $fields); - foreach ($cursor as $row) { - $data[$row['scan_id']]['uniq_links_summary'] = $row['hours']['velocity24']; - $data[$row['scan_id']]['uniq_links_general'] = $row['hours']['general']['velocity24']; - $data[$row['scan_id']]['uniq_links_photo'] = $row['hours']['photo']['velocity24']; - $data[$row['scan_id']]['uniq_links_video'] = $row['hours']['video']['velocity24']; + if ($cursor = scan_api_get_mongo('urls', 'scanurl')) { + $fields = array( + 'scan_id' => 1, + 'days.velocity' => 1, + 'days.general.velocity' => 1, + 'days.photo.velocity' => 1, + 'days.video.velocity' => 1, + ); + $query = array( + 'scan_id' => array('$in' => $scan_ids), + ); + try { + $cursor = $cursor + ->find($query, $fields) + ->timeout(scan_api_get_mongo_timeout()); + foreach ($cursor as $row) { + $data[$row['scan_id']]['uniq_links_summary'] = round($row['days']['velocity'] * 30); + $data[$row['scan_id']]['uniq_links_general'] = round($row['days']['general']['velocity'] * 30); + $data[$row['scan_id']]['uniq_links_photo'] = round($row['days']['photo']['velocity'] * 30); + $data[$row['scan_id']]['uniq_links_video'] = round($row['days']['video']['velocity'] * 30); + } + } + catch (MongoCursorTimeoutException $e) { + } } // now we have populated all the missing data, and now we need to get the data in order foreach ($matches as $scan_id => $match) { @@ -1799,21 +1899,27 @@ $screen_names = array_filter(array_map('trim', explode(',', 
$screen_name))); $return = array(); $total = 0; - $items = array(); + $needed_items = $count; if ($screen_names) { $screen_name_placeholder = implode(', ', array_fill(0, count($screen_names), "'%s'")); scan_api_set_active_shard('posts', 'source_items'); if ($total = db_result(db_query("SELECT COUNT(*) FROM {source_items} WHERE author IN (" . $screen_name_placeholder . ")", $screen_names))) { - $result = db_query_range("SELECT si.item_id FROM {source_items} si WHERE author IN (" . $screen_name_placeholder . ") ORDER BY created DESC", $screen_names, $offset, $count); - while ($item = db_fetch_array($result)) { - $items[] = $item['item_id']; - } + do { + $items = array(); + scan_api_set_active_shard('posts', 'source_items'); + $result = db_query_range("SELECT si.item_id FROM {source_items} si WHERE author IN (" . $screen_name_placeholder . ") ORDER BY created DESC", $screen_names, $offset, $count); + while ($item = db_fetch_array($result)) { + $items[] = $item['item_id']; + } + scan_api_set_active_shard(); + $offset += $count; + $return = array_merge($return, _scan_api_items_helper($items, $scan_id)); + } while (count($return) < $needed_items && !empty($items)); } scan_api_set_active_shard(); } - $return = _scan_api_items_helper($items, $scan_id); - + $return = array_slice($return, 0, $needed_items); print _scan_api_format(array('items' => $total, 'data' => $return), $format); } @@ -1868,15 +1974,24 @@ * This will be executed in 3 places and i didnt want to replicate the same query. Also by using function i'm hoping to show relation between uses.
*/ function _mongo_top_locations_get_query_result($query) { - return scan_api_get_mongo('statistics', 'location')->find($query,array( - 'location_id' => 1, - 'hours.velocity' => 1, - 'location.lat' => 1, - 'location.lon' => 1, - 'location.name' => 1, - ) ) - ->sort(array('hours.velocity' => -1)) - ->limit(10); + if ($cursor = scan_api_get_mongo('statistics', 'location')) { + $fields = array( + 'location_id' => 1, + 'hours.velocity' => 1, + 'location.lat' => 1, + 'location.lon' => 1, + 'location.name' => 1, + ); + try { + return $cursor + ->find($query, $fields) + ->sort(array('hours.velocity' => -1)) + ->limit(10) + ->timeout(scan_api_get_mongo_timeout()); + } + catch (MongoCursorTimeoutException $e) { + } + } } /** * Get locations @@ -2245,27 +2360,14 @@ INNER JOIN {scan_settings} ss ON ss.nid = o.nid AND ss.status = 1 WHERE o.group_nid IN (" . $client_id_placeholders . ") ", $client_ids); - $nids = array(); + $client_nids = array(); while ($row = db_fetch_array($result)) { - $nids[] = $row['nid']; + $client_nids[] = $row['nid']; } - if (!empty($nids)) { - $nid_placeholders = implode(',', array_fill(0, count($nids), '%d')); - scan_api_set_active_shard('misc', 'scan_settings'); - $result = db_query(" - SELECT ss.nid - FROM {scan_settings} - WHERE ss.nid IN (" . $nid_placeholders . ") AND ss.status = 1 - ", $nids); - - while ($row = db_fetch_array($result)) { - $client_nids[] = $row['nid']; - } - if (!empty($client_nids)) { - arsort($client_nids); - $client_nids = array_slice($client_nids, 0, 50); - } + if (!empty($client_nids)) { + arsort($client_nids); + $client_nids = array_slice($client_nids, 0, 50); } scan_api_set_active_shard(); } @@ -2281,6 +2383,10 @@ return isset($item['url']); } +function _scan_api_filter_nid($item) { + return isset($item['nid']); +} + function scan_api_nodeapi(&$node, $op) { if ($op == 'load') { scan_api_set_active_shard('drupal');