Index: www/sites/all/modules/custom/np_scan/np_scan.archive.inc =================================================================== --- www/sites/all/modules/custom/np_scan/np_scan.archive.inc (revision 34194) +++ www/sites/all/modules/custom/np_scan/np_scan.archive.inc (working copy) @@ -47,7 +47,7 @@ foreach (array('general', 'photo', 'video') as $key) { if (isset($result['days'][$key])) { // get the index of the last updated bucket - $last_update = _fetch_scan_bucket_index('scanurl', $result['updated']->sec, 'days'); + $last_update = scan_api_bucket_index('scanurl', 'days', $result['updated']->sec); $sum = 0; foreach ($result['days'][$key] as $index => $value) { // ignore velocity values and keys which have not been updated recently enough @@ -401,7 +401,7 @@ if ($cursor->hasNext()) { $result = $cursor->getNext(); $last_update_start = $result['updated']->sec; - $last_update_index = _fetch_scan_bucket_index('scan', $result['updated']->sec, 'days'); + $last_update_index = scan_api_bucket_index('scan', 'days', $result['updated']->sec); foreach ($result['days'] as $index => $count) { if (!is_int($index) && in_array($index, array('velocity', 'prev_velocity', 'velocity7'))) { continue; Index: www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.inc =================================================================== --- www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.inc (revision 34194) +++ www/sites/all/modules/custom/np_scan_stats/np_scan_stats.proto.inc (working copy) @@ -813,7 +813,7 @@ $updated = $record['updated']->sec; foreach (array('minutes', 'hours', 'days') as $type) { if (isset($record[$type])) { - $index = _np_scan_stats_fetch_scan_bucket_index($collection, $updated, $type); + $index = scan_api_bucket_index($collection, $type, $updated); if (isset($record[$type][$index])) { $record[$type][$index] .= ' (*' . t('updated') . 
')'; } @@ -821,7 +821,7 @@ // drupal_set_message(t('current %type index of %index missing', array('%type' => $type, '%index' => $index))); } // Validate that the velocities. - $size = _np_scan_stats_fetch_collection_time_bucket_size($collection, $type); + $size = scan_api_interval_count($collection, $type); $velocity = 0; foreach ($record[$type] as $key => $value) { if (is_numeric($key)) { @@ -854,68 +854,6 @@ return $record; } -/** - * Caculate the correct bin index for scan.{minutes|hours|days} - * based on scan.updated or current time (default) - * - * Using scan.updated tells us the last updated bin index for minutes|hours|days - * - * Copied from scan_api.module. - */ -function _np_scan_stats_fetch_scan_bucket_index($collection, $updated = NULL, $type) { - $updated = isset($updated) ? $updated : mktime(date('H') , 0, 0 ); - - // see #2113, we need to add 1 to the doy because of a bug in the worker. - switch($type) { - case 'minutes': - $index = intval(date('i', $updated) * _np_scan_stats_fetch_collection_time_bucket_size($collection, $type) / 60 ); // (* 12 / 60 ) Every 5 minutes - break; - case 'hours': - $days_since_y2k = intval(((date('Y', $updated) - 2000) * 365.2475) + date('z', $updated) + 1); - $index = (date('G', $updated) + 24* $days_since_y2k) % _np_scan_stats_fetch_collection_time_bucket_size($collection, $type); - break; - case 'days': - $days_since_y2k = intval(((date('Y', $updated) - 2000) * 365.2475) + date('z', $updated) + 1); - $index = $days_since_y2k % _np_scan_stats_fetch_collection_time_bucket_size($collection, $type); - break; - } - - return $index; -} - -/** - * Returns size of time bucket size (ex. scan.hours == 48, scan.days == 30, etc) - * Keep this lookup table in sync with https://apps.d2.nowpublic.com/trac/wiki/Scan/MongoSchema - * - * Copied from scan_api.module. 
- */ -function _np_scan_stats_fetch_collection_time_bucket_size($collection, $time_type) { - $lookup = array( - 'scan' => array( - 'minutes' => 12, 'hours' => 48, 'days' => 30, - ), - 'hashtag' => array( - 'minutes' => 12, 'hours' => 24, 'days' => 30, - ), - 'keyword' => array( - 'minutes' => 12, 'hours' => 24, 'days' => 30, - ), - 'location' => array( - 'minutes' => 12, 'hours' => 24, 'days' => 30, - ), - 'retweet' => array( - 'minutes' => 12, 'hours' => 24, 'days' => 30, - ), - 'scanurl' => array( - 'minutes' => 12, 'hours' => 48, 'days' => 30, - ), - 'url' => array( - 'minutes' => 0, 'hours' => 6, 'days' => 0, - ), - ); - return isset($lookup[$collection], $lookup[$collection][$time_type]) ? $lookup[$collection][$time_type] : 0; -} - function np_scan_stats_proto_options_form($form_state) { $form = array(); $form['limit'] = array( Index: www/sites/all/modules/custom/scan_api/scan_api.module =================================================================== --- www/sites/all/modules/custom/scan_api/scan_api.module (revision 34194) +++ www/sites/all/modules/custom/scan_api/scan_api.module (working copy) @@ -1304,58 +1304,48 @@ db_set_active(); print _scan_api_format($return, $format); } + /** - * Caculate the correct bin index for scan.{minutes|hours|days} - * based on scan.updated or current time (default) - * - * Using scan.updated tells us the last updated bin index for minutes|hours|days + * Calculate the correct bin index for a collection at a point in time. */ -function _fetch_scan_bucket_index($collection, $updated = NULL, $type) { - $updated = isset($updated) ? $updated : mktime(date('H') , 0, 0 ); +function scan_api_bucket_index($collection, $time_type, $when = NULL) { + $when = isset($when) ? $when : time(); + $interval_size_ = scan_api_interval_size($collection, $time_type); + $interval_count_ = scan_api_interval_count($collection, $time_type); + + // This is a direct port of the C++ code. 
- // see #2113, we need to add 1 to the doy because of a bug in the worker. - switch($type) { - case 'minutes': - $index = intval(date('i', $updated) * _fetch_collection_time_bucket_size($collection, 'minutes') / 60 ); // (* 12 / 60 ) Every 5 minutes - break; - case 'hours': - $days_since_y2k = intval(((date('Y', $updated) - 2000) * 365.2475) + date('z', $updated)) + 1; - $index = (date('G', $updated) + 24* $days_since_y2k) % _fetch_collection_time_bucket_size($collection, 'hours'); - break; - case 'days': - $days_since_y2k = intval(((date('Y', $updated) - 2000) * 365.2475) + date('z', $updated)) + 1; - $index = $days_since_y2k % _fetch_collection_time_bucket_size($collection, 'days'); - break; - } - - return $index; + // int i = when % (interval_size_ * interval_count_); + $i = $when % ($interval_size_ * $interval_count_); + // i = floor(i / interval_size_); + $i = floor($i / $interval_size_); + // Do the integer cast explicitly, as PHP does not have static typing. + // In the C++ version, the cast is implicit as i is declared as int. + $i = intval($i); + return $i; } + /** - * Helper function that returns how much time in seconds would it take to update the whole time type bin - * ex it would take 12 * 5 * 60 = 60 * 60 = 3600 seconds to go through the scan.minutes bin - * One use case of this API is to find out whether a bin have been stale for more than a full cycle - * which leads to zeroing out the whole bin. See top of scan_api_reorder_scan_time_buckets() + * Get interval length (in seconds). 
*/ -function _bucket_size_to_time($collection, $time_type) { - $time = 0; - switch($time_type) { - case 'minutes': - $time = _fetch_collection_time_bucket_size($collection, $time_type) * 5 * 60; - break; - case 'hours': - $time = _fetch_collection_time_bucket_size($collection, $time_type) * 60 * 60; - break; - case 'days': - $time = _fetch_collection_time_bucket_size($collection, $time_type) * 60 * 60 * 24; - break; +function scan_api_interval_size($collection, $time_type) { + if ($time_type == 'minutes') { + return 5; // 60 seconds / 12 divisions. NOTE(review): the removed _bucket_size_to_time() counted 5 * 60 seconds per 'minutes' bucket; confirm 5 matches the C++ worker. } - return $time; + elseif ($time_type == 'days') { + return 86400; // 60 (seconds) * 60 (minutes) * 24 (hours) + } + elseif ($time_type == 'hours') { + return 3600; // 60 (seconds) * 60 (minutes) + } + trigger_error('Invalid time_type!', E_USER_WARNING); } + /** - * Returns size of time bucket size (ex. scan.hours == 48, scan.days == 30, etc) - * Keep this lookup table in sync with https://apps.d2.nowpublic.com/trac/wiki/Scan/MongoSchema + * Get number of intervals in interval array. + * This is based on https://apps.d2.nowpublic.com/trac/wiki/Scan/MongoSchema */ -function _fetch_collection_time_bucket_size($collection, $time_type) { +function scan_api_interval_count($collection, $time_type) { $lookup = array( 'scan' => array( 'minutes' => 12, 'hours' => 48, 'days' => 30, @@ -1376,10 +1366,13 @@ 'minutes' => 12, 'hours' => 48, 'days' => 30, ), 'url' => array( - 'minutes' => 0, 'hours' => 6, 'days' => 0, + 'hours' => 6, ), ); - return isset($lookup[$collection], $lookup[$collection][$time_type]) ? $lookup[$collection][$time_type] : 0; + if (isset($lookup[$collection], $lookup[$collection][$time_type])) { + return $lookup[$collection][$time_type]; + } + trigger_error('Count entry missing from interval count table!', E_USER_WARNING); } /** @@ -1388,22 +1381,23 @@ * which makes it not easy to manage. here's API for that. 
*/ function scan_api_reorder_scan_time_buckets($bucket, $collection, $time_type, $last_updated = NULL) { + $bucket_cycle_time = scan_api_interval_size($collection, $time_type) * scan_api_interval_count($collection, $time_type); // bucket have been outdated for more than one full cycle, flush it completely - if ( time() > (_bucket_size_to_time($collection, $time_type) + $last_updated) ) { - $size = _fetch_collection_time_bucket_size($collection, $time_type); + if ( time() > ($bucket_cycle_time + $last_updated) ) { + $size = scan_api_interval_count($collection, $time_type); $bucket = array_fill(0, $size, 0 ) ; } else { // the following 6 lines normalize our bucket since sometimes buckets may have gaps making behavior unpredictable - $size = _fetch_collection_time_bucket_size($collection, $time_type); + $size = scan_api_interval_count($collection, $time_type); $new_bucket = array_fill(0, $size, 0 ); foreach($bucket as $index => $value ) { $new_bucket[$index] = $value; } $bucket = $new_bucket; // Flatten and flush stale bins for buckets that have been stale for less than full cycle - $current_index = _fetch_scan_bucket_index($collection, null, $time_type); - $last_updated_index = _fetch_scan_bucket_index($collection, $last_updated, $time_type); + $current_index = scan_api_bucket_index($collection, $time_type); + $last_updated_index = scan_api_bucket_index($collection, $time_type, $last_updated); // Remove non numerical indexes, leaving only time buckets foreach($bucket as $index => $value) { if (!is_numeric($index)) { Index: view/sites/all/modules/scan_api/scan_api.module =================================================================== --- view/sites/all/modules/scan_api/scan_api.module (revision 34194) +++ view/sites/all/modules/scan_api/scan_api.module (working copy) @@ -1365,57 +1365,48 @@ db_set_active(); print _scan_api_format($return, $format); } + /** - * Caculate the correct bin index for scan.{minutes|hours|days} - * based on scan.updated or current time 
(default) - * - * Using scan.updated tells us the last updated bin index for minutes|hours|days + * Calculate the correct bin index for a collection at a point in time. */ -function _fetch_scan_bucket_index($collection, $updated = NULL, $type) { - $updated = isset($updated) ? $updated : mktime(date('H') , 0, 0 ); +function scan_api_bucket_index($collection, $time_type, $when = NULL) { + $when = isset($when) ? $when : time(); + $interval_size_ = scan_api_interval_size($collection, $time_type); + $interval_count_ = scan_api_interval_count($collection, $time_type); + + // This is a direct port of the C++ code. - switch($type) { - case 'minutes': - $index = intval(date('i', $updated) * _fetch_collection_time_bucket_size($collection, 'minutes') / 60 ); // (* 12 / 60 ) Every 5 minutes - break; - case 'hours': - $days_since_y2k = intval(((date('Y', $updated) - 2000) * 365.2475) + date('z', $updated) - 1); - $index = (date('G', $updated) + 24* $days_since_y2k) % _fetch_collection_time_bucket_size($collection, 'hours'); - break; - case 'days': - $days_since_y2k = intval(((date('Y', $updated) - 2000) * 365.2475) + date('z', $updated) - 1); - $index = $days_since_y2k % _fetch_collection_time_bucket_size($collection, 'days'); - break; - } - - return $index; + // int i = when % (interval_size_ * interval_count_); + $i = $when % ($interval_size_ * $interval_count_); + // i = floor(i / interval_size_); + $i = floor($i / $interval_size_); + // Do the integer cast explicitly, as PHP does not have static typing. + // In the C++ version, the cast is implicit as i is declared as int. + $i = intval($i); + return $i; } + /** - * Helper function that returns how much time in seconds would it take to update the whole time type bin - * ex it would take 12 * 5 * 60 = 60 * 60 = 3600 seconds to go through the scan.minutes bin - * One use case of this API is to find out whether a bin have been stale for more than a full cycle - * which leads to zeroing out the whole bin. 
See top of scan_api_reorder_scan_time_buckets() + * Get interval length (in seconds). */ -function _bucket_size_to_time($collection, $time_type) { - $time = 0; - switch($time_type) { - case 'minutes': - $time = _fetch_collection_time_bucket_size($collection, $time_type) * 5 * 60; - break; - case 'hours': - $time = _fetch_collection_time_bucket_size($collection, $time_type) * 60 * 60; - break; - case 'days': - $time = _fetch_collection_time_bucket_size($collection, $time_type) * 60 * 60 * 24; - break; +function scan_api_interval_size($collection, $time_type) { + if ($time_type == 'minutes') { + return 5; // 60 seconds / 12 divisions. NOTE(review): the removed _bucket_size_to_time() counted 5 * 60 seconds per 'minutes' bucket; confirm 5 matches the C++ worker. } - return $time; + elseif ($time_type == 'days') { + return 86400; // 60 (seconds) * 60 (minutes) * 24 (hours) + } + elseif ($time_type == 'hours') { + return 3600; // 60 (seconds) * 60 (minutes) + } + trigger_error('Invalid time_type!', E_USER_WARNING); } + /** - * Returns size of time bucket size (ex. scan.hours == 48, scan.days == 30, etc) - * Keep this lookup table in sync with https://apps.d2.nowpublic.com/trac/wiki/Scan/MongoSchema + * Get number of intervals in interval array. + * This is based on https://apps.d2.nowpublic.com/trac/wiki/Scan/MongoSchema */ -function _fetch_collection_time_bucket_size($collection, $time_type) { +function scan_api_interval_count($collection, $time_type) { $lookup = array( 'scan' => array( 'minutes' => 12, 'hours' => 48, 'days' => 30, @@ -1436,10 +1427,13 @@ 'minutes' => 12, 'hours' => 48, 'days' => 30, ), 'url' => array( - 'minutes' => 0, 'hours' => 6, 'days' => 0, + 'hours' => 6, ), ); - return isset($lookup[$collection], $lookup[$collection][$time_type]) ? $lookup[$collection][$time_type] : 0; + if (isset($lookup[$collection], $lookup[$collection][$time_type])) { + return $lookup[$collection][$time_type]; + } + trigger_error('Count entry missing from interval count table!', E_USER_WARNING); } /** @@ -1448,22 +1442,23 @@ * which makes it not easy to manage. here's API for that. 
*/ function scan_api_reorder_scan_time_buckets($bucket, $collection, $time_type, $last_updated = NULL) { + $bucket_cycle_time = scan_api_interval_size($collection, $time_type) * scan_api_interval_count($collection, $time_type); // bucket have been outdated for more than one full cycle, flush it completely - if ( time() > (_bucket_size_to_time($collection, $time_type) + $last_updated) ) { - $size = _fetch_collection_time_bucket_size($collection, $time_type); + if ( time() > ($bucket_cycle_time + $last_updated) ) { + $size = scan_api_interval_count($collection, $time_type); $bucket = array_fill(0, $size, 0 ) ; } else { // the following 6 lines normalize our bucket since sometimes buckets may have gaps making behavior unpredictable - $size = _fetch_collection_time_bucket_size($collection, $time_type); + $size = scan_api_interval_count($collection, $time_type); $new_bucket = array_fill(0, $size, 0 ); foreach($bucket as $index => $value ) { $new_bucket[$index] = $value; } $bucket = $new_bucket; // Flatten and flush stale bins for buckets that have been stale for less than full cycle - $current_index = _fetch_scan_bucket_index($collection, null, $time_type); - $last_updated_index = _fetch_scan_bucket_index($collection, $last_updated, $time_type); + $current_index = scan_api_bucket_index($collection, $time_type); + $last_updated_index = scan_api_bucket_index($collection, $time_type, $last_updated); // Remove non numerical indexes, leaving only time buckets foreach($bucket as $index => $value) { if (!is_numeric($index)) {