????
Current Path : /home/innovagencyco/www/statxpress/wp-content/plugins/litespeed-cache/src/ |
Current File : /home/innovagencyco/www/statxpress/wp-content/plugins/litespeed-cache/src/crawler-map.cls.php |
<?php
/**
 * The Crawler Sitemap Class
 *
 * Maintains two DB tables (the crawler sitemap and the crawler blacklist):
 * builds the sitemap from the configured sitemap URLs, stores per-crawler
 * crawl results, and manages blacklisted URLs.
 *
 * @since 1.1.0
 */

namespace LiteSpeed;

defined('WPINC') || exit();

class Crawler_Map extends Root {
	const LOG_TAG = '🐞🗺️';

	// Bitmask values; not referenced inside this class.
	const BM_MISS = 1;
	const BM_HIT = 2;
	const BM_BLACKLIST = 4;

	private $_home_url; // Used to simplify urls
	private $_tb; // Sitemap table name
	private $_tb_blacklist; // Blacklist table name
	private $__data;
	private $_conf_map_timeout;
	private $_urls = array(); // URLs collected by _parse()
	private $_parsed_maps = array(); // Sitemap URLs already fetched in the current run (recursion guard)

	/**
	 * Instantiate the class
	 *
	 * @since 1.1.0
	 */
	public function __construct() {
		$this->_home_url = get_home_url();
		$this->__data = Data::cls();
		$this->_tb = $this->__data->tb('crawler');
		$this->_tb_blacklist = $this->__data->tb('crawler_blacklist');
		$this->_conf_map_timeout = $this->conf(Base::O_CRAWLER_MAP_TIMEOUT);
	}

	/**
	 * Save URLs crawl status into DB
	 *
	 * For each status bit in `$list`, writes the bit into the current crawler's
	 * position of the `res` column and the HTTP code into the matching
	 * comma-separated slot of the `reason` column. Bits `B` and `N` are
	 * mirrored into the blacklist table as well.
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param array $list         [ bit => [ id => [ 'url' => ..., 'code' => ... ], ... ], ... ]
	 * @param int   $curr_crawler Zero-based position of the crawler the results belong to.
	 * @return array `$list` with every processed (non-empty) bucket reset to an empty array.
	 */
	public function save_map_status($list, $curr_crawler) {
		global $wpdb;
		Utility::compatibility();

		$total_crawler = count(Crawler::cls()->list_crawlers());
		$total_crawler_pos = $total_crawler - 1;

		// Replace current crawler's position
		$curr_crawler = (int) $curr_crawler;
		// Number of crawler slots located to the right of the current one
		$right_pos = $total_crawler_pos - $curr_crawler;

		foreach ($list as $bit => $ids) {
			// $ids = [ id => [ url, code ], ... ]
			if (!$ids) {
				continue;
			}
			self::debug("Update map [crawler] $curr_crawler [bit] $bit [count] " . count($ids));

			// Strict string check: a loose `==` would treat an integer 0 key as 'B' on PHP < 8
			$is_blacklist_bit = in_array((string) $bit, array('B', 'N'), true);
			$bit_sql = esc_sql((string) $bit);

			// Update res first, then reason
			$sql_res = "CONCAT( LEFT( res, $curr_crawler ), '$bit_sql', RIGHT( res, $right_pos ) )";
			$id_all = implode(',', array_map('intval', array_keys($ids)));

			$wpdb->query("UPDATE `$this->_tb` SET res = $sql_res WHERE id IN ( $id_all )");

			// Add blacklist
			if ($is_blacklist_bit) {
				$q = "SELECT a.id, a.url FROM `$this->_tb_blacklist` a LEFT JOIN `$this->_tb` b ON b.url=a.url WHERE b.id IN ( $id_all )";
				$existing = $wpdb->get_results($q, ARRAY_A);
				if (!is_array($existing)) {
					$existing = array();
				}

				// Update current crawler status tag in existing blacklist
				if ($existing) {
					$existing_ids = implode(',', array_map('intval', array_column($existing, 'id')));
					$count = $wpdb->query("UPDATE `$this->_tb_blacklist` SET res = $sql_res WHERE id IN ( $existing_ids )");
					self::debug('Update blacklist [count] ' . $count);
				}

				// Append new blacklist. Decide on the de-duplicated URL diff itself rather than on
				// row counts, so an empty diff never produces an `INSERT ... VALUES` without tuples
				// and a URL listed twice is inserted only once.
				$new_urls = array_unique(array_diff(array_column($ids, 'url'), array_column($existing, 'url')));
				if ($new_urls) {
					self::debug('Insert into blacklist [count] ' . count($new_urls));

					$q = "INSERT INTO `$this->_tb_blacklist` ( url, res, reason ) VALUES " . implode(',', array_fill(0, count($new_urls), '( %s, %s, %s )'));

					$res = array_fill(0, $total_crawler, '-');
					$res[$curr_crawler] = $bit;
					$res = implode('', $res);
					// Pre-populate default reason value first, update later
					$default_reason = $total_crawler > 1 ? str_repeat(',', $total_crawler - 1) : '';

					$data = array();
					foreach ($new_urls as $url) {
						$data[] = $url;
						$data[] = $res;
						$data[] = $default_reason;
					}
					$wpdb->query($wpdb->prepare($q, $data));
				}
			}

			// Update sitemap reason w/ HTTP code: group row ids by response code
			$reason_array = array();
			foreach ($ids as $id => $v2) {
				$code = isset($v2['code']) ? (int) $v2['code'] : 0;
				if (empty($reason_array[$code])) {
					$reason_array[$code] = array();
				}
				$reason_array[$code][] = (int) $id;
			}

			foreach ($reason_array as $code => $v2) {
				// Complement comma
				if ($curr_crawler) {
					$code = ',' . $code;
				}
				if ($curr_crawler < $total_crawler_pos) {
					$code .= ',';
				}

				$code_ids = implode(',', $v2);

				$count = $wpdb->query(
					"UPDATE `$this->_tb` SET reason=CONCAT(SUBSTRING_INDEX(reason, ',', $curr_crawler), '$code', SUBSTRING_INDEX(reason, ',', -$right_pos)) WHERE id IN (" . $code_ids . ')'
				);
				self::debug("Update map reason [code] $code [pos] left $curr_crawler right -$right_pos [count] $count");

				// Update blacklist reason
				if ($is_blacklist_bit) {
					$count = $wpdb->query(
						"UPDATE `$this->_tb_blacklist` a LEFT JOIN `$this->_tb` b ON b.url = a.url SET a.reason=CONCAT(SUBSTRING_INDEX(a.reason, ',', $curr_crawler), '$code', SUBSTRING_INDEX(a.reason, ',', -$right_pos)) WHERE b.id IN (" . $code_ids . ')'
					);
					self::debug("Update blacklist [code] $code [pos] left $curr_crawler right -$right_pos [count] $count");
				}
			}

			// Reset list
			$list[$bit] = array();
		}

		return $list;
	}

	/**
	 * Add one record to blacklist
	 * NOTE: $id is sitemap table ID
	 *
	 * Marks the sitemap row as `B` for every crawler with reason `Man`, then
	 * updates the blacklist row with the same URL or inserts a new one.
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int $id Sitemap table row ID.
	 */
	public function blacklist_add($id) {
		global $wpdb;

		$id = (int) $id;

		// Build res&reason
		$total_crawler = count(Crawler::cls()->list_crawlers());
		$res = str_repeat('B', $total_crawler);
		$reason = implode(',', array_fill(0, $total_crawler, 'Man'));

		$q = "SELECT a.url, b.id FROM `$this->_tb` a LEFT JOIN `$this->_tb_blacklist` b ON b.url = a.url WHERE a.id = %d";
		$row = $wpdb->get_row($wpdb->prepare($q, $id), ARRAY_A);

		if (!$row) {
			self::debug('blacklist failed to add [id] ' . $id);
			return;
		}

		self::debug('Add to blacklist [url] ' . $row['url']);

		$q = "UPDATE `$this->_tb` SET res = %s, reason = %s WHERE id = %d";
		$wpdb->query($wpdb->prepare($q, array($res, $reason, $id)));

		if ($row['id']) {
			// URL is already blacklisted: refresh its status
			$q = "UPDATE `$this->_tb_blacklist` SET res = %s, reason = %s WHERE id = %d";
			$wpdb->query($wpdb->prepare($q, array($res, $reason, $row['id'])));
		} else {
			$q = "INSERT INTO `$this->_tb_blacklist` (url, res, reason) VALUES (%s, %s, %s)";
			$wpdb->query($wpdb->prepare($q, array($row['url'], $res, $reason)));
		}
	}

	/**
	 * Delete one record from blacklist
	 *
	 * Also clears the `N`/`B` flags of the sitemap row that has the same URL.
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int $id Blacklist table row ID.
	 */
	public function blacklist_del($id) {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler_blacklist')) {
			return;
		}

		$id = (int) $id;
		self::debug('blacklist delete [id] ' . $id);

		// The sitemap row must be reset before the blacklist row (the URL source) is removed
		$wpdb->query(
			$wpdb->prepare(
				"UPDATE `$this->_tb` SET res=REPLACE(REPLACE(res, 'N', '-'), 'B', '-') WHERE url=(SELECT url FROM `$this->_tb_blacklist` WHERE id=%d)",
				$id
			)
		);
		$wpdb->query($wpdb->prepare("DELETE FROM `$this->_tb_blacklist` WHERE id=%d", $id));
	}

	/**
	 * Empty blacklist
	 *
	 * Clears every `N`/`B` flag in the sitemap table and truncates the blacklist table.
	 *
	 * @since 3.0
	 * @access public
	 */
	public function blacklist_empty() {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler_blacklist')) {
			return;
		}

		self::debug('Truncate blacklist');
		$wpdb->query("UPDATE `$this->_tb` SET res=REPLACE(REPLACE(res, 'N', '-'), 'B', '-')");
		$wpdb->query("TRUNCATE `$this->_tb_blacklist`");
	}

	/**
	 * List blacklist
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int|false $limit  Max rows to return; false returns all rows.
	 * @param int|false $offset Row offset; when false (and a limit is given) it is taken from Utility::pagination().
	 * @return array Blacklist rows as associative arrays, newest id first; empty if the table does not exist.
	 */
	public function list_blacklist($limit = false, $offset = false) {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler_blacklist')) {
			return array();
		}

		$q = "SELECT * FROM `$this->_tb_blacklist` ORDER BY id DESC";

		if ($limit !== false) {
			if ($offset === false) {
				$total = $this->count_blacklist();
				$offset = Utility::pagination($total, $limit, true);
			}
			$q .= ' LIMIT %d, %d';
			$q = $wpdb->prepare($q, $offset, $limit);
		}

		return $wpdb->get_results($q, ARRAY_A);
	}

	/**
	 * Count blacklist
	 *
	 * @return string|null|false Row count as returned by $wpdb->get_var(), or false if the table does not exist.
	 */
	public function count_blacklist() {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler_blacklist')) {
			return false;
		}

		$q = "SELECT COUNT(*) FROM `$this->_tb_blacklist`";
		return $wpdb->get_var($q);
	}

	/**
	 * Empty sitemap
	 *
	 * @since 3.0
	 * @access public
	 */
	public function empty_map() {
		Data::cls()->tb_del('crawler');

		$msg = __('Sitemap cleaned successfully', 'litespeed-cache');
		Admin_Display::succeed($msg);
	}

	/**
	 * List generated sitemap
	 *
	 * Rows can be narrowed by the posted `kw` keyword (URL substring) and by the
	 * type returned from Router::verify_type() (`hit`, `miss` or `blacklisted`).
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int       $limit  Max rows to return.
	 * @param int|false $offset Row offset; when false it is taken from Utility::pagination().
	 * @return array Sitemap rows as associative arrays ordered by id; empty if the table does not exist.
	 */
	public function list_map($limit, $offset = false) {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler')) {
			return array();
		}

		if ($offset === false) {
			$total = $this->count_map();
			$offset = Utility::pagination($total, $limit, true);
		}

		$conditions = array();
		$args = array();

		// WordPress adds slashes to $_POST; strip them so keywords holding quotes/backslashes can match
		$kw = isset($_POST['kw']) && is_string($_POST['kw']) ? wp_unslash($_POST['kw']) : '';
		if (!empty($kw)) {
			$conditions[] = 'url LIKE %s';
			$args[] = '%' . $wpdb->esc_like($kw) . '%';
		}

		// Pass the LIKE pattern as an argument: literal `%` inside a prepare() query must not be relied upon
		$flag = $this->_res_flag(Router::verify_type());
		if ($flag) {
			$conditions[] = 'res LIKE %s';
			$args[] = '%' . $flag . '%';
		}

		$q = "SELECT * FROM `$this->_tb`";
		if ($conditions) {
			$q .= ' WHERE ' . implode(' AND ', $conditions);
		}
		$q .= ' ORDER BY id LIMIT %d, %d';
		$args[] = $offset;
		$args[] = $limit;

		return $wpdb->get_results($wpdb->prepare($q, $args), ARRAY_A);
	}

	/**
	 * Count sitemap
	 *
	 * Applies the same type filter as list_map(); the posted keyword is not taken into account.
	 *
	 * @return string|null|false Row count as returned by $wpdb->get_var(), or false if the table does not exist.
	 */
	public function count_map() {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler')) {
			return false;
		}

		$q = "SELECT COUNT(*) FROM `$this->_tb`";

		// $flag comes from a fixed whitelist, so it is safe to inline
		$flag = $this->_res_flag(Router::verify_type());
		if ($flag) {
			$q .= " WHERE res LIKE '%{$flag}%'";
		}

		return $wpdb->get_var($q);
	}

	/**
	 * Translate a list type into the status letter stored in the `res` column
	 *
	 * @access private
	 *
	 * @param mixed $type `hit`, `miss` or `blacklisted`; anything else means no filter.
	 * @return string `H`, `M`, `B`, or an empty string when no filter applies.
	 */
	private function _res_flag($type) {
		$flags = array(
			'hit' => 'H',
			'miss' => 'M',
			'blacklisted' => 'B',
		);

		return (is_string($type) && isset($flags[$type])) ? $flags[$type] : '';
	}

	/**
	 * Generate sitemap
	 *
	 * Reports an admin error when nothing was generated; reports success only
	 * for manual, non-cron runs.
	 *
	 * @since 1.1.0
	 * @access public
	 *
	 * @param bool $manual Whether the generation was triggered manually.
	 */
	public function gen($manual = false) {
		$count = $this->_gen();

		if (!$count) {
			Admin_Display::error(__('No valid sitemap parsed for crawler.', 'litespeed-cache'));
			return;
		}

		if (!defined('DOING_CRON') && $manual) {
			$msg = sprintf(__('Sitemap created successfully: %d items', 'litespeed-cache'), $count);
			Admin_Display::succeed($msg);
		}
	}

	/**
	 * Generate the sitemap
	 *
	 * Parses the configured sitemaps, truncates the sitemap table and refills it.
	 * Fully blacklisted URLs are dropped; partially blacklisted URLs keep their
	 * blacklist `res`, with each non-empty reason replaced by `Existed`.
	 *
	 * @since 1.1.0
	 * @access private
	 *
	 * @return int|false Number of URLs stored, or false when no sitemap is configured.
	 */
	private function _gen() {
		global $wpdb;

		if (!$this->__data->tb_exist('crawler')) {
			$this->__data->tb_create('crawler');
		}

		if (!$this->__data->tb_exist('crawler_blacklist')) {
			$this->__data->tb_create('crawler_blacklist');
		}

		// use custom sitemap
		$sitemap = $this->conf(Base::O_CRAWLER_SITEMAP);
		if (!$sitemap) {
			return false;
		}

		// Start from a clean state so a repeated run on the same instance does not reuse earlier results
		$this->_urls = array();
		$this->_parsed_maps = array();

		$offset = strlen($this->_home_url);
		$sitemap = Utility::sanitize_lines($sitemap);

		try {
			foreach ($sitemap as $this_map) {
				$this->_parse($this_map);
			}
		} catch (\Exception $e) {
			self::debug('❌ failed to parse custom sitemap: ' . $e->getMessage());
		}

		if ($this->_urls) {
			if ($this->conf(Base::O_CRAWLER_DROP_DOMAIN)) {
				foreach ($this->_urls as $k => $v) {
					// Discard URLs outside this site, store the rest relative to the home URL
					if (stripos($v, $this->_home_url) !== 0) {
						unset($this->_urls[$k]);
						continue;
					}
					$this->_urls[$k] = substr($v, $offset);
				}
			}

			$this->_urls = array_unique($this->_urls);
		}

		self::debug('Truncate sitemap');
		$wpdb->query("TRUNCATE `$this->_tb`");

		self::debug('Generate sitemap');

		// Filter URLs in blacklist
		$blacklist = $this->list_blacklist();

		$full_blacklisted = array();
		$partial_blacklisted = array();
		foreach ($blacklist as $v) {
			if (strpos($v['res'], '-') === false) {
				// Full blacklisted
				$full_blacklisted[] = $v['url'];
			} else {
				// Replace existing reason
				$reasons = array_map(function ($element) {
					return $element ? 'Existed' : '';
				}, explode(',', $v['reason']));

				$partial_blacklisted[$v['url']] = array(
					'res' => $v['res'],
					'reason' => implode(',', $reasons),
				);
			}
		}

		// Drop all blacklisted URLs
		$this->_urls = array_diff($this->_urls, $full_blacklisted);

		// Default res & reason
		$crawler_count = count(Crawler::cls()->list_crawlers());
		$default_res = str_repeat('-', $crawler_count);
		$default_reason = $crawler_count > 1 ? str_repeat(',', $crawler_count - 1) : '';

		// Flat list: url, res, reason, url, res, reason, ...
		$data = array();
		foreach ($this->_urls as $url) {
			$is_partial = array_key_exists($url, $partial_blacklisted);
			$data[] = $url;
			$data[] = $is_partial ? $partial_blacklisted[$url]['res'] : $default_res;
			$data[] = $is_partial ? $partial_blacklisted[$url]['reason'] : $default_reason;
		}

		// 300 values per chunk = 100 rows of 3 fields per INSERT
		foreach (array_chunk($data, 300) as $data2) {
			$this->_save($data2);
		}

		// Reset crawler
		Crawler::cls()->reset_pos();

		return count($this->_urls);
	}

	/**
	 * Save data to table
	 *
	 * @since 3.0
	 * @access private
	 *
	 * @param array  $data   Flat list of values, in `$fields` order, repeated per row.
	 * @param string $fields Comma-separated column names.
	 */
	private function _save($data, $fields = 'url,res,reason') {
		global $wpdb;

		if (empty($data)) {
			return;
		}

		$q = "INSERT INTO `$this->_tb` ( $fields ) VALUES ";

		// Add placeholder
		$q .= Utility::chunk_placeholder($data, $fields);

		// Store data
		$wpdb->query($wpdb->prepare($q, $data));
	}

	/**
	 * Parse custom sitemap and collect its URLs into $this->_urls
	 *
	 * Sitemap indexes are followed recursively; a sitemap already fetched
	 * during the current run is skipped so circular references cannot recurse
	 * endlessly.
	 *
	 * @since 1.1.1
	 * @access private
	 *
	 * @param string $sitemap Sitemap URL.
	 * @throws \Exception When the sitemap cannot be fetched, or cannot be parsed while no URL has been collected yet.
	 */
	private function _parse($sitemap) {
		$sitemap = trim((string) $sitemap);

		if (isset($this->_parsed_maps[$sitemap])) {
			return;
		}
		$this->_parsed_maps[$sitemap] = true;

		/**
		 * Read via wp func to avoid allow_url_fopen = off
		 * @since 2.2.7
		 */
		// NOTE(review): TLS verification is disabled here; kept as-is for backward compatibility
		$response = wp_remote_get($sitemap, array('timeout' => $this->_conf_map_timeout, 'sslverify' => false));
		if (is_wp_error($response)) {
			$error_message = $response->get_error_message();
			self::debug('failed to read sitemap: ' . $error_message);

			throw new \Exception('Failed to remote read ' . $sitemap);
		}

		// Collect libxml errors internally so malformed XML does not emit PHP warnings
		$prev_libxml = libxml_use_internal_errors(true);
		$xml_object = simplexml_load_string(wp_remote_retrieve_body($response), null, LIBXML_NOCDATA);
		libxml_clear_errors();
		libxml_use_internal_errors($prev_libxml);

		if (!$xml_object) {
			// Tolerate a broken sitemap once some URLs have already been collected
			if ($this->_urls) {
				return;
			}
			throw new \Exception('Failed to parse xml ' . $sitemap);
		}

		// start parsing
		$xml_array = (array) $xml_object;
		if (!empty($xml_array['sitemap'])) {
			// parse sitemap set
			foreach ($this->_extract_locs($xml_array['sitemap']) as $loc) {
				$this->_parse($loc); // recursive parse sitemap
			}
		} elseif (!empty($xml_array['url'])) {
			// parse url set
			foreach ($this->_extract_locs($xml_array['url']) as $loc) {
				$this->_urls[] = $loc;
			}
		}
	}

	/**
	 * Extract the `loc` values from the `sitemap`/`url` children of a parsed sitemap
	 *
	 * @access private
	 *
	 * @param mixed $nodes A single child (object or array holding `loc`) or a list of children.
	 * @return array List of trimmed, non-empty `loc` strings.
	 */
	private function _extract_locs($nodes) {
		if (is_object($nodes)) {
			$nodes = (array) $nodes;
		}
		if (!is_array($nodes)) {
			return array();
		}

		// if only 1 element, handle it like a one-item list
		if (!empty($nodes['loc'])) {
			$nodes = array($nodes);
		}

		$locs = array();
		foreach ($nodes as $node) {
			$node = (array) $node;
			if (empty($node['loc'])) {
				continue;
			}

			$loc = $node['loc'];
			if (is_array($loc)) {
				// Several <loc> children: use the first one
				$loc = reset($loc);
			}

			$loc = trim((string) $loc);
			if (!empty($loc)) {
				$locs[] = $loc;
			}
		}

		return $locs;
	}
}