acf
This commit is contained in:
368
plugins/system/nrframework/NRFramework/DOMCrawler.php
Normal file
368
plugins/system/nrframework/NRFramework/DOMCrawler.php
Normal file
@ -0,0 +1,368 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @author Tassos Marinos <info@tassos.gr>
|
||||
* @link https://www.tassos.gr
|
||||
* @copyright Copyright © 2024 Tassos All Rights Reserved
|
||||
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html> or later
|
||||
*/
|
||||
|
||||
namespace NRFramework;
|
||||
|
||||
use Joomla\String\StringHelper;
|
||||
use NRFramework\Cache;
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
class DOMCrawler
{
    /**
     * The content to crawl, as normalised by stringToUTF8().
     *
     * @var string
     */
    protected $content;

    /**
     * The nodes matched by the most recent filter() call.
     *
     * Null when filter() returned before running a query. DOMXPath::query()
     * yields false for an expression it cannot evaluate, so false is possible too.
     *
     * @var \DOMNodeList|false|null
     */
    public $nodes;

    /**
     * Class constructor
     *
     * @param   mixed  $content  The content to crawl. When null, the value returned by
     *                           \NRFramework\Functions::getBuffer() is used instead.
     */
    public function __construct($content = null)
    {
        if (is_null($content))
        {
            $content = \NRFramework\Functions::getBuffer();
        }

        $this->setContent($content);
    }

    /**
     * Set the content to crawl
     *
     * @param   string  $content  The HTML to crawl
     *
     * @return  void
     */
    public function setContent($content)
    {
        $this->content = $this->stringToUTF8($content);
    }

    /**
     * Filter DOM elements with a CSS Selector or XPath expression
     *
     * An expression starting with "=" is treated as a raw XPath query,
     * e.g. =//h1[contains(@class, "faq-question")]. Anything else is converted
     * from a (simple) CSS selector by cssSelectorToXPath().
     *
     * @param   string  $expression  A CSS Selector or XPath expression
     *
     * @return  $this   The crawler itself, so calls can be chained
     */
    public function filter($expression)
    {
        // Forget the previous result first, so none of the early returns below
        // can leave the nodes of an older query behind.
        $this->nodes = null;

        if (empty($this->content) || empty($expression))
        {
            return $this;
        }

        if (!class_exists('DOMDocument') || !class_exists('DOMXPath'))
        {
            return $this;
        }

        // The key covers the content as well as the expression: the same selector
        // applied to different content must not return another crawler's nodes.
        $hash = md5(__CLASS__ . '|' . md5($this->content) . '|' . $expression);

        if (Cache::has($hash))
        {
            $this->nodes = Cache::get($hash, false);

            return $this;
        }

        if (substr($expression, 0, 1) == '=')
        {
            // Raw XPath: strip the leading "=" marker
            $xpath = StringHelper::substr($expression, 1);
        }
        else
        {
            $xpath = $this->cssSelectorToXPath($expression);
        }

        // Nothing left to query (e.g. "=" alone or a whitespace-only selector)
        if ($xpath === '' || $xpath === false)
        {
            return $this;
        }

        // Real-world HTML is rarely valid; collect libxml errors silently while we
        // work, then clear them and restore the caller's setting so the buffer does
        // not grow and the global libxml state is left as we found it.
        $previous = libxml_use_internal_errors(true);

        $dom = new \DOMDocument;
        $dom->loadHTML($this->content);

        $finder      = new \DOMXPath($dom);
        $this->nodes = $finder->query($xpath);

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        // Speed up filtering by caching results
        Cache::set($hash, $this->nodes);

        return $this;
    }

    /**
     * Returns the HTML of the discovered node(s)
     *
     * @param   string   $fallback   The value to return if no node is found
     * @param   boolean  $inner      If set to true, only the node's inner HTML will be returned.
     * @param   boolean  $firstOnly  If set to true, only the first node will be returned.
     *
     * @return  string|array  A string when $firstOnly is true, otherwise one string per node
     */
    public function html($fallback = '', $inner = false, $firstOnly = true)
    {
        return $this->readNodes($fallback, $firstOnly, function ($node) use ($inner)
        {
            return $this->getNodeHTML($node, $inner);
        });
    }

    /**
     * Returns the text content of the discovered node(s)
     *
     * @param   string   $fallback   The value to return if no node is found
     * @param   boolean  $firstOnly  If set to true, only the first node will be returned.
     *
     * @return  string|array  A string when $firstOnly is true, otherwise one string per node
     */
    public function text($fallback = '', $firstOnly = true)
    {
        return $this->readNodes($fallback, $firstOnly, function ($node)
        {
            return $node->textContent;
        });
    }

    /**
     * Returns the value of an attribute of the discovered node(s)
     *
     * @param   string   $attribute_name  The name of the attribute
     * @param   string   $fallback        The value to return if no nodes found
     * @param   boolean  $firstOnly       If set to true, only the first node will be returned.
     *
     * @return  string|array  A string when $firstOnly is true, otherwise one string per node
     */
    public function attr($attribute_name, $fallback = '', $firstOnly = true)
    {
        return $this->readNodes($fallback, $firstOnly, function ($node) use ($attribute_name)
        {
            // A raw XPath query may match text, comment or attribute nodes, which
            // have no getAttribute(); treat them like a missing attribute.
            return $node instanceof \DOMElement ? $node->getAttribute($attribute_name) : '';
        });
    }

    /**
     * Returns the total number of nodes found
     *
     * @param   integer  $fallback  The value to return if no nodes found
     *
     * @return  integer
     */
    public function count($fallback = 0)
    {
        if ($this->nodes && $this->nodes->length)
        {
            return $this->nodes->length;
        }

        return $fallback;
    }

    /**
     * Helper method to crawl the content based on the value of a CSS Selector field.
     *
     * @param   array    $props      Expected properties: selector, task, attr.
     *                               task is one of html, innerhtml, attr, count;
     *                               any other (or missing) task reads the text.
     * @param   boolean  $firstOnly  If set to true, only the first node is read.
     *
     * @return  string|array|integer  '' or [] (depending on $firstOnly) when no selector is given
     */
    public function readCSSSelectorField($props, $firstOnly = true)
    {
        $props    = (array) $props;
        $fallback = $firstOnly ? '' : [];

        if (empty($props['selector']))
        {
            return $fallback;
        }

        $this->filter($props['selector']);

        // A field saved without a task falls through to the text reader below
        $task = isset($props['task']) ? $props['task'] : '';

        switch ($task)
        {
            case 'html':
                return $this->html($fallback, false, $firstOnly);

            case 'innerhtml':
                return $this->html($fallback, true, $firstOnly);

            case 'attr':
                $attribute = isset($props['attr']) ? $props['attr'] : '';

                return $this->attr($attribute, $fallback, $firstOnly);

            case 'count':
                return $this->count();

            default:
                return $this->text($fallback, $firstOnly);
        }
    }

    /**
     * Shared traversal behind html(), text() and attr().
     *
     * @param   mixed     $fallback   The value to return if there are no nodes
     * @param   boolean   $firstOnly  Read only the first node, or all of them
     * @param   callable  $reader     Receives a node and returns the raw string to clean
     *
     * @return  mixed  $fallback, a cleaned string, or a list of cleaned strings
     */
    private function readNodes($fallback, $firstOnly, callable $reader)
    {
        if (!$this->nodes || !$this->nodes->length)
        {
            return $fallback;
        }

        if ($firstOnly)
        {
            return $this->cleanText($reader($this->nodes->item(0)));
        }

        $result = [];

        foreach ($this->nodes as $node)
        {
            $result[] = $this->cleanText($reader($node));
        }

        return $result;
    }

    /**
     * Helper method to clean the text
     *
     * @param   string  $text  The text to clean
     *
     * @return  string  The text as returned by StringHelper::trim()
     */
    private function cleanText($text)
    {
        return StringHelper::trim($text);
    }

    /**
     * Transforms a CSS Selector to an XPath expression
     *
     * Supports whitespace-separated (descendant) tokens, each made of an optional
     * tag name followed by any mix of .class and #id parts, e.g. "div.box p#intro.lead".
     * Tokens without "." or "#" are used verbatim as the XPath node test.
     *
     * @param   string  $selector  The CSS selector to transform
     *
     * @return  string  XPath expression; empty when the selector holds no tokens
     */
    private function cssSelectorToXPath($selector)
    {
        // Split on any run of whitespace so repeated spaces don't yield empty steps
        $tokens = preg_split('/\s+/', trim((string) $selector), -1, PREG_SPLIT_NO_EMPTY);

        $xpath = '';

        foreach ($tokens as $token)
        {
            // Every step is a descendant step. A "./" prefix must not be used here:
            // glued to a previous step it becomes part of that step's name.
            $xpath .= '//' . $this->compoundSelectorToXPath($token);
        }

        return $xpath;
    }

    /**
     * Transforms one whitespace-free selector token into an XPath step (without axis)
     *
     * @param   string  $token  e.g. "div", ".class1.class2", "p#intro.lead"
     *
     * @return  string  e.g. "div", "*[...]", "p[@id="intro" and ...]"
     */
    private function compoundSelectorToXPath($token)
    {
        // No class or ID marker: nothing to translate
        if (!preg_match('/^([^.#]*)((?:[.#][^.#]*)+)$/', $token, $matches))
        {
            return $token;
        }

        $tag = $matches[1] !== '' ? $matches[1] : '*';

        preg_match_all('/([.#])([^.#]*)/', $matches[2], $parts, PREG_SET_ORDER);

        $conditions = [];

        foreach ($parts as $part)
        {
            if ($part[1] === '#')
            {
                $conditions[] = '@id="' . $part[2] . '"';
                continue;
            }

            // Pad with spaces on both sides so "btn" does not match "btn-large"
            $conditions[] = 'contains(concat(" ", normalize-space(@class), " "), " ' . $part[2] . ' ")';
        }

        return $tag . '[' . implode(' and ', $conditions) . ']';
    }

    /**
     * Normalise a string to valid UTF-8 and encode every non-ASCII character as a
     * numeric HTML entity.
     *
     * @param   string  $string
     *
     * @return  string
     */
    private function stringToUTF8($string)
    {
        // A UTF-8 to UTF-8 conversion replaces invalid byte sequences instead of
        // failing, so a single bad byte does not empty the whole content.
        $string = mb_convert_encoding((string) $string, 'UTF-8', 'UTF-8');

        return mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    }

    /**
     * Helper method to return the outer or inner HTML of a node
     *
     * @param   \DOMNode  $node   The node object
     * @param   boolean   $inner  True for the inner HTML (children only), false for the outer HTML
     *
     * @return  string    The HTML of the node
     */
    private function getNodeHTML($node, $inner = true)
    {
        if (!$inner)
        {
            return $node->ownerDocument->saveHTML($node);
        }

        $html = '';

        foreach ($node->childNodes as $child)
        {
            $html .= $node->ownerDocument->saveHTML($child);
        }

        return $html;
    }
}
|
||||
Reference in New Issue
Block a user