primo commit

2024-12-17 17:34:10 +01:00
commit e650f8df99
16435 changed files with 2451012 additions and 0 deletions
--- a/administrator/components/com_finder/src/Indexer/Parser/Html.php
+++ b/administrator/components/com_finder/src/Indexer/Parser/Html.php
@ -0,0 +1,158 @@
+<?php
+
+/**
+ * @package     Joomla.Administrator
+ * @subpackage  com_finder
+ *
+ * @copyright   (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
+ * @license     GNU General Public License version 2 or later; see LICENSE.txt
+ */
+
+namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
+
+use Joomla\Component\Finder\Administrator\Indexer\Parser;
+
+// phpcs:disable PSR1.Files.SideEffects
+\defined('_JEXEC') or die;
+// phpcs:enable PSR1.Files.SideEffects
+
+/**
+ * HTML Parser class for the Finder indexer package.
+ *
+ * @since  2.5
+ */
+class Html extends Parser
+{
+    /**
+     * Method to parse HTML input and extract the plain text.
+     *
+     * The input is cleaned in a fixed order: invalid UTF-8 is dropped, the
+     * <head> section and all <script>, <style> and <noscript> blocks are
+     * removed, HTML entities are decoded, block-level tags are padded with a
+     * space so adjacent words do not run together, and the remaining tags are
+     * stripped. The result is then handed to the parent parse() method.
+     *
+     * @param   string  $input  The input to parse.
+     *
+     * @return  string  The plain text input.
+     *
+     * @since   2.5
+     */
+    public function parse($input)
+    {
+        // Strip invalid UTF-8 characters: with the substitute character set to
+        // 'none', a UTF-8 to UTF-8 conversion drops malformed sequences. The
+        // previous ini value is restored afterwards.
+        $oldSetting = \ini_get('mbstring.substitute_character');
+        ini_set('mbstring.substitute_character', 'none');
+        $input = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
+        ini_set('mbstring.substitute_character', $oldSetting);
+
+        // Remove anything between <head> and </head> tags.  Do this first
+        // because there might be <script> or <style> tags nested inside.
+        $input = $this->removeBlocks($input, '<head>', '</head>');
+
+        // Convert <style> and <noscript> tags to <script> tags
+        // so we can remove them efficiently.
+        $search = [
+            '<style', '</style',
+            '<noscript', '</noscript',
+        ];
+        $replace = [
+            '<script', '</script',
+            '<script', '</script',
+        ];
+
+        // Tag names are case-insensitive in HTML and removeBlocks() matches
+        // case-insensitively, so the conversion must do the same; otherwise
+        // the contents of e.g. <STYLE>...</STYLE> would end up in the output.
+        $input = str_ireplace($search, $replace, $input);
+
+        // Strip all script blocks.
+        $input = $this->removeBlocks($input, '<script', '</script>');
+
+        // Decode HTML entities.
+        $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
+
+        // Convert entities equivalent to spaces to actual spaces. The decode
+        // step above has already turned &nbsp; / &#160; into the U+00A0
+        // character ("\xC2\xA0" in UTF-8), so that character is replaced too;
+        // the textual forms only remain here if they were double-encoded.
+        $input = str_replace(['&nbsp;', '&#160;', "\xC2\xA0"], ' ', $input);
+
+        // Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
+        // e.g. 'all<h1><em>m</em>obile  List</h1>' will become 'all mobile  List'
+        $input = preg_replace('/(<|<\/)(' .
+            'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
+            'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
+            'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
+            ')\b/i', ' $1$2', $input);
+
+        // Strip HTML tags.
+        $input = strip_tags($input);
+
+        return parent::parse($input);
+    }
+
+    /**
+     * Method to process HTML input and extract the plain text.
+     *
+     * Collapses every run of white space into a single space.
+     *
+     * @param   string  $input  The input to process.
+     *
+     * @return  string  The plain text input.
+     *
+     * @since   2.5
+     */
+    protected function process($input)
+    {
+        // Replace any amount of white space with a single space.
+        $collapsed = preg_replace('#\s+#u', ' ', $input);
+
+        // preg_replace() returns null on failure (for instance when the
+        // subject is not valid UTF-8 under the /u modifier). Return the text
+        // unchanged in that case rather than handing null to the caller.
+        return $collapsed ?? $input;
+    }
+
+    /**
+     * Method to remove blocks of text between a start and an end tag.
+     * Each block removed is effectively replaced by a single space.
+     *
+     * Tags are matched case-insensitively. If a start tag has no matching
+     * end tag, scanning stops there and the rest of the string (including
+     * that start tag) is kept, preceded by a single space.
+     *
+     * Note: The start tag and the end tag must be different.
+     * Note: Blocks must not be nested.
+     * Note: Offsets are byte based, which is consistent across the
+     *       strlen/stripos/substr calls used here.
+     *
+     * @param   string  $input     String to be processed.
+     * @param   string  $startTag  String representing the start tag.
+     * @param   string  $endTag    String representing the end tag.
+     *
+     * @return  string with blocks removed.
+     *
+     * @since   3.4
+     */
+    private function removeBlocks($input, $startTag, $endTag)
+    {
+        $return         = '';
+        $offset         = 0;
+        $startTagLength = \strlen($startTag);
+        $endTagLength   = \strlen($endTag);
+
+        // Find the first start tag.
+        $start = stripos($input, $startTag);
+
+        // If no start tags were found, return the string unchanged.
+        if ($start === false) {
+            return $input;
+        }
+
+        // Look for all blocks defined by the start and end tags.
+        while ($start !== false) {
+            // Accumulate the substring up to the start tag, followed by the
+            // single space that stands in for the removed block.
+            $return .= substr($input, $offset, $start - $offset) . ' ';
+
+            // Look for an end tag corresponding to the start tag.
+            $end = stripos($input, $endTag, $start + $startTagLength);
+
+            // If no corresponding end tag, keep the remainder of the string.
+            if ($end === false) {
+                // Fix the offset so part of the string is not duplicated.
+                $offset = $start;
+                break;
+            }
+
+            // Advance the start position to just past the end tag.
+            $offset = $end + $endTagLength;
+
+            // Look for the next start tag and loop.
+            $start = stripos($input, $startTag, $offset);
+        }
+
+        // Add in the final substring after the last end tag.
+        $return .= substr($input, $offset);
+
+        return $return;
+    }
+}
--- a/administrator/components/com_finder/src/Indexer/Parser/Rtf.php
+++ b/administrator/components/com_finder/src/Indexer/Parser/Rtf.php
@ -0,0 +1,47 @@
+<?php
+
+/**
+ * @package     Joomla.Administrator
+ * @subpackage  com_finder
+ *
+ * @copyright   (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
+ * @license     GNU General Public License version 2 or later; see LICENSE.txt
+ */
+
+namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
+
+use Joomla\Component\Finder\Administrator\Indexer\Parser;
+
+// phpcs:disable PSR1.Files.SideEffects
+\defined('_JEXEC') or die;
+// phpcs:enable PSR1.Files.SideEffects
+
+/**
+ * RTF Parser class for the Finder indexer package.
+ *
+ * @since  2.5
+ */
+class Rtf extends Parser
+{
+    /**
+     * Reduce RTF markup to an approximation of its plain text content.
+     *
+     * The steps run in order: picture groups are dropped, group braces and
+     * backslash-escaped line breaks are flattened, and the remaining
+     * backslash-introduced control sequences are blanked out.
+     *
+     * @param   string  $input  The input to process.
+     *
+     * @return  string  The plain text input.
+     *
+     * @since   2.5
+     */
+    protected function process($input)
+    {
+        // Drop "{\pict ...}" groups (up to the first closing brace) while the
+        // braces are still present in the text.
+        $withoutPictures = preg_replace('#{\\\pict[^}]*}#mi', '', $input);
+
+        // Braces become spaces; a backslash directly followed by a newline
+        // becomes a plain newline.
+        $needles      = ['{', '}', "\\\n"];
+        $substitutes  = [' ', ' ', "\n"];
+        $withoutGroup = str_replace($needles, $substitutes, $withoutPictures);
+
+        // Blank out everything from a backslash up to the nearest semicolon.
+        $withoutTerminated = preg_replace('#\\\([^;]+?);#m', ' ', $withoutGroup);
+
+        // Blank out a backslash followed by a run of letters, digits or apostrophes.
+        return preg_replace('#\\\[\'a-zA-Z0-9]+#mi', ' ', $withoutTerminated);
+    }
+}
--- a/administrator/components/com_finder/src/Indexer/Parser/Txt.php
+++ b/administrator/components/com_finder/src/Indexer/Parser/Txt.php
@ -0,0 +1,39 @@
+<?php
+
+/**
+ * @package     Joomla.Administrator
+ * @subpackage  com_finder
+ *
+ * @copyright   (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
+ * @license     GNU General Public License version 2 or later; see LICENSE.txt
+ */
+
+namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
+
+use Joomla\Component\Finder\Administrator\Indexer\Parser;
+
+// phpcs:disable PSR1.Files.SideEffects
+\defined('_JEXEC') or die;
+// phpcs:enable PSR1.Files.SideEffects
+
+/**
+ * Text Parser class for the Finder indexer package.
+ *
+ * @since  2.5
+ */
+class Txt extends Parser
+{
+    /**
+     * Pass plain text input through untouched.
+     *
+     * Plain text needs no markup removal, so the value received is
+     * the value returned.
+     *
+     * @param   string  $input  The input to process.
+     *
+     * @return  string  The plain text input.
+     *
+     * @since   2.5
+     */
+    protected function process($input)
+    {
+        // Nothing to strip or convert for plain text.
+        $plainText = $input;
+
+        return $plainText;
+    }
+}