primo commit

This commit is contained in:
2024-12-17 17:34:10 +01:00
commit e650f8df99
16435 changed files with 2451012 additions and 0 deletions

View File

@ -0,0 +1,8 @@
<?php
namespace Wamania\Snowball;
/**
 * Exception type of the Wamania\Snowball package.
 *
 * Adds no behaviour of its own on top of \Exception.
 * NOTE(review): presumably thrown when a requested stemmer cannot be found — confirm against the callers.
 */
class NotFoundException extends \Exception
{
}

View File

@ -0,0 +1,304 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/catalan/stemmer.html
* @author Orestes Sanchez Benavente <orestes@estotienearreglo.es>
*
*
* Some fine tuning was necessary in this implementation of the original catalan stemmer algorithm in Snowball:
*
* 1. Some suffix sets have overlapping items, so here all items are sorted by decreasing size, to
* prevent that a shorter suffix will skip a larger one.
*
* 2. Some alternatives (`or` operator in Snowball) in the original algorithm have
* been rearranged to make sure they are applied in the right order.
*
* Based on the reference Snowball implementation by Israel Olalla of iSOCO
*/
class Catalan extends Stem
{
/**
* All catalan vowels
*/
protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ï', 'ò', 'ü'];
protected static $standard_suffix_1a = [
'allengües', 'ativitats', 'bilitats', 'ionistes', 'ialistes', 'ialismes', 'ativitat', 'atòries', 'isament',
'bilitat', 'ivitats', 'ionisme', 'ionista', 'ialista', 'ialisme', 'íssimes', 'formes', 'ivisme', 'aments',
'nça', 'ificar', 'idores', 'ancies', 'atòria', 'ivitat', 'encies', 'ències', 'atives', 'íssima', 'íssims',
'ictes', 'eries', 'itats', 'itzar', 'ament', 'ments', 'sfera', 'ícies', 'àries', 'cions', 'ístic', 'issos',
'íssem', 'íssiu', 'issem', 'isseu', 'ísseu', 'dores', 'adura', 'ívola', 'ables', 'adors', 'idors', 'adora',
'doras', 'dures', 'ancia', 'toris', 'encia', 'ència', 'ïtats', 'atius', 'ativa', 'ibles', 'asses', 'assos',
'íssim', 'ìssem', 'ìsseu', 'ìssin', 'ismes', 'istes', 'inies', 'íinia', 'ínies', 'trius', 'atge', 'icte',
'ells', 'ella', 'essa', 'eres', 'ines', 'able', 'itat', 'ives', 'ment', 'amen', 'iste', 'aire', 'eria',
'eses', 'esos', 'ícia', 'icis', 'ícis', 'ària', 'alla', 'nces', 'enca', 'issa', 'dora', 'dors', 'bles',
'ívol', 'egar', 'ejar', 'itar', 'ació', 'ants', 'tori', 'ions', 'isam', 'ores', 'aris', 'ïtat', 'atiu',
'ible', 'assa', 'ents', 'imes', 'isme', 'ista', 'inia', 'ites', 'triu', 'oses', 'osos', 'ient', 'otes',
'ell', 'esc', 'ets', 'eta', 'ers', 'ina', 'iva', 'ius', 'fer', 'als', 'era', 'ana', 'esa', 'ici', 'íci',
'ció', 'dor', 'all', 'enc', 'osa', 'ble', 'dís', 'dur', 'ant', 'ats', 'ota', 'ors', 'ora', 'ari', 'uts',
'uds', 'ent', 'ims', 'ima', 'ita', 'ar', 'és', 'ès', 'et', 'ls', 'ió', 'ot', 'al', 'or', 'il', 'ís', 'ós',
'ud', 'ots', 'ó'
];
protected static $attached_pronoun = [
'selas', 'selos', '\'hi', '\'ho', '\'ls', '-les', '-nos', '\'ns', 'sela', 'selo', '\'s', '\'l', '-ls', '-la',
'-li', 'vos', 'nos', '-us', '\'n', '-ns', '\'m', '-me', '-te', '\'t', 'los', 'las', 'les', 'ens', 'se', 'us',
'-n', '-m', 'li', 'lo', 'me', 'le', 'la', 'ho', 'hi'
];
protected static $verb_suffixes = [
'aríamos', 'eríamos', 'iríamos', 'eresseu', 'iéramos', 'iésemos', 'adores', 'aríais', 'aremos', 'eríais',
'eremos', 'iríais', 'iremos', 'ierais', 'ieseis', 'asteis', 'isteis', 'ábamos', 'áramos', 'ásemos', 'isquen',
'esquin', 'esquis', 'esques', 'esquen', 'ïsquen', 'ïsques', 'adora', 'adors', 'arían', 'arías', 'arian',
'arien', 'aries', 'aréis', 'erían', 'erías', 'eréis', 'erass', 'irían', 'irías', 'iréis', 'asseu', 'esseu',
'àsseu', 'àssem', 'àssim', 'àssiu', 'essen', 'esses', 'assen', 'asses', 'assim', 'assiu', 'éssen', 'ésseu',
'éssim', 'éssiu', 'éssem', 'aríem', 'aríeu', 'eixer', 'eixes', 'ieran', 'iesen', 'ieron', 'iendo', 'essin',
'essis', 'assin', 'assis', 'essim', 'èssim', 'èssiu', 'ieras', 'ieses', 'abais', 'arais', 'aseis', 'íamos',
'irien', 'iries', 'irìem', 'irìeu', 'iguem', 'igueu', 'esqui', 'eixin', 'eixis', 'eixen', 'iríem', 'iríeu',
'atges', 'issen', 'isses', 'issin', 'issis', 'issiu', 'issim', 'ïssin', 'íssiu', 'íssim', 'ïssis', 'ïguem',
'ïgueu', 'ïssen', 'ïsses', 'itzeu', 'itzis', 'ador', 'ents', 'udes', 'eren', 'arán', 'arás', 'aria', 'aràs',
'aría', 'arés', 'erán', 'erás', 'ería', 'erau', 'irán', 'irás', 'iría', 'írem', 'íreu', 'aves', 'avem', 'ávem',
'àvem', 'àveu', 'áveu', 'aven', 'ares', 'àrem', 'àreu', 'àren', 'areu', 'aren', 'tzar', 'ides', 'ïdes', 'ades',
'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'abas', 'adas', 'idas', 'aras', 'ases', 'íais',
'ados', 'idos', 'amos', 'imos', 'ques', 'iran', 'irem', 'iren', 'ires', 'ireu', 'iria', 'iràs', 'eixi', 'eixo',
'isin', 'isis', 'esca', 'isca', 'ïsca', 'ïren', 'ïres', 'ïxen', 'ïxes', 'ixen', 'ixes', 'inin', 'inis', 'ineu',
'itza', 'itzi', 'itzo', 'itzà', 'arem', 'ent', 'arà', 'ará', 'ara', 'aré', 'erá', 'eré', 'irá', 'iré', 'íeu',
'ies', 'íem', 'ìeu', 'ien', 'uda', 'ava', 'ats', 'ant', 'ïen', 'ams', 'ïes', 'dre', 'eix', 'ïda', 'aba', 'ada',
'ida', 'its', 'ids', 'ase', 'ían', 'ado', 'ido', 'ieu', 'ess', 'ass', 'ías', 'áis', 'ira', 'irà', 'irè', 'sis',
'sin', 'int', 'isc', 'ïsc', 'ïra', 'ïxo', 'ixo', 'ixa', 'ini', 'itz', 'iïn', 're', 'ie', 'er', 'ia', 'at', 'ut',
'au', 'ïm', 'ïu', 'és', 'en', 'es', 'em', 'am', 'ïa', 'it', 'ït', 'ía', 'ad', 'ed', 'id', 'an', 'ió', 'ar',
'ir', 'as', 'ii', 'io', 'ià', 'ís', 'ïx', 'ix', 'in', 'às', 'iï', 'iïs', 'í'
];
protected static $residual_suffixes = [
'itz', 'it', 'os', 'eu', 'iu', 'is', 'ir', 'ïn', 'ïs', 'a', 'o', 'á', 'à', 'í', 'ó', 'e', 'é', 'i', 's', 'ì',
'ï'
];
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes R1 and R2 and then applies, in order:
 * step 0 (attached pronoun), step 1a (standard suffix), step 1b (verb suffix,
 * only when step 1a left the word untouched), step 2 (residual suffix) and
 * the final accent clean-up.
 *
 * @param string $word word to stem, UTF-8 encoded
 * @return string the stemmed word
 * @throws \Exception if $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // Catalan stemmer does not use Rv
    $this->r1();
    $this->r2();

    // Step 0: Attached pronoun
    $this->step0();

    // Snapshot taken after step 0, used to detect whether step 1a changed the word.
    $beforeStandardSuffix = $this->word;

    // Step 1a: Standard suffix
    $this->step1a();

    // Step 1b: Verb suffix
    // Do step 1b if no ending was removed by step 1a.
    // Strict comparison: both operands are strings and must be compared as
    // strings, never as numbers.
    if ($this->word === $beforeStandardSuffix) {
        $this->step1b();
    }

    $this->step2();
    $this->finish();

    return $this->word;
}
/**
 * Step 0: Attached pronoun
 *
 * Looks for one of the suffixes of static::$attached_pronoun at the end of
 * the word and deletes it when it starts inside R1.
 *
 * @return bool true when a pronoun was deleted, false otherwise
 */
private function step0()
{
    $start = $this->search(static::$attached_pronoun);

    // Nothing found, or found outside of R1: leave the word alone.
    if ($start === false || !$this->inR1($start)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $start);

    return true;
}
/**
 * Step 1a: Standard suffix
 *
 * Tries the standard-suffix groups one after another and stops at the first
 * group that has a suffix matching the end of the word. The matched suffix is
 * only deleted/replaced when it starts inside the region the group requires
 * (R1 or R2).
 *
 * The groups are deliberately not tried in the order of the Snowball
 * definition: a group whose suffixes are longer forms of a suffix of the
 * generic 1a.1 list has to be tried before 1a.1, otherwise the shorter 1a.1
 * suffix would always match first.
 *
 * @return bool true when a suffix of any group matched (even if the word was
 *              left unchanged because the suffix lies outside the region),
 *              false when no suffix matched
 */
private function step1a()
{
    // Step 1a.5.
    // quíssims quíssimes quíssima quíssim
    // replace with c if in R1
    //
    // Tried before 1a.1: that list contains `íssim`, `íssima`, `íssims` and `íssimes`, which match every
    // word ending in one of the suffixes below and would make this rule unreachable.
    if (($position = $this->search(['quíssima', 'quíssims', 'quíssimes', 'quíssim'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(quíssima|quíssims|quíssimes|quíssim)$#u', 'c', $this->word);
        }

        return true;
    }

    // Step 1a.2.
    // acions ada ades
    // delete if in R2
    //
    // Tried before 1a.1, since they overlap on `cions` (1a.1) and `acions` (1a.2)
    if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // Step 1a.1.
    // every suffix listed in self::$standard_suffix_1a
    // delete if in R1
    if (($position = $this->search(self::$standard_suffix_1a)) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // Step 1a.3.
    // logía logíes logia logies logi logis lógica lógics lógiques
    // replace with log if in R2
    if (($position = $this->search(
        ['logía', 'logíes', 'logia', 'logies', 'logis', 'lógica', 'lógics', 'lógiques', 'logi']
    )) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace(
                '#(logía|logíes|logia|logies|logis|lógica|lógics|lógiques|logi)$#u', 'log', $this->word
            );
        }

        return true;
    }

    // Step 1a.4.
    // ic ica ics iques
    // replace with ic if in R2
    if (($position = $this->search(['ics', 'ica', 'iques', 'ic'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(ics|ica|iques|ic)$#u', 'ic', $this->word);
        }

        return true;
    }

    return false;
}
/**
 * Step 1b: Verb suffixes
 *
 * Two rules, tried in order; the first one whose suffix matches the end of
 * the word ends the step:
 *  - 1b.1: any suffix of static::$verb_suffixes, deleted if it is in R1
 *  - 1b.2: `ando`, deleted if it is in R2
 *
 * @return bool true when one of the suffixes matched (whether or not it was
 *              deleted), false otherwise
 */
private function step1b()
{
    // Step 1b.1: verb suffix table, delete if in R1
    $start = $this->search(static::$verb_suffixes);
    if ($start !== false) {
        if ($this->inR1($start)) {
            $this->word = UTF8::substr($this->word, 0, $start);
        }

        return true;
    }

    // Step 1b.2: ando, delete if in R2
    $start = $this->search(['ando']);
    if ($start === false) {
        return false;
    }

    if ($this->inR2($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);
    }

    return true;
}
/**
 * Step 2: residual suffix
 *
 * Two rules, tried in order; the first one whose suffix matches the end of
 * the word ends the step:
 *  - 2.1: any suffix of static::$residual_suffixes, deleted if it is in R1
 *  - 2.2: `iqu`, replaced with `ic` if it is in R1
 *
 * @return bool true when one of the suffixes matched (whether or not the word
 *              was changed), false otherwise
 */
private function step2()
{
    // Step 2.1: residual suffix, delete if in R1
    $start = $this->search(static::$residual_suffixes);
    if ($start !== false) {
        if ($this->inR1($start)) {
            $this->word = UTF8::substr($this->word, 0, $start);
        }

        return true;
    }

    // Step 2.2: iqu, replace with ic if in R1
    $start = $this->search(['iqu']);
    if ($start === false) {
        return false;
    }

    if ($this->inR1($start)) {
        $this->word = preg_replace('#(iqu)$#u', 'ic', $this->word);
    }

    return true;
}
/**
 * And finally:
 * strip the accents from the vowels and turn the middle dot of the
 * geminated l (`l·l`) into a plain dot.
 */
private function finish()
{
    // accented character => replacement
    $replacements = [
        'á' => 'a',
        'é' => 'e',
        'í' => 'i',
        'ó' => 'o',
        'ú' => 'u',
        'à' => 'a',
        'è' => 'e',
        'ì' => 'i',
        'ò' => 'o',
        'ï' => 'i',
        'ü' => 'u',
        '·' => '.',
    ];

    $this->word = UTF8::str_replace(
        array_keys($replacements),
        array_values($replacements),
        $this->word
    );
}
}

View File

@ -0,0 +1,152 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/danish/stemmer.html
* @author wamania
*
*/
class Danish extends Stem
{
/**
* All danish vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'æ', 'å', 'ø');
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes R1 (moved forward if needed so that at
 * least three letters precede it) and runs steps 1 to 4.
 *
 * @throws \Exception if $word is not valid UTF-8
 */
public function stem($word): string
{
    // Only UTF-8 input is supported.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // R2 is not used by this stemmer; only R1 is computed.
    $this->r1();

    // The region before R1 must contain at least this many letters.
    $minimumPrefix = 3;
    if ($this->r1Index < $minimumPrefix) {
        $this->r1Index = $minimumPrefix;
        $this->r1 = UTF8::substr($this->word, $minimumPrefix);
    }

    // Steps 1, 2, 3 and 4, in that order.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();

    return $this->word;
}
/**
 * Tell whether a word ends with a valid s-ending, i.e. one of
 * a b c d f g h j k l m n o p r t v y z å
 *
 * @param string $word the word part that precedes the `s` suffix
 * @return boolean true when the last letter of $word is a valid s-ending
 */
private function hasValidSEnding($word)
{
    $lastLetter = UTF8::substr($word, -1, 1);

    // Strict membership test: letters are compared as strings only.
    return in_array(
        $lastLetter,
        array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'),
        true
    );
}
/**
 * Step 1
 * Search for one of the following suffixes in R1, and perform the action indicated.
 *
 * @return bool true when one of the suffixes was found in R1 (whether or not
 *              the word was changed), false otherwise
 */
private function step1()
{
    // hed ethed ered e erede ende erende ene erne ere en heden eren er heder erer
    // heds es endes erendes enes ernes eres ens hedens erens ers ets erets et eret
    // delete
    $position = $this->searchIfInR1(array(
        'erendes', 'erende', 'hedens', 'erede', 'ethed', 'heden', 'endes', 'erets', 'heder', 'ernes',
        'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds',
        'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e'
    ));
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // s
    // delete if preceded by a valid s-ending
    $position = $this->searchIfInR1(array('s'));
    if ($position !== false) {
        $withoutS = UTF8::substr($this->word, 0, $position);
        if ($this->hasValidSEnding($withoutS)) {
            $this->word = $withoutS;
        }

        return true;
    }

    // No suffix matched: return an explicit boolean instead of falling off the end.
    return false;
}
/**
 * Step 2
 * If the word ends, within R1, with one of gd dt gt kt, drop its last letter.
 */
private function step2()
{
    $found = $this->searchIfInR1(array('gd', 'dt', 'gt', 'kt'));
    if ($found === false) {
        return;
    }

    // Only the final letter of the pair is removed.
    $this->word = UTF8::substr($this->word, 0, -1);
}
/**
 * Step 3:
 *  - if the word ends igst, remove the final st;
 *  - then, if one of elig lig ig els is found in R1, delete it and repeat step 2;
 *  - otherwise, if løst is found in R1, replace it with løs.
 *
 * @return bool true when the word was changed by this step, false otherwise
 */
private function step3()
{
    $changed = false;

    // If the word ends igst, remove the final st.
    if ($this->search(array('igst')) !== false) {
        $this->word = UTF8::substr($this->word, 0, -2);
        $changed = true;
    }

    // ig lig elig els
    // delete, and then repeat step 2
    $position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'));
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);
        $this->step2();

        return true;
    }

    // løst
    // replace with løs
    if ($this->searchIfInR1(array('løst')) !== false) {
        $this->word = UTF8::substr($this->word, 0, -1);

        return true;
    }

    // Explicit boolean on every path instead of an implicit null.
    return $changed;
}
/**
 * Step 4: undouble
 * If the last letter of the word is in R1, is not a vowel and is equal to
 * the letter before it, remove it.
 *
 * @return bool false when the last letter is outside R1 or is a vowel,
 *              true otherwise (whether or not a letter was removed)
 */
private function step4()
{
    $lastIndex = UTF8::strlen($this->word) - 1;
    if (!$this->inR1($lastIndex)) {
        return false;
    }

    $final = UTF8::substr($this->word, -1, 1);
    if (in_array($final, self::$vowels)) {
        return false;
    }

    // Same consonant twice at the end: keep only one.
    if ($final == UTF8::substr($this->word, -2, 1)) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
}

View File

@ -0,0 +1,306 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
* @author wamania
*
*/
class Dutch extends Stem
{
/**
* All dutch vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'è');
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, strips umlauts and acute accents, marks the
 * consonant-like y and i as Y and I, computes R1/R2 and runs steps 1 to 4
 * before turning Y and I back into lower case.
 *
 * @throws \Exception if $word is not valid UTF-8
 */
public function stem($word)
{
    // Only UTF-8 input is supported.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    // Lower-case, then remove all umlaut and acute accents.
    $lowered = UTF8::strtolower($word);
    $this->word = UTF8::str_replace(
        array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
        array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
        $lowered
    );

    $this->plainVowels = implode('', self::$vowels);
    // Regex character class matching any single vowel.
    $vowelClass = '[' . $this->plainVowels . ']';

    // Put initial y, y after a vowel, and i between vowels into upper case.
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#(' . $vowelClass . ')y#u', '$1Y', $this->word);
    $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

    // R1 and R2 are first set up in the standard way ...
    $this->r1();
    $this->r2();

    // ... then R1 is moved so that the region before it contains at least 3 letters.
    $minimumPrefix = 3;
    if ($this->r1Index < $minimumPrefix) {
        $this->r1Index = $minimumPrefix;
        $this->r1 = UTF8::substr($this->word, $minimumPrefix);
    }

    // Steps 1, 2, 3a, 3b and 4; step 3b needs to know whether step 2 removed an e.
    $this->step1();
    $removedE = $this->step2();
    $this->step3a();
    $this->step3b($removedE);
    $this->step4();
    $this->finish();

    return $this->word;
}
/**
 * Tell whether a word ends with a valid s-ending, i.e. a non-vowel other than j.
 *
 * @param string $word the word part that precedes the suffix
 * @return boolean true when the last letter of $word is neither a vowel nor j
 */
private function hasValidSEnding($word)
{
    $lastLetter = UTF8::substr($word, -1, 1);

    // Strict membership test: letters are compared as strings only.
    return !in_array($lastLetter, array_merge(self::$vowels, array('j')), true);
}
/**
 * Tell whether a word ends with a valid en-ending, i.e. a non-vowel, and not gem.
 *
 * @param string $word the word part that precedes the suffix
 * @return boolean false when $word ends with a vowel or with `gem`, true otherwise
 */
private function hasValidEnEnding($word)
{
    $lastLetter = UTF8::substr($word, -1, 1);
    if (in_array($lastLetter, self::$vowels, true)) {
        return false;
    }

    // Strict string comparison against the excluded ending.
    return UTF8::substr($word, -3, 3) !== 'gem';
}
/**
 * Undouble the ending: when the word ends kk, dd or tt, remove its last letter.
 */
private function unDoubling()
{
    $found = $this->search(array('kk', 'dd', 'tt'));
    if ($found === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, -1);
}
/**
 * Step 1
 * Three suffix groups are tried in order; the first group with a suffix
 * matching the end of the word ends the step:
 *  - heden: replace with heid if in R1
 *  - ene en: delete if in R1 and preceded by a valid en-ending, then undouble the ending
 *  - se s: delete if in R1 and preceded by a valid s-ending
 *
 * @return bool true when a suffix of one of the groups matched (whether or
 *              not the word was changed), false otherwise
 */
private function step1()
{
    // heden
    $start = $this->search(array('heden'));
    if ($start !== false) {
        if ($this->inR1($start)) {
            $this->word = preg_replace('#(heden)$#u', 'heid', $this->word);
        }

        return true;
    }

    // en ene
    $start = $this->search(array('ene', 'en'));
    if ($start !== false) {
        if ($this->inR1($start)) {
            $remainder = UTF8::substr($this->word, 0, $start);
            if ($this->hasValidEnEnding($remainder)) {
                $this->word = $remainder;
                $this->unDoubling();
            }
        }

        return true;
    }

    // s se
    $start = $this->search(array('se', 's'));
    if ($start === false) {
        return false;
    }

    if ($this->inR1($start)) {
        $remainder = UTF8::substr($this->word, 0, $start);
        if ($this->hasValidSEnding($remainder)) {
            $this->word = $remainder;
        }
    }

    return true;
}
/**
 * Step 2
 * Delete suffix e if in R1 and preceded by a non-vowel, and then undouble the ending.
 *
 * @return bool true when the e was deleted, false otherwise
 */
private function step2()
{
    $start = $this->search(array('e'));
    if ($start === false || !$this->inR1($start)) {
        return false;
    }

    // Letter right before the final e.
    $preceding = UTF8::substr($this->word, -2, 1);
    if (in_array($preceding, self::$vowels)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $start);
    $this->unDoubling();

    return true;
}
/**
 * Step 3a: heid
 * Delete heid if in R2 and not preceded by c, and then treat a preceding en
 * like step 1 does: delete it if in R1 and preceded by a valid en-ending,
 * then undouble the ending.
 */
private function step3a()
{
    $heidStart = $this->search(array('heid'));
    if ($heidStart === false || !$this->inR2($heidStart)) {
        return;
    }

    // Letter right before `heid`.
    if (UTF8::substr($this->word, -5, 1) === 'c') {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $heidStart);

    $enStart = $this->search(array('en'));
    if ($enStart === false || !$this->inR1($enStart)) {
        return;
    }

    $remainder = UTF8::substr($this->word, 0, $enStart);
    if ($this->hasValidEnEnding($remainder)) {
        $this->word = $remainder;
        $this->unDoubling();
    }
}
/**
 * Step 3b: d-suffixes
 * The suffix groups below are tried in order; the first group with a suffix
 * matching the end of the word ends the step:
 *  - end ing: delete if in R2; then, if ig is found in R2, delete it unless
 *    it is preceded by e, otherwise undouble the ending
 *  - ig: delete if in R2 and not preceded by e
 *  - lijk: delete if in R2, and then repeat step 2
 *  - baar: delete if in R2
 *  - bar: delete if in R2 and if step 2 actually removed an e
 *
 * @param mixed $removedE truthy when step 2 removed an e
 * @return bool true when a suffix of one of the groups matched (whether or
 *              not the word was changed), false otherwise
 */
private function step3b($removedE)
{
    // end ing
    $start = $this->search(array('end', 'ing'));
    if ($start !== false) {
        if ($this->inR2($start)) {
            $this->word = UTF8::substr($this->word, 0, $start);

            $igStart = $this->searchIfInR2(array('ig'));
            if ($igStart === false) {
                $this->unDoubling();
            } elseif (UTF8::substr($this->word, -3, 1) !== 'e') {
                $this->word = UTF8::substr($this->word, 0, $igStart);
            }
        }

        return true;
    }

    // ig
    $start = $this->search(array('ig'));
    if ($start !== false) {
        if ($this->inR2($start) && UTF8::substr($this->word, -3, 1) !== 'e') {
            $this->word = UTF8::substr($this->word, 0, $start);
        }

        return true;
    }

    // lijk
    $start = $this->search(array('lijk'));
    if ($start !== false) {
        if ($this->inR2($start)) {
            $this->word = UTF8::substr($this->word, 0, $start);
            $this->step2();
        }

        return true;
    }

    // baar
    $start = $this->search(array('baar'));
    if ($start !== false) {
        if ($this->inR2($start)) {
            $this->word = UTF8::substr($this->word, 0, $start);
        }

        return true;
    }

    // bar
    $start = $this->search(array('bar'));
    if ($start === false) {
        return false;
    }

    if ($this->inR2($start) && $removedE) {
        $this->word = UTF8::substr($this->word, 0, $start);
    }

    return true;
}
/**
 * Step 4: undouble vowel
 * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u,
 * remove one of the vowels from V (for example, maan -> man, brood -> brod).
 *
 * @return bool true when a vowel was removed, false otherwise
 */
private function step4()
{
    // CVD spans four letters (C, the doubled vowel, D); shorter words cannot match.
    if (UTF8::strlen($this->word) < 4) {
        return false;
    }

    // D is a non-vowel other than I
    $d = UTF8::substr($this->word, -1, 1);
    if (in_array($d, array_merge(self::$vowels, array('I')), true)) {
        return false;
    }

    // V is double a, e, o or u
    $v = UTF8::substr($this->word, -3, 2);
    if (!in_array($v, array('aa', 'ee', 'oo', 'uu'), true)) {
        return false;
    }

    // C is a non-vowel
    $c = UTF8::substr($this->word, -4, 1);
    if (in_array($c, self::$vowels, true)) {
        return false;
    }

    // Keep everything up to and including C, then a single vowel, then D.
    $this->word = UTF8::substr($this->word, 0, -3) . UTF8::substr($v, 0, 1) . $d;

    return true;
}
/**
 * Finally
 * Turn the I and Y markers back into lower case.
 */
private function finish()
{
    $markers = array('I', 'Y');
    $letters = array('i', 'y');

    $this->word = UTF8::str_replace($markers, $letters, $this->word);
}
}

View File

@ -0,0 +1,602 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
* English Porter 2
*
* @link http://snowball.tartarus.org/algorithms/english/stemmer.html
* @author wamania
*
*/
class English extends Stem
{
/**
* All english vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
/**
 * {@inheritdoc}
 *
 * Words shorter than three characters are returned unchanged. Otherwise the
 * word is lower-cased and, unless exception1() or exception2() supplies the
 * result directly, run through steps 0 to 5.
 *
 * @param string $word word to stem, UTF-8 encoded
 * @return string the stemmed word
 * @throws \Exception if $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    if (UTF8::strlen($word) < 3) {
        return $word;
    }

    $this->word = UTF8::strtolower($word);

    // exceptions
    $exception = $this->exception1();
    if (null !== $exception) {
        return $exception;
    }

    $this->plainVowels = implode('', self::$vowels);

    // Remove initial ', if present.
    if (UTF8::substr($this->word, 0, 1) === "'") {
        $this->word = UTF8::substr($this->word, 1);
    }

    // Set initial y, or y after a vowel, to Y.
    // The initial-y test has to look at the word as it is *after* the
    // apostrophe was stripped, so that e.g. 'yes still gets its Y.
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);

    $this->r1();
    $this->exceptionR1();
    $this->r2();

    $this->step0();
    $this->step1a();

    // exceptions 2
    $exception = $this->exception2();
    if (null !== $exception) {
        return $exception;
    }

    $this->step1b();
    $this->step1c();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->finish();

    return $this->word;
}
/**
 * Step 0
 * Remove a trailing 's', 's or ' from the word.
 */
private function step0()
{
    $start = $this->search(array("'s'", "'s", "'"));
    if ($start === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $start);
}
/**
 * Step 1a
 * The rules below are tried in order; the first one whose suffix matches the
 * end of the word ends the step:
 *  - sses: replace by ss
 *  - ied ies: replace by i if preceded by more than one letter, otherwise by ie
 *  - us ss: do nothing
 *  - s: delete if the preceding word part contains a vowel not immediately before the s
 *
 * @return bool true when one of the suffixes matched (whether or not the word
 *              was changed), false otherwise
 */
private function step1a()
{
    // sses
    if ($this->search(array('sses')) !== false) {
        $this->word = preg_replace('#(sses)$#u', 'ss', $this->word);

        return true;
    }

    // ied ies (so ties -> tie, cries -> cri)
    $start = $this->search(array('ied', 'ies'));
    if ($start !== false) {
        $replacement = ($start > 1) ? 'i' : 'ie';
        $this->word = preg_replace('#(ied|ies)$#u', $replacement, $this->word);

        return true;
    }

    // us ss
    if ($this->search(array('us', 'ss')) !== false) {
        return true;
    }

    // s (so gas and this retain the s, gaps and kiwis lose it)
    $start = $this->search(array('s'));
    if ($start === false) {
        return false;
    }

    // Only the letters before the one that immediately precedes the s are inspected.
    $limit = $start - 1;
    for ($index = 0; $index < $limit; $index++) {
        if (in_array(UTF8::substr($this->word, $index, 1), self::$vowels)) {
            $this->word = UTF8::substr($this->word, 0, $start);
            break;
        }
    }

    return true;
}
/**
 * Step 1b
 *  - eedly eed: replace by ee if in R1
 *  - edly ingly ed ing: delete if the preceding word part contains a vowel,
 *    and after the deletion:
 *      if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
 *      if the word ends with a double remove the last letter (so hopp -> hop), or
 *      if the word is short, add e (so hop -> hope)
 *
 * @return bool true when one of the suffixes matched (whether or not the word
 *              was changed), false otherwise
 */
private function step1b()
{
    // eed eedly
    $start = $this->search(array('eedly', 'eed'));
    if ($start !== false) {
        if ($this->inR1($start)) {
            $this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word);
        }

        return true;
    }

    // ed edly ing ingly
    $start = $this->search(array('edly', 'ingly', 'ed', 'ing'));
    if ($start === false) {
        return false;
    }

    // Does the part before the suffix contain a vowel?
    $hasVowel = false;
    for ($index = 0; $index < $start; $index++) {
        if (in_array(UTF8::substr($this->word, $index, 1), self::$vowels)) {
            $hasVowel = true;
            break;
        }
    }

    if (!$hasVowel) {
        return true;
    }

    $this->word = UTF8::substr($this->word, 0, $start);

    if ($this->search(array('at', 'bl', 'iz')) !== false) {
        $this->word .= 'e';
    } elseif (($doubleStart = $this->search(self::$doubles)) !== false) {
        // keep the first letter of the double only
        $this->word = UTF8::substr($this->word, 0, ($doubleStart + 1));
    } elseif ($this->isShort()) {
        $this->word .= 'e';
    }

    return true;
}
/**
 * Step 1c
 * Replace suffix y or Y by i if preceded by a non-vowel which is not the
 * first letter of the word (so cry -> cri, by -> by, say -> say).
 *
 * @return bool true when the word is shorter than three letters or ends in
 *              y/Y (whether or not it was changed), false otherwise
 */
private function step1c()
{
    // With fewer than three letters, a preceding non-vowel would be the first letter.
    if (UTF8::strlen($this->word) < 3) {
        return true;
    }

    $start = $this->search(array('y', 'Y'));
    if ($start === false) {
        return false;
    }

    $preceding = UTF8::substr($this->word, $start - 1, 1);
    if (!in_array($preceding, self::$vowels)) {
        $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word);
    }

    return true;
}
/**
 * Step 2
 * The suffix groups below are tried in order; the first group with a suffix
 * matching the end of the word ends the step, and its action is performed
 * only if the suffix is in R1.
 *
 * @return bool true when a suffix of one of the groups matched (whether or
 *              not the word was changed), false otherwise
 */
private function step2()
{
    // Plain replacements, in the order they have to be tried:
    // array(suffixes to look for, replacement).
    $replacements = array(
        array(array('iveness', 'iviti'), 'ive'),
        array(array('ousli', 'ousness'), 'ous'),
        array(array('izer', 'ization'), 'ize'),
        array(array('ational', 'ation', 'ator'), 'ate'),
        array(array('biliti', 'bli'), 'ble'),
        array(array('lessli'), 'less'),
        array(array('fulness', 'fulli'), 'ful'),
        array(array('tional'), 'tion'),
        array(array('alism', 'aliti', 'alli'), 'al'),
        array(array('enci'), 'ence'),
        array(array('anci'), 'ance'),
        array(array('abli'), 'able'),
        array(array('entli'), 'ent'),
    );

    foreach ($replacements as $rule) {
        $suffixes = $rule[0];
        $replacement = $rule[1];

        $start = $this->search($suffixes);
        if ($start === false) {
            continue;
        }

        if ($this->inR1($start)) {
            // e.g. #(iveness|iviti)$#u
            $pattern = '#(' . implode('|', $suffixes) . ')$#u';
            $this->word = preg_replace($pattern, $replacement, $this->word);
        }

        return true;
    }

    // ogi: replace by og if preceded by l
    $start = $this->search(array('ogi'));
    if ($start !== false) {
        if ($this->inR1($start) && UTF8::substr($this->word, $start - 1, 1) == 'l') {
            $this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
        }

        return true;
    }

    // li: delete if preceded by a valid li-ending
    $start = $this->search(array('li'));
    if ($start === false) {
        return false;
    }

    if ($this->inR1($start)) {
        $preceding = UTF8::substr($this->word, ($start - 1), 1);
        if (in_array($preceding, self::$liEnding)) {
            $this->word = UTF8::substr($this->word, 0, $start);
        }
    }

    return true;
}
/**
 * Step 3:
 * The suffix groups below are looked up in R1, in order; the first group
 * that is found ends the step:
 *  - ational: replace by ate
 *  - tional: replace by tion
 *  - alize: replace by al
 *  - icate iciti ical: replace by ic
 *  - ful ness: delete
 *  - ative: delete if in R2
 *
 * @return bool true when the word was changed, false otherwise
 */
private function step3()
{
    // Plain replacements: array(suffixes to look for, replacement).
    $replacements = array(
        array(array('ational'), 'ate'),
        array(array('tional'), 'tion'),
        array(array('alize'), 'al'),
        array(array('icate', 'iciti', 'ical'), 'ic'),
    );

    foreach ($replacements as $rule) {
        $suffixes = $rule[0];
        if ($this->searchIfInR1($suffixes) === false) {
            continue;
        }

        // e.g. #(icate|iciti|ical)$#u
        $pattern = '#(' . implode('|', $suffixes) . ')$#u';
        $this->word = preg_replace($pattern, $rule[1], $this->word);

        return true;
    }

    // ful ness: delete
    $start = $this->searchIfInR1(array('ful', 'ness'));
    if ($start !== false) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // ative: delete if in R2
    $start = $this->searchIfInR1(array('ative'));
    if ($start !== false && $this->inR2($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    return false;
}
/**
 * Step 4
 *  - ement ance ence able ible ant ment ent ism ate iti ous ive ize al er ic:
 *    delete if in R2
 *  - ion: if found in R2, delete if preceded by s or t
 *
 * @return bool true when one of the suffixes matched (whether or not the word
 *              was changed), false otherwise
 */
private function step4()
{
    $start = $this->search(array(
        'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
        'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'
    ));
    if ($start !== false) {
        if ($this->inR2($start)) {
            $this->word = UTF8::substr($this->word, 0, $start);
        }

        return true;
    }

    // ion
    $start = $this->searchIfInR2(array('ion'));
    if ($start === false) {
        return false;
    }

    $preceding = UTF8::substr($this->word, $start - 1, 1);
    if ($preceding == 's' || $preceding == 't') {
        $this->word = UTF8::substr($this->word, 0, $start);
    }

    return true;
}
/**
 * Step 5.
 *
 * Handles a final "e" or a final "l":
 *  - e: deleted when in R2, or when in R1 and neither short-syllable probe
 *       (searchShortSyllabe(-4, 3), searchShortSyllabe(-3, 2)) succeeds;
 *  - l: deleted when in R2 and preceded by another l.
 *
 * @return boolean True when a final e, or a final l in R2, was found (even
 *                 if nothing was deleted), false otherwise.
 */
private function step5()
{
    $eAt = $this->search(array('e'));
    if ($eAt !== false) {
        $drop = $this->inR2($eAt);
        if (!$drop && $this->inR1($eAt)) {
            // in R1 only: keep the e when it follows a short syllable
            $drop = !($this->searchShortSyllabe(-4, 3) || $this->searchShortSyllabe(-3, 2));
        }
        if ($drop) {
            $this->word = UTF8::substr($this->word, 0, $eAt);
        }
        return true;
    }

    $lAt = $this->searchIfInR2(array('l'));
    if ($lAt === false) {
        return false;
    }
    // only a doubled l loses its last letter
    if (UTF8::substr($this->word, $lAt - 1, 1) == 'l') {
        $this->word = UTF8::substr($this->word, 0, $lAt);
    }
    return true;
}
/**
 * Final clean-up: every upper-case "Y" in the current word is turned back
 * into a lower-case "y".
 */
private function finish()
{
    $restored = UTF8::str_replace('Y', 'y', $this->word);
    $this->word = $restored;
}
/**
 * Overrides R1 for words starting with "gener", "commun" or "arsen":
 * R1 becomes the remainder of the word after that prefix. Words with any
 * other beginning are left untouched.
 */
private function exceptionR1()
{
    // prefix => offset at which R1 starts (the prefix length)
    $prefixes = array('gener' => 5, 'commun' => 6, 'arsen' => 5);
    foreach ($prefixes as $prefix => $r1Start) {
        if (Utf8::strpos($this->word, $prefix) === 0) {
            $this->r1 = UTF8::substr($this->word, $r1Start);
            $this->r1Index = $r1Start;
            return;
        }
    }
}
/**
 * Looks the current word up in the table of special cases:
 *  1/ words that have a fixed, hand-picked stem,
 *  2/ words that must be left invariant (they map to themselves).
 *
 * @return string|null The predefined stem, or null when the word is not a special case.
 */
private function exception1()
{
    $specialStems = array(
        'skis'   => 'ski',
        'skies'  => 'sky',
        'dying'  => 'die',
        'lying'  => 'lie',
        'tying'  => 'tie',
        'idly'   => 'idl',
        'gently' => 'gentl',
        'ugly'   => 'ugli',
        'early'  => 'earli',
        'only'   => 'onli',
        'singly' => 'singl',
    );
    $invariants = array('sky', 'news', 'howe', 'atlas', 'cosmos', 'bias', 'andes');

    // invariants stem to themselves
    $lookup = $specialStems + array_combine($invariants, $invariants);

    return isset($lookup[$this->word]) ? $lookup[$this->word] : null;
}
/**
 * Second exception list: words that must be left invariant
 * (per the original comment, this applies following step 1a).
 *
 * @return string|null The word itself when it is in the list, null otherwise.
 */
private function exception2()
{
    $invariants = array(
        'inning', 'outing', 'canning', 'herring', 'earring',
        'proceed', 'exceed', 'succeed',
    );

    return in_array($this->word, $invariants, true) ? $this->word : null;
}
/**
 * A word is called short if it ends in a short syllable and R1 is null.
 *
 * "R1 is null" is tested as "the current word length equals the R1 index",
 * i.e. nothing of the word is left inside R1.
 *
 * @return boolean
 */
private function isShort()
{
    if (UTF8::strlen($this->word) != $this->r1Index) {
        return false;
    }

    return $this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2);
}
/**
 * Tests for a short syllable starting at a given offset.
 *
 * A short syllable is either
 *  (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
 *  (b) a vowel at the beginning of the word followed by a non-vowel.
 *
 * So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
 * But uproot, bestow, disturb do not end with a short syllable.
 *
 * @param integer $from      Offset of the first letter to inspect; a negative value counts
 *                           from the end of the word and is clamped to 0.
 * @param integer $nbLetters 2 selects form (b), which is only considered when the window
 *                           starts at offset 0; any other value goes straight to form (a).
 *                           When the form (b) test fails, form (a) is still evaluated.
 *
 * @return boolean
 */
private function searchShortSyllabe($from, $nbLetters)
{
    $wordLength = UTF8::strlen($this->word);
    $start = ($from < 0) ? max(0, $wordLength + $from) : $from;

    $twoLetterForm = ($nbLetters == 2);

    // form (b) only exists at the very beginning of the word
    if ($twoLetterForm && $start != 0) {
        return false;
    }

    $firstIsVowel = in_array(UTF8::substr($this->word, $start, 1), self::$vowels);
    $secondIsVowel = in_array(UTF8::substr($this->word, $start + 1, 1), self::$vowels);

    // (b) vowel + non-vowel
    if ($twoLetterForm && $firstIsVowel && !$secondIsVowel) {
        return true;
    }

    // (a) non-vowel + vowel + non-vowel other than x, Y, w
    if ($firstIsVowel || !$secondIsVowel) {
        return false;
    }
    $excluded = array_merge(self::$vowels, array('x', 'Y', 'w'));

    return !in_array(UTF8::substr($this->word, $start + 2, 1), $excluded);
}
}

View File

@ -0,0 +1,444 @@
<?php
/**
* Finnish Snowball Stemmer.
*
* @author msaari <mikko@mikkosaari.fi>
*/
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
* Finnish Snowball Stemmer.
*
* @link http://snowball.tartarus.org/algorithms/finnish/stemmer.html
* @author msaari
*/
class Finnish extends Stem
{
/**
* All Finnish vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö');
protected static $consonants = array('b', 'c', 'd', 'f', 'g', 'h', 'j',
'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z');
protected static $restrictedVowels = array('a', 'e', 'i', 'o', 'u', 'ä', 'ö');
/**
* Long restricted vowels, ie. doubled vowels.
*/
protected static $longVowels = array('aa', 'ee', 'ii', 'oo', 'uu', 'ää', 'öö');
private $_removedInStep3 = false;
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes R1 and R2, then runs steps 1 to 6 in order.
 *
 * @param string $word Word to stem; must be valid UTF-8.
 *
 * @return string The stemmed word.
 *
 * @throws \Exception When $word is not valid UTF-8.
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (! UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }
    $this->word = Utf8::strtolower($word);

    // Per-word state: step 5 branches on this flag, so it must not carry
    // over from a previous call when the same instance is reused.
    $this->_removedInStep3 = false;

    // R1 and R2 are then defined in the usual way
    $this->r1();
    $this->r2();

    // Do each of steps 1, 2, 3, 4, 5 and 6.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->step6();

    return $this->word;
}
/**
 * Step 1: particles etc.
 *
 * Search for the longest among the following suffixes in R1, and perform
 * the action indicated. R1 and R2 are recomputed after every deletion.
 *
 * @return boolean True when one of the suffixes was found in R1 (whether or
 *                 not it was deleted), false otherwise.
 */
private function step1()
{
    // (a) kin kaan kään ko kö han hän pa pä
    // delete if preceded by n, t or a vowel
    if (($position = $this->searchIfInR1(array('kaan', 'kään', 'kin', 'han', 'hän', 'ko', 'kö', 'pa', 'pä'))) !== false) {
        $lastLetter = Utf8::substr($this->word, ($position-1), 1);
        if (in_array($lastLetter, array_merge(['t', 'n'], self::$vowels))) {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();
        }
        return true;
    }

    // (b) sti
    // delete if in R2
    if (($position = $this->searchIfInR1(array('sti'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();
        }
        return true;
    }

    // explicit boolean instead of an implicit null when nothing matched
    return false;
}
/**
 * Step 2: possessives.
 *
 * Search for the longest among the following suffixes in R1, and perform
 * the action indicated. R1 and R2 are recomputed after every deletion.
 * A suffix whose side condition fails is left in place and the remaining
 * rules are still tried.
 *
 * @return boolean True when a suffix was removed, false otherwise.
 */
private function step2()
{
    // si
    // delete if not preceded by k
    if (($position = $this->searchIfInR1(array('si'))) !== false) {
        $lastLetter = Utf8::substr($this->word, ($position-1), 1);
        if ($lastLetter !== 'k') {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();
            return true;
        }
    }

    // ni
    // delete
    if (($position = $this->searchIfInR1(array('ni'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);
        // if preceded by kse, replace with ksi
        if ($this->search(array('kse')) !== false) {
            $this->word = preg_replace('#(kse)$#u', 'ksi', $this->word);
        }
        $this->r1();
        $this->r2();
        return true;
    }

    // nsa nsä mme nne
    // delete
    if (($position = $this->searchIfInR1(array('nsa', 'nsä', 'mme', 'nne'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);
        $this->r1();
        $this->r2();
        return true;
    }

    // an
    // delete if preceded by one of ta ssa sta lla lta na
    if (($position = $this->searchIfInR1(array('an'))) !== false) {
        $word = Utf8::substr($this->word, 0, $position);
        $lastThreeLetters = Utf8::substr($word, -3, 3);
        $lastTwoLetters = Utf8::substr($word, -2, 2);
        if (in_array($lastThreeLetters, array('ssa', 'sta', 'lla', 'lta'), true) || in_array($lastTwoLetters, array('na', 'ta'), true)) {
            $this->word = $word;
            $this->r1();
            $this->r2();
            return true;
        }
    }

    // än
    // delete if preceded by one of tä ssä stä llä ltä nä
    if (($position = $this->searchIfInR1(array('än'))) !== false) {
        $word = Utf8::substr($this->word, 0, $position);
        $lastThreeLetters = Utf8::substr($word, -3, 3);
        $lastTwoLetters = Utf8::substr($word, -2, 2);
        if (in_array($lastThreeLetters, array('ssä', 'stä', 'llä', 'ltä'), true) || in_array($lastTwoLetters, array('nä', 'tä'), true)) {
            $this->word = $word;
            $this->r1();
            $this->r2();
            return true;
        }
    }

    // en
    // delete if preceded by one of lle ine
    if (($position = $this->searchIfInR1(array('en'))) !== false) {
        $word = Utf8::substr($this->word, 0, $position);
        // the three letters before "en" only exist in words of five letters or more
        if (Utf8::strlen($this->word) > 4) {
            $lastThreeLetters = Utf8::substr($this->word, -5, 3);
            if (in_array($lastThreeLetters, array('lle', 'ine'), true)) {
                $this->word = $word;
                $this->r1();
                $this->r2();
                return true;
            }
        }
    }

    // explicit boolean instead of an implicit null when nothing was removed
    return false;
}
/**
 * Step 3: cases
 *
 * Search for the longest among the following suffixes in R1, and perform
 * the action indicated. Every deletion sets the _removedInStep3 flag, which
 * step 5 reads, and recomputes R1 and R2.
 *
 * The rules are tried in the order written below; the order is significant
 * because later rules use suffixes that are endings of earlier ones.
 *
 * @return boolean True when a suffix was removed, or when an hXn suffix was
 *                 found (even if left in place); false otherwise.
 */
private function step3()
{
    // hXn
    // delete if preceded by X, where X is a V other than u (a/han, e/hen etc)
    foreach (self::$restrictedVowels as $vowel) {
        if ($vowel === 'u') {
            continue;
        }
        if (($position = $this->searchIfInR1(array('h' . $vowel . 'n'))) !== false) {
            $lastLetter = Utf8::substr($this->word, $position-1, 1);
            if ($lastLetter === $vowel) {
                $this->word = Utf8::substr($this->word, 0, $position);
                $this->_removedInStep3 = true;
                $this->r1();
                $this->r2();
            }
            // an hXn ending stops step 3 even when it is not deleted
            return true;
        }
    }

    // siin den tten
    // delete if preceded by Vi
    if (($position = $this->searchIfInR1(array('siin', 'den', 'tten'))) !== false) {
        $lastLetter = Utf8::substr($this->word, ($position-1), 1);
        if ($lastLetter === 'i') {
            $nextLastLetter = Utf8::substr($this->word, ($position-2), 1);
            if (in_array($nextLastLetter, self::$restrictedVowels, true)) {
                $this->word = Utf8::substr($this->word, 0, $position);
                $this->_removedInStep3 = true;
                $this->r1();
                $this->r2();
                return true;
            }
        }
    }

    // seen
    // delete if preceded by LV
    if (($position = $this->searchIfInR1(array('seen'))) !== false) {
        $lastLetters = Utf8::substr($this->word, ($position-2), 2);
        if (in_array($lastLetters, self::$longVowels, true)) {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->_removedInStep3 = true;
            $this->r1();
            $this->r2();
            return true;
        }
    }

    // tta ttä
    // delete if preceded by e
    if (($position = $this->searchIfInR1(array('tta', 'ttä'))) !== false) {
        $lastLetter = Utf8::substr($this->word, ($position-1), 1);
        if ($lastLetter === 'e') {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->_removedInStep3 = true;
            $this->r1();
            $this->r2();
            return true;
        }
    }

    // ta tä ssa ssä sta stä lla llä lta ltä lle na nä ksi ine
    // delete
    if (($position = $this->searchIfInR1(array('ssa', 'ssä', 'sta', 'stä', 'lla', 'llä', 'lta', 'ltä', 'lle', 'ksi', 'na', 'nä', 'ine', 'ta', 'tä'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);
        $this->_removedInStep3 = true;
        $this->r1();
        $this->r2();
        return true;
    }

    // a ä
    // delete if preceded by cv
    if (($position = $this->searchIfInR1(array('a', 'ä'))) !== false) {
        $lastLetter = Utf8::substr($this->word, ($position-1), 1);
        $nextLastLetter = Utf8::substr($this->word, ($position-2), 1);
        if (in_array($lastLetter, self::$vowels, true) && in_array($nextLastLetter, self::$consonants, true)) {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->_removedInStep3 = true;
            $this->r1();
            $this->r2();
            return true;
        }
    }

    // n
    // delete, and if preceded by LV or ie, delete the last vowel
    if (($position = $this->searchIfInR1(array('n'))) !== false) {
        $lastLetters = Utf8::substr($this->word, ($position-2), 2);
        if (in_array($lastLetters, self::$longVowels, true) || $lastLetters === 'ie') {
            $this->word = Utf8::substr($this->word, 0, $position-1);
        } else {
            $this->word = Utf8::substr($this->word, 0, $position);
        }
        $this->r1();
        $this->r2();
        $this->_removedInStep3 = true;
        return true;
    }

    // explicit boolean instead of an implicit null when nothing matched
    return false;
}
/**
 * Step 4: other endings
 *
 * Search for the longest among the following suffixes in R2, and perform
 * the action indicated. R1 and R2 are recomputed after a deletion.
 *
 * The four-letter forms (impi, immi, ...) end with the three-letter forms
 * (mpi, mmi, ...), so they are tested first; otherwise the shorter suffix
 * always wins and the leading "i" is left behind.
 *
 * @return boolean True when a suffix was removed, false otherwise.
 */
private function step4()
{
    // impi impa impä immi imma immä eja ejä
    // delete
    if (($position = $this->searchIfInR2(array('impi', 'impa', 'impä', 'immi', 'imma', 'immä', 'eja', 'ejä'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);
        $this->r1();
        $this->r2();
        return true;
    }

    // mpi mpa mpä mmi mma mmä
    // delete if not preceded by po
    if (($position = $this->searchIfInR2(array('mpi', 'mpa', 'mpä', 'mmi', 'mma', 'mmä'))) !== false) {
        $lastLetters = Utf8::substr($this->word, ($position-2), 2);
        if ($lastLetters !== 'po') {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();
            return true;
        }
    }

    return false;
}
/**
 * Step 5: plurals
 *
 * If an ending was removed in step 3, delete a final i or j if in R1;
 * otherwise, delete a final t in R1 if it follows a vowel, and, if a t is
 * removed, delete a final mma or imma in R2, unless the mma is preceded
 * by po. R1 and R2 are recomputed after every deletion.
 *
 * @return boolean True when something was removed, false otherwise.
 */
private function step5()
{
    if ($this->_removedInStep3) {
        if (($position = $this->searchIfInR1(array('i', 'j'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();
            return true;
        }
        return false;
    }

    if (($position = $this->searchIfInR1(array('t'))) === false) {
        return false;
    }
    $lastLetter = Utf8::substr($this->word, ($position-1), 1);
    if (!in_array($lastLetter, self::$vowels, true)) {
        return false;
    }

    // final t after a vowel: delete it
    $this->word = Utf8::substr($this->word, 0, $position);
    $this->r1();
    $this->r2();

    // imma is tested before mma so that the longer suffix wins
    if (($position2 = $this->searchIfInR2(array('imma'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position2);
        $this->r1();
        $this->r2();
    } elseif (($position2 = $this->searchIfInR2(array('mma'))) !== false) {
        $lastLetters = Utf8::substr($this->word, ($position2-2), 2);
        if ($lastLetters !== 'po') {
            $this->word = Utf8::substr($this->word, 0, $position2);
            $this->r1();
            $this->r2();
        }
    }

    // the t was removed, whether or not an (i)mma followed
    return true;
}
/**
 * Step 6: tidying up
 *
 * Runs sub-steps (a) to (d) one after the other, each looking at the end of
 * R1 and recomputing R1 and R2 after a change, then sub-step (e), which
 * looks at the whole word.
 */
private function step6()
{
    // a) R1 ends with a long (doubled) vowel: drop the last letter
    $longVowelAt = $this->searchIfInR1(self::$longVowels);
    if ($longVowelAt !== false) {
        $this->word = Utf8::substr($this->word, 0, $longVowelAt + 1);
        $this->r1();
        $this->r2();
    }

    // b) R1 ends cX, c a consonant and X one of a ä e i: drop the last letter
    $r1Last = Utf8::substr($this->r1, -1, 1);
    $r1Penultimate = Utf8::substr($this->r1, -2, 1);
    $endsConsonantVowel = in_array($r1Penultimate, self::$consonants, true)
        && in_array($r1Last, array('a', 'e', 'i', 'ä'));
    if ($endsConsonantVowel) {
        $this->word = Utf8::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    }

    // c) R1 ends oj or uj: drop the last letter
    $r1Tail = Utf8::substr($this->r1, -2, 2);
    if (in_array($r1Tail, array('oj', 'uj'))) {
        $this->word = Utf8::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    }

    // d) R1 ends jo: drop the last letter
    if (Utf8::substr($this->r1, -2, 2) === 'jo') {
        $this->word = Utf8::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    }

    // e) The word ends with a double consonant followed by zero or more
    // vowels: remove the last consonant (so eläkk -> eläk,
    // aatonaatto -> aatonaato)
    $trailingVowels = '';
    $index = Utf8::strlen($this->word) - 1;
    while ($index > 0) {
        $current = Utf8::substr($this->word, $index, 1);
        if (!in_array($current, self::$vowels, true)) {
            // last non-vowel found: undouble it if its left neighbour is the same letter
            if (Utf8::substr($this->word, $index - 1, 1) === $current) {
                $this->word = Utf8::substr($this->word, 0, $index) . $trailingVowels;
            }
            break;
        }
        $trailingVowels = $current . $trailingVowels;
        $index--;
    }
}
}

View File

@ -0,0 +1,533 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/french/stemmer.html
* @author wamania
*
*/
class French extends Stem
{
/**
* All french vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'â', 'à', 'ë', 'é', 'ê', 'è', 'ï', 'î', 'ô', 'û', 'ù');
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, marks vowel-like consonants (step 0), computes RV,
 * R1 and R2 and then chains steps 1 to 6 followed by finish().
 *
 * @throws \Exception When $word is not valid UTF-8.
 */
public function stem($word)
{
    // everything below works on UTF-8 text
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);
    $this->plainVowels = implode('', self::$vowels);

    $this->step0();
    $this->rv();
    $this->r1();
    $this->r2();

    // snapshot used to detect whether step 1, 2a or 2b altered the word
    $this->originalWord = $this->word;

    $nextStep = $this->step1();
    $untouchedByStep1 = ($this->originalWord == $this->word);

    // Step 2a runs when step 1 removed nothing, or when step 1 asked for it
    // (endings amment, emment, ment, ments); step 2b runs when 2a removed nothing.
    if (($nextStep == 2) || $untouchedByStep1) {
        if (!$this->step2a()) {
            $this->step2b();
        }
    }

    // step 3 when the word was altered so far, step 4 otherwise
    if ($this->word == $this->originalWord) {
        $this->step4();
    } else {
        $this->step3();
    }

    $this->step5();
    $this->step6();
    $this->finish();

    return $this->word;
}
/**
 * Step 0: mark letters that must not be treated as vowels.
 *
 * The word is assumed to be in lower case. u after q, y next to a vowel, and
 * u or i between two vowels are put into upper case. For example,
 * jouer -> joUer
 * ennuie -> ennuIe
 * yeux -> Yeux
 * quand -> qUand
 */
private function step0()
{
    $vowel = '[' . $this->plainVowels . ']';

    // applied one after the other, in this order
    $rules = array(
        '#([q])u#u'                               => '$1U',
        '#(' . $vowel . ')y#u'                    => '$1Y',
        '#y(' . $vowel . ')#u'                    => 'Y$1',
        '#(' . $vowel . ')u(' . $vowel . ')#u'    => '$1U$2',
        '#(' . $vowel . ')i(' . $vowel . ')#u'    => '$1I$2',
    );
    foreach ($rules as $pattern => $replacement) {
        $this->word = preg_replace($pattern, $replacement, $this->word);
    }
}
/**
 * Step 1: standard suffix removal.
 *
 * Search for the longest among the following suffixes, and perform the action indicated.
 * The suffix groups are tested in the order written below and the first group
 * that matches ends the step, whether or not the word was actually changed.
 * Inside each list, a longer suffix is placed before any shorter one it ends with.
 *
 * The return value tells stem() what to do next: 2 is returned for the
 * amment / emment / ment(s) groups and when no suffix matched at all
 * (stem() then runs step 2a), 3 for every other group.
 *
 * NOTE(review): the issement(s) and ment(s) tests compare with a loose "!= false",
 * so a suffix found at offset 0 is treated as "not found"; the other groups
 * use "!== false". Confirm whether this difference is intentional before changing it.
 *
 * @return integer Next step number
 */
private function step1()
{
    // ance iqUe isme able iste eux ances iqUes ismes ables istes
    // delete if in R2
    if ( ($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        return 3;
    }

    // atrice ateur ation atrices ateurs ations
    // delete if in R2
    // if preceded by ic, delete if in R2, else replace by iqU
    if ( ($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
            if ( ($position2 = $this->searchIfInR2(array('ic'))) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position2);
            } else {
                // no-op unless the word now ends in "ic" outside R2
                $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
            }
        }
        return 3;
    }

    // logie logies
    // replace with log if in R2
    if ( ($position = $this->search(array('logies', 'logie'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word);
        }
        return 3;
    }

    // usion ution usions utions
    // replace with u if in R2
    if ( ($position = $this->search(array('usions', 'utions', 'usion', 'ution'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word);
        }
        return 3;
    }

    // ence ences
    // replace with ent if in R2
    if ( ($position = $this->search(array('ences', 'ence'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word);
        }
        return 3;
    }

    // issement issements
    // delete if in R1 and preceded by a non-vowel
    if ( ($position = $this->search(array('issements', 'issement'))) != false) {
        if ($this->inR1($position)) {
            $before = $position - 1;
            $letter = UTF8::substr($this->word, $before, 1);
            if (! in_array($letter, self::$vowels)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }
        }
        return 3;
    }

    // ement ements
    // delete if in RV
    // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
    // if preceded by abl or iqU, delete if in R2, otherwise,
    // if preceded by ièr or Ièr, replace by i if in RV
    if ( ($position = $this->search(array('ements', 'ement'))) !== false) {
        // delete if in RV
        if ($this->inRv($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        // The follow-up tests look at the current end of the word, so they can
        // only match once the ement(s) ending has been removed above.
        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        if ( ($position = $this->searchIfInR2(array('iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);
            if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position2);
            }
        // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
        } elseif ( ($position = $this->search(array('eus'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            } elseif ($this->inR1($position)) {
                $this->word = preg_replace('#(eus)$#u', 'eux', $this->word);
            }
        // if preceded by abl or iqU, delete if in R2, otherwise,
        } elseif ( ($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);
        // if preceded by ièr or Ièr, replace by i if in RV
        } elseif ( ($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) {
            $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word);
        }
        return 3;
    }

    // ité ités
    // delete if in R2
    // if preceded by abil, delete if in R2, else replace by abl, otherwise,
    // if preceded by ic, delete if in R2, else replace by iqU, otherwise,
    // if preceded by iv, delete if in R2
    if ( ($position = $this->search(array('ités', 'ité'))) !== false) {
        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        // As above, the follow-up tests work on the current end of the word.
        // if preceded by abil, delete if in R2, else replace by abl, otherwise,
        if ( ($position = $this->search(array('abil'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            } else {
                $this->word = preg_replace('#(abil)$#u', 'abl', $this->word);
            }
        // if preceded by ic, delete if in R2, else replace by iqU, otherwise,
        } elseif ( ($position = $this->search(array('ic'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            } else {
                $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
            }
        // if preceded by iv, delete if in R2
        } elseif ( ($position = $this->searchIfInR2(array('iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        return 3;
    }

    // if ive ifs ives
    // delete if in R2
    // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU)
    if ( ($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        if ( ($position = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);
            if ( ($position2 = $this->search(array('ic'))) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                } else {
                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                }
            }
        }
        return 3;
    }

    // eaux
    // replace with eau
    if ( ($position = $this->search(array('eaux'))) !== false) {
        $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word);
        return 3;
    }

    // aux
    // replace with al if in R1
    if ( ($position = $this->search(array('aux'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(aux)$#u', 'al', $this->word);
        }
        return 3;
    }

    // euse euses
    // delete if in R2, else replace by eux if in R1
    if ( ($position = $this->search(array('euses', 'euse'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        } elseif ($this->inR1($position)) {
            $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word);
        }
        return 3;
    }

    // amment
    // replace with ant if in RV
    if ( ($position = $this->search(array('amment'))) !== false) {
        if ($this->inRv($position)) {
            $this->word = preg_replace('#(amment)$#u', 'ant', $this->word);
        }
        // 2: stem() runs step 2a after this ending
        return 2;
    }

    // emment
    // replace with ent if in RV
    if ( ($position = $this->search(array('emment'))) !== false) {
        if ($this->inRv($position)) {
            $this->word = preg_replace('#(emment)$#u', 'ent', $this->word);
        }
        return 2;
    }

    // ment ments
    // delete if preceded by a vowel in RV
    if ( ($position = $this->search(array('ments', 'ment'))) != false) {
        $before = $position - 1;
        $letter = UTF8::substr($this->word, $before, 1);
        if ( $this->inRv($before) && (in_array($letter, self::$vowels)) ) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        return 2;
    }

    // no standard suffix found
    return 2;
}
/**
 * Step 2a: Verb suffixes beginning i
 *
 * In steps 2a and 2b all tests are confined to the RV region.
 * Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel.
 * îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez iriez
 * irions irons iront is issaIent issais issait issant issante issantes issants isse
 * issent isses issez issiez issions issons it
 * (Note that the non-vowel itself must also be in RV.)
 *
 * @return boolean True when a suffix was deleted, false otherwise.
 */
private function step2a()
{
    // a longer suffix is listed before any shorter one it ends with
    $suffixes = array(
        'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez',
        'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants',
        'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i',
    );

    $start = $this->searchIfInRv($suffixes);
    if ($start === false) {
        return false;
    }

    $precedingIndex = $start - 1;
    $preceding = UTF8::substr($this->word, $precedingIndex, 1);

    // the preceding letter must lie in RV and must not be a vowel
    if (!$this->inRv($precedingIndex) || in_array($preceding, self::$vowels)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $start);

    return true;
}
/**
 * Step 2b: Other verb suffixes
 *
 * Do step 2b if step 2a was done, but failed to remove a suffix.
 * All tests are confined to the RV region; the longest matching suffix wins.
 *
 * "assiez" belongs to the second group but ends with "iez", which belongs to
 * the first group. It is therefore tested before the first group, so that
 * the longer suffix is the one that is removed.
 *
 * @return boolean True when one of the suffixes was found in RV, false otherwise.
 */
private function step2b()
{
    // longest-match guard: assiez must win over the iez / ez of the first group
    $position = $this->searchIfInRv(array('assiez'));

    if ($position === false) {
        // é ée ées és èrent er era erai eraIent erais erait eras erez eriez erions erons eront ez iez
        // delete
        $plain = $this->searchIfInRv(array(
            'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
            'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'));
        if ($plain !== false) {
            $this->word = UTF8::substr($this->word, 0, $plain);
            return true;
        }

        // âmes ât âtes a ai aIent ais ait ant ante antes ants as asse assent asses assiez assions
        $position = $this->searchIfInRv(array(
            'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
            'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'));
    }

    // second group: delete; if preceded by e (itself in RV), delete that e too
    if ($position !== false) {
        $before = $position - 1;
        $letter = UTF8::substr($this->word, $before, 1);
        if ( $this->inRv($before) && ($letter == 'e') ) {
            $this->word = UTF8::substr($this->word, 0, $before);
        } else {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        return true;
    }

    // ions
    // delete if in R2
    if ( ($position = $this->searchIfInRv(array('ions'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        return true;
    }

    return false;
}
/**
 * Step 3: Replace final Y with i or final ç with c
 *
 * Both replacements are applied, one after the other.
 */
private function step3()
{
    $this->word = preg_replace(
        array('#(Y)$#u', '#(ç)$#u'),
        array('i', 'c'),
        $this->word
    );
}
/**
 * Step 4: Residual suffix
 *
 * First drops a final s that is not preceded by a, i, o, u, è or s. The
 * remaining rules are confined to the RV region and the first that matches
 * ends the step.
 *
 * @return boolean True when one of the RV rules applied (for "ion": when the
 *                 suffix was found in RV and R2, even if not deleted); false
 *                 otherwise. The initial s removal does not affect the result.
 */
private function step4()
{
    // If the word ends s, not preceded by a, i, o, u, è or s, delete it.
    // The "u" modifier makes the multi-byte è a single member of the class.
    if (preg_match('#[^aiouès]s$#u', $this->word)) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    // In the rest of step 4, all tests are confined to the RV region.

    // ion
    // delete if in R2 and preceded by s or t
    if ( (($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position)) ) {
        $before = $position - 1;
        $letter = UTF8::substr($this->word, $before, 1);
        // the s or t must itself be in RV
        if ( $this->inRv($before) && (($letter == 's') || ($letter == 't')) ) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }
        return true;
    }

    // ier ière Ier Ière
    // replace with i
    if ( ($this->searchIfInRv(array('ier', 'ière', 'Ier', 'Ière'))) !== false) {
        $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);
        return true;
    }

    // e
    // delete
    if ( ($this->searchIfInRv(array('e'))) !== false) {
        $this->word = UTF8::substr($this->word, 0, -1);
        return true;
    }

    // ë
    // if preceded by gu, delete (the whole "guë" must be in RV)
    if ( ($position = $this->searchIfInRv(array('guë'))) !== false) {
        if ($this->inRv($position+2)) {
            $this->word = UTF8::substr($this->word, 0, -1);
            return true;
        }
    }

    return false;
}
/**
 * Step 5: Undouble
 *
 * If the word ends enn, onn, ett, ell or eill, delete the last letter.
 */
private function step5()
{
    $doubledEndings = array('enn', 'onn', 'ett', 'ell', 'eill');
    if ($this->search($doubledEndings) === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, -1);
}
/**
 * Step 6: Un-accent
 *
 * If the word ends é or è followed by at least one non-vowel, remove the accent from the e.
 */
private function step6()
{
    // accented e, then a run of non-vowels up to the end of the word
    $pattern = '#(é|è)([^' . $this->plainVowels . ']+)$#u';

    $this->word = preg_replace($pattern, 'e$2', $this->word);
}
/**
 * And finally:
 * Turn any remaining I, U and Y letters in the word back into lower case.
 */
private function finish()
{
    $markers = array('I','U','Y');
    $plainLetters = array('i', 'u', 'y');

    $this->word = UTF8::str_replace($markers, $plainLetters, $this->word);
}
/**
 * Computes the RV region (rv / rvIndex) of the current word.
 *
 * If the word begins with two vowels, RV is the region after the third letter,
 * otherwise the region after the first vowel not at the beginning of the word,
 * or the end of the word if these positions cannot be found.
 * (Exceptionally, par, col or tap, at the beginning of a word is also taken to
 * define RV as the region to their right.)
 *
 * @return boolean False when the word has three letters or more and no RV
 *                 start position was found (RV is then empty, at the end of
 *                 the word); true in every other case.
 */
protected function rv()
{
    $length = UTF8::strlen($this->word);

    // default: empty RV located at the end of the word
    $this->rv = '';
    $this->rvIndex = $length;

    if ($length < 3) {
        return true;
    }

    $startsWithTwoVowels = in_array(UTF8::substr($this->word, 0, 1), self::$vowels)
        && in_array(UTF8::substr($this->word, 1, 1), self::$vowels);
    $hasSpecialPrefix = in_array(UTF8::substr($this->word, 0, 3), array('par', 'col', 'tap'));

    // both cases put RV right after the third letter
    if ($startsWithTwoVowels || $hasSpecialPrefix) {
        $this->rv = UTF8::substr($this->word, 3);
        $this->rvIndex = 3;
        return true;
    }

    // otherwise: first vowel that is not the first letter
    $index = 1;
    while ($index < $length && !in_array(UTF8::substr($this->word, $index, 1), self::$vowels)) {
        $index++;
    }
    if ($index >= $length) {
        return false;
    }

    $this->rv = UTF8::substr($this->word, $index + 1);
    $this->rvIndex = $index + 1;

    return true;
}
}

View File

@ -0,0 +1,216 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/german/stemmer.html
* @author wamania
*
*/
class German extends Stem
{
/**
* All German vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü');
protected static $sEndings = array('b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r' ,'t');
protected static $stEndings = array('b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't');
/**
 * {@inheritdoc}
 *
 * Normalises the word (lower case, ß -> ss, u and y between vowels put into
 * upper case), computes R1 and R2, then runs steps 1 to 3 and finish().
 *
 * @throws \Exception When $word is not valid UTF-8.
 */
public function stem($word)
{
    // everything below works on UTF-8 text
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->plainVowels = implode('', self::$vowels);

    // lower case first, then replace ß by ss
    $this->word = UTF8::str_replace('ß', 'ss', UTF8::strtolower($word));

    // put y, then u, between vowels into upper case
    $vowel = '[' . $this->plainVowels . ']';
    foreach (array('y' => 'Y', 'u' => 'U') as $lower => $upper) {
        $this->word = preg_replace(
            '#(' . $vowel . ')' . $lower . '(' . $vowel . ')#u',
            '$1' . $upper . '$2',
            $this->word
        );
    }

    // R1 and R2 are first set up in the standard way
    $this->r1();
    $this->r2();

    // then R1 is moved so that the region before it contains at least 3 letters
    if ($this->r1Index < 3) {
        $this->r1 = UTF8::substr($this->word, 3);
        $this->r1Index = 3;
    }

    $this->step1();
    $this->step2();
    $this->step3();
    $this->finish();

    return $this->word;
}
/**
 * Step 1
 *
 * Three suffix groups are tried in order: (a) em ern er, (b) es en e,
 * (c) s. The first group whose suffix ends the word finishes the step; the
 * suffix is deleted only when it lies in R1 (and, for s, when it follows a
 * valid s-ending).
 *
 * @return boolean True when a suffix of one of the groups ends the word
 *                 (even if nothing was deleted), false otherwise.
 */
private function step1()
{
    // (a) em ern er: delete if in R1
    $at = $this->search(array('em', 'ern', 'er'));
    if ($at !== false) {
        if ($this->inR1($at)) {
            $this->word = UTF8::substr($this->word, 0, $at);
        }
        return true;
    }

    // (b) es en e: delete if in R1
    $at = $this->search(array('es', 'en', 'e'));
    if ($at !== false) {
        if ($this->inR1($at)) {
            $this->word = UTF8::substr($this->word, 0, $at);
            // a group (b) ending preceded by niss also loses the final s
            if ($this->search(array('niss')) !== false) {
                $this->word = UTF8::substr($this->word, 0, -1);
            }
        }
        return true;
    }

    // (c) s: delete if in R1 and preceded by a valid s-ending
    $at = $this->search(array('s'));
    if ($at === false) {
        return false;
    }
    if ($this->inR1($at) && in_array(UTF8::substr($this->word, $at - 1, 1), self::$sEndings)) {
        $this->word = UTF8::substr($this->word, 0, $at);
    }

    return true;
}
/**
 * Step 2: strip a second inflectional ending (en, er, est or st).
 *
 * @return bool true when a listed suffix ends the word, false otherwise
 */
private function step2()
{
    // en, er, est -- removed when the suffix lies in R1.
    $cut = $this->search(['en', 'er', 'est']);
    if ($cut !== false) {
        if ($this->inR1($cut)) {
            $this->word = UTF8::substr($this->word, 0, $cut);
        }

        return true;
    }

    // st -- removed when in R1 and preceded by a valid st-ending which
    // itself has at least three letters in front of it.
    $cut = $this->search(['st']);
    if ($cut === false) {
        return false;
    }

    $guard = $cut - 1;
    if ($this->inR1($cut)
        && $guard >= 3
        && in_array(UTF8::substr($this->word, $guard, 1), self::$stEndings)
    ) {
        $this->word = UTF8::substr($this->word, 0, $cut);
    }

    return true;
}
/**
 * Step 3: derivational ("d") suffixes.
 *
 * The suffix groups are tried in order. The first group with a suffix that
 * ends the word decides the outcome; later groups are not examined, even
 * when nothing ends up being deleted.
 *
 * Every "found" test uses a strict comparison with false, so a suffix that
 * starts at offset 0 still counts as found.
 *
 * @return bool true when a listed suffix ends the word, false otherwise
 */
private function step3()
{
    // end ung
    // delete if in R2
    // if preceded by ig, delete if in R2 and not preceded by e
    if (($position = $this->search(['end', 'ung'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->search(['ig'])) !== false) {
            $letter = UTF8::substr($this->word, $position2 - 1, 1);

            if ($this->inR2($position2) && ($letter != 'e')) {
                $this->word = UTF8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    // ig ik isch
    // delete if in R2 and not preceded by e
    if (($position = $this->search(['ig', 'ik', 'isch'])) !== false) {
        $letter = UTF8::substr($this->word, $position - 1, 1);

        if ($this->inR2($position) && ($letter != 'e')) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // lich heit
    // delete if in R2
    // if preceded by er or en, delete if in R1
    if (($position = $this->search(['lich', 'heit'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->search(['er', 'en'])) !== false) {
            if ($this->inR1($position2)) {
                $this->word = UTF8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    // keit
    // delete if in R2
    // if preceded by lich or ig, delete if in R2
    if (($position = $this->search(['keit'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->search(['lich', 'ig'])) !== false) {
            if ($this->inR2($position2)) {
                $this->word = UTF8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    return false;
}
/**
 * Final clean-up: lower the U and Y markers set by stem() and strip the
 * umlaut from ä, ü and ö.
 */
private function finish()
{
    $marked = ['U', 'Y', 'ä', 'ü', 'ö'];
    $plain = ['u', 'y', 'a', 'u', 'o'];

    $this->word = UTF8::str_replace($marked, $plain, $this->word);
}
}

View File

@ -0,0 +1,289 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/italian/stemmer.html
* @author wamania
*
*/
class Italian extends Stem
{
/**
 * Letters the Italian stemmer treats as vowels.
 */
protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'à', 'è', 'ì', 'ò', 'ù'];
/**
 * {@inheritdoc}
 */
public function stem($word)
{
    // Only UTF-8 input is accepted.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->plainVowels = implode('', self::$vowels);
    $vowelGroup = '([' . $this->plainVowels . '])';

    $this->word = UTF8::strtolower($word);

    // Acute accents are rewritten as grave accents.
    $acute = ['á', 'é', 'í', 'ó', 'ú'];
    $grave = ['à', 'è', 'ì', 'ò', 'ù'];
    $this->word = UTF8::str_replace($acute, $grave, $this->word);

    // Upper-case u after q, then u and i standing between two vowels;
    // finish() lowers them again.
    $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
    foreach (['u' => 'U', 'i' => 'I'] as $lower => $upper) {
        $this->word = preg_replace(
            '#' . $vowelGroup . $lower . $vowelGroup . '#u',
            '$1' . $upper . '$2',
            $this->word
        );
    }

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Step 2 only runs when step 1 left the word untouched.
    $beforeStep1 = $this->word;
    $this->step1();
    if ($beforeStep1 == $this->word) {
        $this->step2();
    }

    $this->step3a();
    $this->step3b();
    $this->finish();

    return $this->word;
}
/**
 * Step 0: attached pronouns.
 *
 * Finds a pronoun suffix at the end of the word. When the pronoun follows
 * ando/endo (checked in RV) it is deleted; when it follows ar/er/ir
 * (checked in RV) it is replaced by e.
 *
 * @return bool true when a pronoun suffix ends the word, false otherwise
 */
private function step0()
{
    $pronouns = [
        'gliela', 'gliele', 'glieli', 'glielo', 'gliene',
        'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela',
        'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene',
        'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci',
    ];

    $start = $this->search($pronouns);
    if ($start === false) {
        return false;
    }

    $pronoun = UTF8::substr($this->word, $start);

    // (a) gerund + pronoun: the pronoun is dropped.
    $afterGerund = ['ando' . $pronoun, 'endo' . $pronoun];
    if ($this->searchIfInRv($afterGerund) !== false) {
        $this->word = UTF8::substr($this->word, 0, $start);
    }

    // (b) infinitive stem + pronoun: the pronoun becomes e.
    $afterInfinitive = ['ar' . $pronoun, 'er' . $pronoun, 'ir' . $pronoun];
    if ($this->searchIfInRv($afterInfinitive) !== false) {
        $this->word = preg_replace('#('.$pronoun.')$#u', 'e', $this->word);
    }

    return true;
}
/**
 * Step 1: standard suffix removal.
 *
 * The suffix groups are tried in order. The first group with a suffix that
 * ends the word decides the outcome; later groups are not examined, even
 * when the region test fails and nothing is changed.
 *
 * Every "found" test uses a strict comparison with false, so an offset of
 * 0 is never mistaken for "not found".
 *
 * @return bool true when a listed suffix ends the word, false otherwise
 */
private function step1()
{
    // amente
    // delete if in R1
    // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    // if preceded by os, ic or abil, delete if in R2
    if (($position = $this->search(['amente'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['iv'])) !== false) {
            // iv in R2, optionally followed by the removal of a preceding at in R2
            $this->word = UTF8::substr($this->word, 0, $position2);

            if (($position3 = $this->searchIfInR2(['at'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }
        } elseif (($position4 = $this->searchIfInR2(['os', 'ic', 'abil'])) !== false) {
            // os, ic or abil in R2
            $this->word = UTF8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // delete if in R2
    if (($position = $this->search([
        'ibili', 'atrice', 'abili', 'abile', 'ibile', 'atrici', 'mente',
        'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'ante', 'anti',
        'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose',
    ])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // azione azioni atore atori
    // delete if in R2
    // if preceded by ic, delete if in R2
    if (($position = $this->search(['azione', 'azioni', 'atore', 'atori'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);

            if (($position2 = $this->search(['ic'])) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                }
            }
        }

        return true;
    }

    // logia logie
    // replace with log if in R2
    if (($position = $this->search(['logia', 'logie'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logia|logie)$#u', 'log', $this->word);
        }

        return true;
    }

    // uzione uzioni usione usioni
    // replace with u if in R2
    if (($position = $this->search(['uzione', 'uzioni', 'usione', 'usioni'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(uzione|uzioni|usione|usioni)$#u', 'u', $this->word);
        }

        return true;
    }

    // enza enze
    // replace with ente if in R2
    if (($position = $this->search(['enza', 'enze'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(enza|enze)$#u', 'ente', $this->word);
        }

        return true;
    }

    // amento amenti imento imenti
    // delete if in RV
    if (($position = $this->search(['amento', 'amenti', 'imento', 'imenti'])) !== false) {
        if ($this->inRv($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // ità
    // delete if in R2
    // if preceded by abil, ic or iv, delete if in R2
    if (($position = $this->search(['ità'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['abil', 'ic', 'iv'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // ivo ivi iva ive
    // delete if in R2
    // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2)
    if (($position = $this->search(['ivo', 'ivi', 'iva', 'ive'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['at'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);

            if (($position3 = $this->searchIfInR2(['ic'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }
        }

        return true;
    }

    return false;
}
/**
 * Step 2: verb suffixes.
 *
 * Looks for one of the verb endings below inside RV and deletes it when
 * found. Nothing is returned.
 */
private function step2()
{
    $verbEndings = [
        'assimo', 'assero', 'eranno', 'erebbero', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbero', 'irebbe', 'iremmo',
        'iscano', 'ireste', 'iresti', 'iscono', 'issero',
        'avamo', 'arono', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'ivamo', 'ivano', 'ivate', 'iremo', 'irete', 'irono',
        'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'irei',
        'isca', 'isce', 'isci', 'isco',
        'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva',
        'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir',
    ];

    $cut = $this->searchIfInRv($verbEndings);
    if ($cut === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $cut);
}
/**
 * Step 3a: delete a final a, e, i, o, à, è, ì or ò found in RV, and then a
 * preceding i if that is in RV too.
 *
 * @return bool true when a final vowel was removed, false otherwise
 */
private function step3a()
{
    $finalVowels = ['a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò'];

    if ($this->searchIfInRv($finalVowels) === false) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, -1);

    // A trailing i left behind is removed as well.
    if ($this->searchIfInRv(['i']) !== false) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
/**
 * Step 3b: replace a final ch (or gh) found in RV with c (or g),
 * e.g. crocch -> crocc. ch is checked first; gh only when ch is absent.
 */
private function step3b()
{
    foreach (['ch' => 'c', 'gh' => 'g'] as $digraph => $single) {
        if ($this->searchIfInRv([$digraph]) !== false) {
            $this->word = preg_replace('#(' . $digraph . ')$#u', $single, $this->word);
            break;
        }
    }
}
/**
 * Final clean-up: lower the I and U markers set by stem().
 */
private function finish()
{
    $markers = ['I', 'U'];
    $letters = ['i', 'u'];

    $this->word = UTF8::str_replace($markers, $letters, $this->word);
}
}

View File

@ -0,0 +1,130 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
* @author wamania
*
*/
class Norwegian extends Stem
{
/**
 * Letters the Norwegian stemmer treats as vowels.
 */
protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'æ', 'å', 'ø'];
/**
 * {@inheritdoc}
 */
public function stem($word)
{
    // Only UTF-8 input is accepted.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // Only R1 is needed here; it is computed first ...
    $this->r1();

    // ... and then moved right so that at least three letters precede it.
    if ($this->r1Index < 3) {
        $this->r1 = UTF8::substr($this->word, 3);
        $this->r1Index = 3;
    }

    // Run steps 1, 2 and 3 in turn.
    $this->step1();
    $this->step2();
    $this->step3();

    return $this->word;
}
/**
 * Tell whether the given text ends in a valid s-ending, i.e. one of
 * b c d f g h j l m n o p r t v y z,
 * or k not preceded by a vowel.
 *
 * @param string $word text whose last letter(s) are inspected
 * @return boolean
 */
private function hasValidSEnding($word)
{
    $last = UTF8::substr($word, -1, 1);

    $alwaysValid = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z'];
    if (in_array($last, $alwaysValid)) {
        return true;
    }

    if ($last != 'k') {
        return false;
    }

    // k only counts when the letter before it is not a vowel.
    return !in_array(UTF8::substr($word, -2, 1), self::$vowels);
}
/**
 * Step 1: look for one of the suffixes below in R1 and act on it.
 *
 * The groups are tried in order and the first one with a suffix found in
 * R1 decides the outcome. When no group matches the method returns false
 * explicitly rather than falling off the end.
 *
 * @return bool true when a listed suffix was found in R1, false otherwise
 */
private function step1()
{
    // erte ert
    // replace with er
    if (($position = $this->searchIfInR1(['erte', 'ert'])) !== false) {
        $this->word = preg_replace('#(erte|ert)$#u', 'er', $this->word);

        return true;
    }

    // a e ede ande ende ane ene hetene en heten ar er heter as es edes endes enes hetenes ens hetens ers ets et het ast
    // delete
    if (($position = $this->searchIfInR1([
        'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane',
        'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e',
    ])) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // s
    // delete if preceded by a valid s-ending
    if (($position = $this->searchIfInR1(['s'])) !== false) {
        $word = UTF8::substr($this->word, 0, $position);

        // Only commit the shortened word when what remains ends validly.
        if ($this->hasValidSEnding($word)) {
            $this->word = $word;
        }

        return true;
    }

    return false;
}
/**
 * Step 2: when the word ends in dt or vt inside R1, drop the final t.
 */
private function step2()
{
    if ($this->searchIfInR1(['dt', 'vt']) === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, -1);
}
/**
 * Step 3: look for one of the suffixes below in R1 and delete it when found.
 * Suffixes covered: leg eleg ig eig lig elig els lov elov slov hetslov.
 */
private function step3()
{
    $suffixes = [
        'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig',
    ];

    $cut = $this->searchIfInR1($suffixes);
    if ($cut === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $cut);
}
}

View File

@ -0,0 +1,283 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
* @author wamania
*
*/
class Portuguese extends Stem
{
/**
 * Letters the Portuguese stemmer treats as vowels.
 */
protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'â', 'ê', 'ô'];
/**
 * {@inheritdoc}
 */
public function stem($word)
{
    // Only UTF-8 input is accepted.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    // Lower-case, then spell the nasal vowels ã and õ as a~ and o~;
    // finish() restores them.
    $this->word = UTF8::str_replace(['ã', 'õ'], ['a~', 'o~'], UTF8::strtolower($word));

    $this->rv();
    $this->r1();
    $this->r2();

    $initial = $this->word;

    // Step 2 only runs when step 1 left the word untouched.
    $this->step1();
    if ($initial == $this->word) {
        $this->step2();
    }

    // Step 4 runs when neither step changed the word, step 3 otherwise.
    if ($initial == $this->word) {
        $this->step4();
    } else {
        $this->step3();
    }

    $this->step5();
    $this->finish();

    return $this->word;
}
/**
 * Step 1: standard suffix removal.
 *
 * The suffix groups are tried in order. The first group with a suffix that
 * ends the word decides the outcome; later groups are not examined, even
 * when the region test fails and nothing is changed.
 *
 * Every "found" test uses a strict comparison with false, so an offset of
 * 0 is never mistaken for "not found".
 *
 * @return bool true when a listed suffix ends the word, false otherwise
 */
private function step1()
{
    // delete if in R2
    if (($position = $this->search([
        'amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância',
        'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso',
        'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es', 'ante',
    ])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // logía logías
    // replace with log if in R2
    // NOTE(review): these two spellings, and ución/uciones below, look
    // Spanish rather than Portuguese -- confirm against the current Snowball
    // Portuguese definition before changing them.
    if (($position = $this->search(['logías', 'logía'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word);
        }

        return true;
    }

    // ución uciones
    // replace with u if in R2
    if (($position = $this->search(['uciones', 'ución'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word);
        }

        return true;
    }

    // ência ências
    // replace with ente if in R2
    if (($position = $this->search(['ências', 'ência'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(ências|ência)$#u', 'ente', $this->word);
        }

        return true;
    }

    // amente
    // delete if in R1
    // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    // if preceded by os, ic or ad, delete if in R2
    if (($position = $this->search(['amente'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['iv'])) !== false) {
            // iv in R2, optionally followed by the removal of a preceding at in R2
            $this->word = UTF8::substr($this->word, 0, $position2);

            if (($position3 = $this->searchIfInR2(['at'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }
        } elseif (($position4 = $this->searchIfInR2(['os', 'ic', 'ad'])) !== false) {
            // os, ic or ad in R2
            $this->word = UTF8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // mente
    // delete if in R2
    // if preceded by ante, avel or ível, delete if in R2
    if (($position = $this->search(['mente'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['ante', 'avel', 'ível'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // idade idades
    // delete if in R2
    // if preceded by abil, ic or iv, delete if in R2
    if (($position = $this->search(['idades', 'idade'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['abil', 'ic', 'iv'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // iva ivo ivas ivos
    // delete if in R2
    // if preceded by at, delete if in R2
    if (($position = $this->search(['ivas', 'ivos', 'iva', 'ivo'])) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(['at'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // ira iras
    // replace with ir if in RV and preceded by e
    if (($position = $this->search(['iras', 'ira'])) !== false) {
        if ($this->inRv($position)) {
            $letter = UTF8::substr($this->word, $position - 1, 1);

            if ($letter == 'e') {
                $this->word = preg_replace('#(iras|ira)$#u', 'ir', $this->word);
            }
        }

        return true;
    }

    return false;
}
/**
 * Step 2: verb suffixes.
 *
 * Looks for one of the verb endings below inside RV and deletes it when
 * found.
 *
 * @return bool true when an ending was removed, false otherwise
 */
private function step2()
{
    $verbEndings = [
        'aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos',
        'aríeis', 'eríeis', 'iríeis', 'ásseis', 'ésseis', 'ísseis', 'áramos', 'éramos', 'íramos', 'ávamos',
        'aremos', 'eremos', 'iremos',
        'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes',
        'asses', 'esses', 'isses', 'astes', 'estes', 'istes', 'áreis', 'areis', 'éreis', 'ereis', 'íreis', 'ireis',
        'áveis', 'íamos', 'armos', 'ermos', 'irmos',
        'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'adas', 'idas',
        'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'ara~o', 'era~o', 'ira~o',
        'arás', 'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires', 'íeis', 'ados', 'idos', 'ámos', 'amos',
        'emos', 'imos', 'iras',
        'ada', 'ida', 'ará', 'ara', 'erá', 'era', 'irá', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira',
        'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou',
    ];

    $cut = $this->searchIfInRv($verbEndings);
    if ($cut === false) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $cut);

    return true;
}
/**
 * Step 3: delete a final i found in RV when it is preceded by c.
 *
 * @return bool true when the word ends in i inside RV (deleted or not),
 *              false otherwise
 */
private function step3()
{
    if ($this->searchIfInRv(['i']) === false) {
        return false;
    }

    // The i only goes when the letter before it is c.
    if (UTF8::substr($this->word, -2, 1) == 'c') {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
/**
 * Step 4: delete a residual suffix (os a i o á í ó) found in RV.
 *
 * @return bool true when a suffix was removed, false otherwise
 */
private function step4()
{
    $residual = ['os', 'a', 'i', 'o', 'á', 'í', 'ó'];

    $cut = $this->searchIfInRv($residual);
    if ($cut === false) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $cut);

    return true;
}
/**
 * Step 5: delete a final e, é or ê found in RV; if the word then ends in
 * gu (or ci) with the u (or i) in RV, delete that letter as well.
 * Otherwise, when the word ends in ç, replace it with c.
 *
 * @return bool true when either rule applied, false otherwise
 */
private function step5()
{
    if ($this->searchIfInRv(['e', 'é', 'ê']) === false) {
        // No final e: fall back to the cedilla rule.
        if ($this->search(['ç']) === false) {
            return false;
        }

        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);

        return true;
    }

    $this->word = UTF8::substr($this->word, 0, -1);

    // gu / ci: drop the second letter when that letter lies in RV.
    $pair = $this->search(['gu', 'ci']);
    if ($pair !== false && $this->inRv($pair + 1)) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
/**
 * Final clean-up: turn the a~ and o~ spellings introduced by stem() back
 * into ã and õ.
 */
private function finish()
{
    $spelled = ['a~', 'o~'];
    $nasal = ['ã', 'õ'];

    $this->word = UTF8::str_replace($spelled, $nasal, $this->word);
}
}

View File

@ -0,0 +1,334 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/romanian/stemmer.html
* @author wamania
*
*/
class Romanian extends Stem
{
/**
 * Letters the Romanian stemmer treats as vowels.
 */
protected static $vowels = ['a', 'ă', 'â', 'e', 'i', 'î', 'o', 'u'];
/**
 * {@inheritdoc}
 */
public function stem($word)
{
    // Only UTF-8 input is accepted.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);
    $this->plainVowels = implode('', self::$vowels);
    $vowelGroup = '([' . $this->plainVowels . '])';

    // Upper-case u, then i, wherever the letter stands between two vowels;
    // finish() lowers them again.
    foreach (['u' => 'U', 'i' => 'I'] as $lower => $upper) {
        $this->word = preg_replace(
            '#' . $vowelGroup . $lower . $vowelGroup . '#u',
            '$1' . $upper . '$2',
            $this->word
        );
    }

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    $afterStep0 = $this->word;

    // Step 1 is repeated until it no longer changes the word.
    do {
        $previous = $this->word;
        $this->step1();
    } while ($this->word != $previous);

    $this->step2();

    // Step 3 only runs when steps 1 and 2 left the word untouched.
    if ($afterStep0 == $this->word) {
        $this->step3();
    }

    $this->step4();
    $this->finish();

    return $this->word;
}
/**
 * Step 0: removal of plurals (and other simplifications).
 *
 * The suffix groups are tried in order. The first group with a suffix that
 * ends the word decides the outcome: the action is applied when the suffix
 * lies in R1, and later groups are not examined either way.
 *
 * Every "found" test uses a strict comparison with false, so a suffix that
 * starts at offset 0 still counts as found.
 *
 * @return boolean true when a listed suffix ends the word, false otherwise
 */
private function step0()
{
    // ul ului
    // delete
    if (($position = $this->search(['ul', 'ului'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // aua
    // replace with a
    if (($position = $this->search(['aua'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(aua)$#u', 'a', $this->word);
        }

        return true;
    }

    // ea ele elor
    // replace with e
    if (($position = $this->search(['ea', 'ele', 'elor'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(ea|ele|elor)$#u', 'e', $this->word);
        }

        return true;
    }

    // ii iua iei iile iilor ilor
    // replace with i
    if (($position = $this->search(['ii', 'iua', 'iei', 'iile', 'iilor', 'ilor'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(ii|iua|iei|iile|iilor|ilor)$#u', 'i', $this->word);
        }

        return true;
    }

    // ile
    // replace with i if not preceded by ab
    if (($position = $this->search(['ile'])) !== false) {
        if ($this->inR1($position)) {
            $before = UTF8::substr($this->word, ($position - 2), 2);

            if ($before != 'ab') {
                $this->word = preg_replace('#(ile)$#u', 'i', $this->word);
            }
        }

        return true;
    }

    // atei
    // replace with at
    if (($position = $this->search(['atei'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(atei)$#u', 'at', $this->word);
        }

        return true;
    }

    // aţie aţia
    // replace with aţi
    if (($position = $this->search(['aţie', 'aţia'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(aţie|aţia)$#u', 'aţi', $this->word);
        }

        return true;
    }

    return false;
}
/**
 * Step 1: reduction of combining suffixes.
 *
 * The suffix groups are tried in order. The first group with a suffix that
 * ends the word decides the outcome: when the suffix lies in R1 it is
 * replaced by the group's short form. stem() calls this method repeatedly
 * until the word stops changing.
 *
 * The replacement is built from the offset returned by search() instead of
 * a second, hand-maintained regular expression, so the text that is cut
 * off is always exactly the suffix that was found.
 *
 * @return boolean true when a listed suffix ends the word, false otherwise
 */
private function step1()
{
    // abilitate abilitati abilităi abilităţi
    // replace with abil
    if (($position = $this->search(['abilitate', 'abilitati', 'abilităi', 'abilităţi'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position) . 'abil';
        }

        return true;
    }

    // ibilitate
    // replace with ibil
    if (($position = $this->search(['ibilitate'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position) . 'ibil';
        }

        return true;
    }

    // ivitate ivitati ivităi ivităţi
    // replace with iv
    if (($position = $this->search(['ivitate', 'ivitati', 'ivităi', 'ivităţi'])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position) . 'iv';
        }

        return true;
    }

    // icitate icitati icităi icităţi icator icatori iciv iciva icive icivi icivă ical icala icale icali icală
    // replace with ic
    if (($position = $this->search([
        'icitate', 'icitati', 'icităi', 'icităţi', 'icatori', 'icator', 'iciva',
        'icive', 'icivi', 'icivă', 'icala', 'icale', 'icali', 'icală', 'iciv', 'ical',
    ])) !== false) {
        if ($this->inR1($position)) {
            // Cutting at $position removes the whole of "icator" too,
            // e.g. indicator -> indic.
            $this->word = UTF8::substr($this->word, 0, $position) . 'ic';
        }

        return true;
    }

    // ativ ativa ative ativi ativă aţiune atoare ator atori ătoare ător ători
    // replace with at
    if (($position = $this->search([
        'ativa', 'ative', 'ativi', 'ativă', 'ativ', 'aţiune', 'atoare', 'atori', 'ătoare', 'ători', 'ător', 'ator',
    ])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position) . 'at';
        }

        return true;
    }

    // itiv itiva itive itivi itivă iţiune itoare itor itori
    // replace with it
    if (($position = $this->search([
        'itiva', 'itive', 'itivi', 'itivă', 'itiv', 'iţiune', 'itoare', 'itori', 'itor',
    ])) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position) . 'it';
        }

        return true;
    }

    return false;
}
/**
 * Step 2: removal of 'standard' suffixes.
 *
 * Three groups are tried in order; the first one with a suffix that ends
 * the word decides the outcome, and its action is applied when the suffix
 * lies in R2.
 *
 * @return boolean true when a listed suffix ends the word, false otherwise
 */
private function step2()
{
    // Plain deletion.
    $deletable = [
        'atori', 'itate', 'itati', 'ităţi', 'abila', 'abile', 'abili', 'abilă', 'ibila', 'ibile', 'ibili', 'ibilă',
        'anta', 'ante', 'anti', 'antă', 'ator', 'ibil', 'oasa', 'oasă', 'oase', 'ităi', 'abil',
        'osi', 'oşi', 'ant', 'ici', 'ică', 'iva', 'ive', 'ivi', 'ivă', 'ata', 'ată', 'ati', 'ate', 'ata', 'ată',
        'ati', 'ate', 'uta', 'ută', 'uti', 'ute', 'ita', 'ită', 'iti', 'ite', 'ica', 'ice',
        'at', 'os', 'iv', 'ut', 'it', 'ic',
    ];
    $cut = $this->search($deletable);
    if ($cut !== false) {
        if ($this->inR2($cut)) {
            $this->word = UTF8::substr($this->word, 0, $cut);
        }

        return true;
    }

    // iune iuni: deleted when preceded by ţ, which then becomes t.
    $cut = $this->search(['iune', 'iuni']);
    if ($cut !== false) {
        if ($this->inR2($cut) && UTF8::substr($this->word, $cut - 1, 1) == 'ţ') {
            $this->word = UTF8::substr($this->word, 0, $cut);
            $this->word = preg_replace('#(ţ)$#u', 't', $this->word);
        }

        return true;
    }

    // ism isme ist ista iste isti istă işti: normalised to ist.
    $cut = $this->search(['isme', 'ism', 'ista', 'iste', 'isti', 'istă', 'işti', 'ist']);
    if ($cut === false) {
        return false;
    }

    if ($this->inR2($cut)) {
        $this->word = preg_replace('#(isme|ism|ista|iste|isti|istă|işti|ist)$#u', 'ist', $this->word);
    }

    return true;
}
/**
 * Step 3: removal of verb suffixes.
 *
 * stem() runs this step only when steps 1 and 2 left the word unchanged.
 * Two suffix lists are tried in order inside RV; the first list with a
 * match decides the outcome. When neither list matches, false is returned
 * explicitly rather than falling off the end of the method.
 *
 * @return boolean true when a listed suffix was found in RV, false otherwise
 */
private function step3()
{
    // are ere ire âre ind ând indu ându eze ească ez ezi ează esc eşti
    // eşte ăsc ăşti ăşte am ai au eam eai ea eaţi eau iam iai ia iaţi
    // iau ui aşi arăm arăţi ară uşi urăm urăţi ură işi irăm irăţi iră âi
    // âşi ârăm ârăţi âră asem aseşi ase aserăm aserăţi aseră isem iseşi ise
    // iserăm iserăţi iseră âsem âseşi âse âserăm âserăţi âseră usem useşi use userăm userăţi useră
    // delete if preceded in RV by a consonant or u
    if (($position = $this->searchIfInRv([
        'userăţi', 'iserăţi', 'âserăţi', 'aserăţi',
        'userăm', 'iserăm', 'âserăm', 'aserăm',
        'iseră', 'âseşi', 'useră', 'âseră', 'useşi', 'iseşi', 'aseră', 'aseşi', 'ârăţi', 'irăţi', 'urăţi', 'arăţi', 'ească',
        'usem', 'âsem', 'isem', 'asem', 'ârăm', 'urăm', 'irăm', 'arăm', 'iaţi', 'eaţi', 'ăşte', 'ăşti', 'eşte', 'eşti', 'ează', 'ându', 'indu',
        'âse', 'use', 'ise', 'ase', 'âră', 'iră', 'işi', 'ură', 'uşi', 'ară', 'aşi', 'âşi', 'iau', 'iai', 'iam', 'eau', 'eai', 'eam', 'ăsc',
        'are', 'ere', 'ire', 'âre', 'ind', 'ând', 'eze', 'ezi', 'esc',
        'âi', 'ui', 'ia', 'ea', 'au', 'ai', 'am', 'ez',
    ])) !== false) {
        if ($this->inRv($position)) {
            $before = $position - 1;

            // The letter in front of the suffix must itself lie in RV.
            if ($this->inRv($before)) {
                $letter = UTF8::substr($this->word, $before, 1);

                if ((!in_array($letter, self::$vowels)) || ($letter == 'u')) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                }
            }
        }

        return true;
    }

    // ăm aţi em eţi im iţi âm âţi seşi serăm serăţi seră sei se sesem seseşi sese seserăm seserăţi seseră
    // delete
    if (($position = $this->searchIfInRv([
        'seserăm', 'seserăţi', 'seseră', 'seseşi', 'sesem', 'serăţi', 'serăm', 'seşi', 'sese', 'seră',
        'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im',
    ])) !== false) {
        if ($this->inRv($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    return false;
}
/**
 * Step 4: removal of a final vowel.
 *
 * Looks for one of the suffixes a, ie, e, i, ă at the end of the word and
 * deletes it when it lies in RV.
 *
 * @return bool always true
 */
private function step4()
{
    $cut = $this->search(['a', 'ie', 'e', 'i', 'ă']);

    if ($cut !== false && $this->inRv($cut)) {
        $this->word = UTF8::substr($this->word, 0, $cut);
    }

    return true;
}
/**
 * Final clean-up: lower the I and U markers set by stem().
 */
private function finish()
{
    $markers = ['I', 'U'];
    $letters = ['i', 'u'];

    $this->word = UTF8::str_replace($markers, $letters, $this->word);
}
}

View File

@ -0,0 +1,252 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/russian/stemmer.html
* @author wamania
*
*/
class Russian extends Stem
{
/**
 * Letters the Russian stemmer treats as vowels.
 */
protected static $vowels = ['а', 'е', 'и', 'о', 'у', 'ы', 'э', 'ю', 'я'];

/**
 * PERFECTIVE GERUND endings, kept as two separate groups.
 */
protected static $perfectiveGerund = [
    ['вшись', 'вши', 'в'],
    ['ывшись', 'ившись', 'ывши', 'ивши', 'ив', 'ыв'],
];

/**
 * ADJECTIVE endings.
 */
protected static $adjective = [
    'ыми', 'ими', 'ему', 'ому', 'его', 'ого', 'ее', 'ие', 'ые', 'ое', 'ей', 'ий',
    'ый', 'ой', 'ем', 'им', 'ым', 'ом', 'их', 'ых', 'ую', 'юю', 'ая', 'яя', 'ою', 'ею',
];

/**
 * PARTICIPLE endings, kept as two separate groups.
 */
protected static $participle = [
    ['ем', 'нн', 'вш', 'ющ', 'щ'],
    ['ивш', 'ывш', 'ующ'],
];

/**
 * REFLEXIVE endings.
 */
protected static $reflexive = ['ся', 'сь'];

/**
 * VERB endings, kept as two separate groups.
 */
protected static $verb = [
    ['ешь', 'нно', 'ете', 'йте', 'ла', 'на', 'ли', 'й', 'л', 'ем', 'н', 'ло', 'но', 'ет', 'ют', 'ны', 'ть'],
    [
        'уйте', 'ило', 'ыло', 'ено', 'ила', 'ыла', 'ена', 'ейте', 'ены', 'ить', 'ыть', 'ишь', 'ите', 'или', 'ыли',
        'ует', 'уют', 'ей', 'уй', 'ил', 'ыл', 'им', 'ым', 'ен', 'ят', 'ит', 'ыт', 'ую', 'ю',
    ],
];

/**
 * NOUN endings.
 */
protected static $noun = [
    'иями', 'ями', 'ами', 'ией', 'иям', 'ием', 'иях', 'ев', 'ов', 'ие', 'ье', 'еи', 'ии', 'ей', 'ой', 'ий', 'ям',
    'ем', 'ам', 'ом', 'ах', 'ях', 'ию', 'ью', 'ия', 'ья', 'я', 'а', 'е', 'ы', 'ь', 'и', 'о', 'у', 'й', 'ю',
];

/**
 * SUPERLATIVE endings.
 */
protected static $superlative = ['ейше', 'ейш'];

/**
 * DERIVATIONAL endings.
 */
protected static $derivational = ['ость', 'ост'];
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes the R1, R2 and RV regions and then runs
 * steps 1 to 4 in order.
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 text only.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // Regions: R1 and R2 (R2 is consulted by step 3) and the Russian-specific RV.
    $this->r1();
    $this->r2();
    $this->rv();

    // The four steps are always executed, one after the other.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();

    return $this->word;
}
/**
 * Step 1.
 *
 * Tries a PERFECTIVE GERUND ending first; if one is removed the step is over.
 * Otherwise an optional REFLEXIVE ending is removed, after which the first
 * matching ending among (1) ADJECTIVAL, (2) VERB and (3) NOUN is removed.
 * "Group 1" endings are only accepted when checkGroup1() approves the
 * preceding letter.
 *
 * @return bool true when an ending of one of the terminating classes was removed
 */
private function step1()
{
    // PERFECTIVE GERUND, group 1 (needs the group-1 check on the preceding letter)
    $start = $this->searchIfInRv(self::$perfectiveGerund[0]);
    if ($start !== false && $this->inRv($start) && $this->checkGroup1($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // PERFECTIVE GERUND, group 2
    $start = $this->searchIfInRv(self::$perfectiveGerund[1]);
    if ($start !== false && $this->inRv($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // REFLEXIVE ending: removed when present, the step then carries on
    $start = $this->searchIfInRv(self::$reflexive);
    if ($start !== false && $this->inRv($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);
    }

    // (1) ADJECTIVAL: an adjective ending, optionally preceded by a participle ending
    $start = $this->searchIfInRv(self::$adjective);
    if ($start !== false && $this->inRv($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        // participle, group 1
        $participleStart = $this->search(self::$participle[0]);
        if ($participleStart !== false && $this->inRv($participleStart) && $this->checkGroup1($participleStart)) {
            $this->word = UTF8::substr($this->word, 0, $participleStart);

            return true;
        }

        // participle, group 2
        $participleStart = $this->search(self::$participle[1]);
        if ($participleStart !== false && $this->inRv($participleStart)) {
            $this->word = UTF8::substr($this->word, 0, $participleStart);

            return true;
        }

        return true;
    }

    // (2) VERB, group 1
    $start = $this->searchIfInRv(self::$verb[0]);
    if ($start !== false && $this->inRv($start) && $this->checkGroup1($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // (2) VERB, group 2
    $start = $this->searchIfInRv(self::$verb[1]);
    if ($start !== false && $this->inRv($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // (3) NOUN
    $start = $this->searchIfInRv(self::$noun);
    if ($start !== false && $this->inRv($start)) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    return false;
}
/**
 * Step 2: drop a trailing и (i) when it lies in RV.
 *
 * @return bool true when the letter was removed, false otherwise
 */
private function step2()
{
    $start = $this->searchIfInRv(array('и'));

    if ($start === false || !$this->inRv($start)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $start);

    return true;
}
/**
 * Step 3: remove a DERIVATIONAL ending (ость, ост) when the entire ending
 * lies in R2.
 *
 * @return bool true when an ending was removed, false otherwise
 */
private function step3()
{
    $position = $this->searchIfInRv(self::$derivational);

    if ($position !== false && $this->inR2($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // Explicit "nothing removed" result, matching steps 1 and 2
    // (this method used to fall off the end and return null).
    return false;
}
/**
 * Step 4: exactly one of the following alternatives is applied:
 *  (1) undouble н (n), or
 *  (2) if the word ends with a SUPERLATIVE ending, remove it and undouble н (n), or
 *  (3) if the word ends with ь (soft sign), remove it.
 *
 * All endings are looked up in RV.
 *
 * @return bool true when the word was modified, false otherwise
 */
private function step4()
{
    // (2) SUPERLATIVE ending: remove it, then only undouble a trailing нн.
    // The soft-sign rule (3) is a separate alternative and must not run after this one.
    if (($position = $this->searchIfInRv(self::$superlative)) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        if (($position = $this->searchIfInRv(array('нн'))) !== false) {
            // keep the first н, drop the second
            $this->word = UTF8::substr($this->word, 0, $position + 1);
        }

        return true;
    }

    // (1) Undouble н (n): keep the first н, drop the second
    if (($position = $this->searchIfInRv(array('нн'))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position + 1);

        return true;
    }

    // (3) Trailing ь (soft sign)
    if (($position = $this->searchIfInRv(array('ь'))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    return false;
}
/**
 * Computes RV for Russian: the region after the first vowel of the word,
 * or the (empty) end of the word when it contains no vowel.
 *
 * Stores the result in $this->rv / $this->rvIndex.
 *
 * @return bool true when a vowel was found, false otherwise
 */
protected function rv()
{
    $length = UTF8::strlen($this->word);

    // Defaults: empty region located at the end of the word.
    $this->rv = '';
    $this->rvIndex = $length;

    $i = 0;
    while ($i < $length) {
        if (in_array(UTF8::substr($this->word, $i, 1), self::$vowels)) {
            $this->rv = UTF8::substr($this->word, $i + 1);
            $this->rvIndex = $i + 1;

            return true;
        }
        $i++;
    }

    return false;
}
/**
 * Group 1 endings must follow а (a) or я (ia), and that letter must itself
 * be inside RV.
 *
 * @param int $position offset at which the candidate ending starts
 * @return bool true when the letter just before $position is а or я and lies in RV
 */
private function checkGroup1($position)
{
    $previous = $position - 1;

    if (!$this->inRv($previous)) {
        return false;
    }

    return in_array(UTF8::substr($this->word, $previous, 1), array('а', 'я'));
}
}

View File

@ -0,0 +1,348 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/spanish/stemmer.html
* @author wamania
*
*/
class Spanish extends Stem
{
/**
* All spanish vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü');
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes RV, R1 and R2, then runs step 0, step 1,
 * steps 2a/2b (only when the previous one left the word untouched), step 3
 * and the final accent removal.
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0; used to detect whether steps 1 / 2a removed anything.
    $beforeSuffixRemoval = $this->word;

    $this->step1();

    // Do step 2a if no ending was removed by step 1.
    // Strict comparison: both sides are strings and must match exactly
    // (loose == would apply numeric-string juggling).
    if ($this->word === $beforeSuffixRemoval) {
        $this->step2a();

        // Do step 2b if step 2a was done, but failed to remove a suffix.
        if ($this->word === $beforeSuffixRemoval) {
            $this->step2b();
        }
    }

    $this->step3();
    $this->finish();

    return $this->word;
}
/**
 * Step 0: Attached pronoun
 *
 * Search for one of the following suffixes in RV
 *      me se sela selo selas selos la le lo las les los nos
 *
 * and delete it, if it comes after one of
 *      (a) iéndo ándo ár ér ír
 *      (b) ando iendo ar er ir
 *      (c) yendo following u
 *
 * in RV. In the case of (c), yendo must lie in RV, but the preceding u can be outside it.
 * In the case of (a), deletion is followed by removing the acute accent
 * (for example, haciéndola -> haciendo).
 *
 * @return bool true when a pronoun was removed, false otherwise
 */
private function step0()
{
    $pronouns = array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo');

    $position = $this->searchIfInRv($pronouns);
    if ($position === false) {
        return false;
    }

    $suffixe = UTF8::substr($this->word, $position);

    // Appends the pronoun that was found to each candidate verb ending.
    $withPronoun = function ($item) use ($suffixe) {
        return $item . $suffixe;
    };

    // (a) accented endings: strip the accent from the ending, then drop the pronoun
    $a = array_map($withPronoun, array('iéndo', 'ándo', 'ár', 'ér', 'ír'));
    if (($position2 = $this->searchIfInRv($a)) !== false) {
        $suffixe2 = UTF8::substr($this->word, $position2);
        $suffixe2 = UTF8::to_utf8(UTF8::to_ascii($suffixe2)); // unaccent
        $this->word = UTF8::substr($this->word, 0, $position2) . $suffixe2;
        // $position is still valid: unaccenting does not change the number of characters before it
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // (b) unaccented endings: just drop the pronoun
    $b = array_map($withPronoun, array('iendo', 'ando', 'ar', 'er', 'ir'));
    if ($this->searchIfInRv($b) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // (c) yendo, only when preceded by u (the u itself may be outside RV).
    // The "> 0" guard makes sure there is a preceding letter to inspect.
    $position2 = $this->searchIfInRv(array('yendo' . $suffixe));
    if ($position2 !== false && $position2 > 0) {
        $before = UTF8::substr($this->word, $position2 - 1, 1);
        if ($before === 'u') {
            $this->word = UTF8::substr($this->word, 0, $position);

            return true;
        }
    }

    return false;
}
/**
 * Step 1: standard suffix removal.
 *
 * The suffix groups below are tried in order; the first group with a suffix
 * at the end of the word decides the outcome and ends the step, even when
 * the region test prevents the actual deletion.
 *
 * Positions returned by search() are compared with "!== false" throughout,
 * because a suffix starting at offset 0 is a valid match and must not be
 * mistaken for "not found".
 *
 * @return bool true when one of the suffix groups matched, false otherwise
 */
private function step1()
{
    // anza anzas ico ica icos icas ismo ismos able ables ible ibles ista
    // istas oso osa osos osas amiento amientos imiento imientos
    //      delete if in R2
    if (($position = $this->search(array(
        'imientos', 'imiento', 'amientos', 'amiento', 'osas', 'osos', 'osa', 'oso', 'istas', 'ista', 'ibles',
        'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // adora ador ación adoras adores aciones ante antes ancia ancias
    //      delete if in R2
    //      if preceded by ic, delete if in R2
    if (($position = $this->search(array(
        'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('ic'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // logía logías
    //      replace with log if in R2
    if (($position = $this->search(array('logías', 'logía'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word);
        }

        return true;
    }

    // ución uciones
    //      replace with u if in R2
    if (($position = $this->search(array('uciones', 'ución'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word);
        }

        return true;
    }

    // encia encias
    //      replace with ente if in R2
    if (($position = $this->search(array('encias', 'encia'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(encias|encia)$#u', 'ente', $this->word);
        }

        return true;
    }

    // amente
    //      delete if in R1
    //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    //      if preceded by os, ic or ad, delete if in R2
    if (($position = $this->search(array('amente'))) !== false) {
        // delete if in R1
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('iv'))) !== false) {
            // preceded by iv: delete it, then a further preceding at
            $this->word = UTF8::substr($this->word, 0, $position2);
            if (($position3 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }
        } elseif (($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) {
            // otherwise, preceded by os, ic or ad: delete if in R2
            $this->word = UTF8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // mente
    //      delete if in R2
    //      if preceded by ante, able or ible, delete if in R2
    if (($position = $this->search(array('mente'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('ante', 'able', 'ible'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // idad idades
    //      delete if in R2
    //      if preceded by abil, ic or iv, delete if in R2
    if (($position = $this->search(array('idades', 'idad'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // iva ivo ivas ivos
    //      delete if in R2
    //      if preceded by at, delete if in R2
    if (($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    return false;
}
/**
 * Step 2a: Verb suffixes beginning y
 *
 * Searches RV for one of the y-suffixes and deletes it when it is preceded
 * by u. (Note that the preceding u need not be in RV.)
 *
 * @return bool true when a suffix was removed, false otherwise
 */
private function step2a()
{
    $position = $this->searchIfInRv(array(
        'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'));

    // Strict check for "found"; "> 0" guarantees there is a preceding letter to inspect.
    if ($position === false || $position <= 0) {
        return false;
    }

    if (UTF8::substr($this->word, $position - 1, 1) === 'u') {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    return false;
}
/**
 * Step 2b: Other verb suffixes
 *
 * Searches RV for a suffix of the first list and deletes it; failing that,
 * searches RV for en / es / éis / emos, deletes it and, when the remaining
 * word ends in gu, also deletes the u.
 *
 * NOTE(review): the Snowball reference list for this step also appears to
 * contain "ieses", "abais", "arais", "aseis" and "íamos", which are absent
 * below — confirm whether the omission is intentional before adding them.
 *
 * @return bool true when a suffix was removed, false otherwise
 */
private function step2b()
{
    // delete
    if (($position = $this->searchIfInRv(array(
        'iésemos', 'iéramos', 'ábamos', 'iríamos', 'eríamos', 'aríamos', 'áramos', 'ásemos', 'eríais',
        'aremos', 'eremos', 'iremos', 'asteis', 'ieseis', 'ierais', 'isteis', 'aríais',
        'irían', 'aréis', 'erían', 'erías', 'eréis', 'iréis', 'irías', 'ieran', 'iesen', 'ieron', 'iendo', 'ieras',
        'iríais', 'arían', 'arías',
        'amos', 'imos', 'ados', 'idos', 'irán', 'irás', 'erán', 'erás', 'ería', 'iría', 'íais', 'arán', 'arás', 'aría',
        'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'ando', 'abas', 'adas', 'idas', 'ases', 'aras',
        'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará',
        'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an'
    ))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // en es éis emos
    //      delete, and if preceded by gu delete the u (the gu need not be in RV)
    if (($position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        if (($position2 = $this->search(array('gu'))) !== false) {
            // keep the g, drop the u
            $this->word = UTF8::substr($this->word, 0, $position2 + 1);
        }

        return true;
    }

    // Explicit "nothing removed" result (this method used to return null here).
    return false;
}
/**
 * Step 3: residual suffix
 *
 * Searches RV for one of "os a o á í ó" and deletes it; failing that,
 * searches RV for "e" / "é", deletes it and, when the remaining word ends in
 * gu with the u in RV, also deletes the u.
 *
 * @return bool true when a suffix was removed, false otherwise
 */
private function step3()
{
    // os a o á í ó
    //      delete if in RV
    if (($position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // e é
    //      delete if in RV, and if preceded by gu with the u in RV delete the u
    if (($position = $this->searchIfInRv(array('e', 'é'))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        $position2 = $this->searchIfInRv(array('u'));
        // "> 0" guarantees there is a letter before the u to inspect.
        if ($position2 !== false && $position2 > 0
            && UTF8::substr($this->word, $position2 - 1, 1) === 'g') {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        // The e / é was removed in any case, so report a modification.
        return true;
    }

    return false;
}
/**
 * And finally: replace the acute-accented vowels á, í, ó, é, ú by their
 * unaccented counterparts.
 */
private function finish()
{
    $unaccented = array('á' => 'a', 'í' => 'i', 'ó' => 'o', 'é' => 'e', 'ú' => 'u');

    $this->word = UTF8::str_replace(array_keys($unaccented), array_values($unaccented), $this->word);
}
}

View File

@ -0,0 +1,221 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
abstract class Stem implements Stemmer
{
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
/**
* helper, contains stringified list of vowels
* @var string
*/
protected $plainVowels;
/**
* The word we are stemming
* @var string
*/
protected $word;
/**
* The original word, used to check whether the word has been modified
* @var string
*/
protected $originalWord;
/**
* RV value
* @var string
*/
protected $rv;
/**
* RV index (based on the beginning of the word)
* @var integer
*/
protected $rvIndex;
/**
* R1 value (the part of the word that forms region R1)
* @var string
*/
protected $r1;
/**
* R1 index (based on the beginning of the word)
* @var int
*/
protected $r1Index;
/**
* R2 value (the part of the word that forms region R2)
* @var string
*/
protected $r2;
/**
* R2 index (based on the beginning of the word)
* @var int
*/
protected $r2Index;
/**
 * Tells whether an offset lies inside RV.
 *
 * @param int $position offset counted from the beginning of the word
 * @return bool true when $position is at or after the start of RV
 */
protected function inRv($position)
{
    return $this->rvIndex <= $position;
}
/**
 * Tells whether an offset lies inside R1.
 *
 * @param int $position offset counted from the beginning of the word
 * @return bool true when $position is at or after the start of R1
 */
protected function inR1($position)
{
    return $this->r1Index <= $position;
}
/**
 * Tells whether an offset lies inside R2.
 *
 * @param int $position offset counted from the beginning of the word
 * @return bool true when $position is at or after the start of R2
 */
protected function inR2($position)
{
    return $this->r2Index <= $position;
}
/**
 * Looks for one of the suffixes at the end of the word, restricted to RV.
 *
 * @param array $suffixes candidate suffixes, tried in the given order
 * @return int|false offset of the first matching suffix, or false when none matches
 */
protected function searchIfInRv($suffixes)
{
    $regionStart = $this->rvIndex;

    return $this->search($suffixes, $regionStart);
}
/**
 * Looks for one of the suffixes at the end of the word, restricted to R1.
 *
 * @param array $suffixes candidate suffixes, tried in the given order
 * @return int|false offset of the first matching suffix, or false when none matches
 */
protected function searchIfInR1($suffixes)
{
    $regionStart = $this->r1Index;

    return $this->search($suffixes, $regionStart);
}
/**
 * Looks for one of the suffixes at the end of the word, restricted to R2.
 *
 * @param array $suffixes candidate suffixes, tried in the given order
 * @return int|false offset of the first matching suffix, or false when none matches
 */
protected function searchIfInR2($suffixes)
{
    $regionStart = $this->r2Index;

    return $this->search($suffixes, $regionStart);
}
/**
 * Finds the first suffix of the list that terminates the current word.
 *
 * Suffixes are tried in the order given, so callers list longer suffixes
 * before the shorter ones they contain. A suffix only counts when its last
 * occurrence starts at or after $offset and runs up to the end of the word.
 *
 * @param array $suffixes candidate suffixes, tried in the given order
 * @param int   $offset   smallest offset at which the suffix may start
 * @return int|false offset at which the matching suffix starts (may be 0,
 *                   so compare with !== false), or false when none matches
 */
protected function search($suffixes, $offset = 0)
{
    $length = UTF8::strlen($this->word);

    // The region starts beyond the end of the word: nothing can match.
    if ($offset > $length) {
        return false;
    }

    foreach ($suffixes as $suffixe) {
        $position = UTF8::strrpos($this->word, $suffixe, $offset);

        // The occurrence must end exactly at the end of the word.
        // (Class name written as imported, UTF8, instead of the former "Utf8".)
        if ($position !== false && (UTF8::strlen($suffixe) + $position) == $length) {
            return $position;
        }
    }

    return false;
}
/**
 * Computes R1: the region after the first non-vowel following a vowel,
 * or the end of the word if there is no such non-vowel.
 *
 * Stores the start offset in $this->r1Index and the region text in $this->r1.
 */
protected function r1()
{
    $region = $this->rx($this->word);

    $this->r1Index = $region[0];
    $this->r1 = $region[1];
}
/**
 * Computes R2: the region after the first non-vowel following a vowel in R1,
 * or the end of the word if there is no such non-vowel.
 *
 * rx() is applied to the R1 text, so the offset it returns is relative to R1
 * and is shifted by $this->r1Index to make it relative to the whole word.
 */
protected function r2()
{
    $region = $this->rx($this->r1);

    $this->r2Index = $this->r1Index + $region[0];
    $this->r2 = $region[1];
}
/**
 * Common function for R1 and R2
 *
 * Finds the region after the first non-vowel following a vowel in $in,
 * or the end of $in if there is no such non-vowel.
 *      R1 : $in = $this->word
 *      R2 : $in = R1
 *
 * @param string $in text in which the region is searched
 * @return array two elements: the offset (relative to $in) at which the
 *               region starts, and the region text. When no vowel/non-vowel
 *               pair exists the offset equals the length of $in and the text is ''.
 */
protected function rx($in)
{
    $length = UTF8::strlen($in);

    // defaults: empty region located at the end of $in
    $value = '';
    $index = $length;

    // Single pass looking for the first non-vowel that directly follows a vowel.
    // Only real letters are inspected: a vowel in last position has no successor,
    // so it can no longer push the index past the end of $in (the empty read
    // beyond the last letter used to be counted as a non-vowel).
    $previousIsVowel = false;
    for ($i = 0; $i < $length; $i++) {
        $isVowel = in_array(UTF8::substr($in, $i, 1), static::$vowels);

        if ($previousIsVowel && !$isVowel) {
            $index = $i + 1;
            $value = UTF8::substr($in, $index);
            break;
        }

        $previousIsVowel = $isVowel;
    }

    return array($index, $value);
}
/**
 * Used by spanish, italian, portuguese, etc (but not by french)
 *
 * If the second letter is a consonant, RV is the region after the next following vowel,
 * or if the first two letters are vowels, RV is the region after the next consonant,
 * and otherwise (consonant-vowel case) RV is the region after the third letter.
 * But RV is the end of the word if these positions cannot be found.
 *
 * Stores the result in $this->rv / $this->rvIndex.
 *
 * @return bool|null true when RV was located or the word is shorter than
 *                   three letters; null when the scan found no suitable letter
 */
protected function rv()
{
    $length = UTF8::strlen($this->word);

    // Defaults: empty region located at the end of the word.
    $this->rv = '';
    $this->rvIndex = $length;

    if ($length < 3) {
        return true;
    }

    $firstIsVowel = in_array(UTF8::substr($this->word, 0, 1), static::$vowels);
    $secondIsVowel = in_array(UTF8::substr($this->word, 1, 1), static::$vowels);

    // Consonant-vowel start: RV is simply the region after the third letter.
    if (!$firstIsVowel && $secondIsVowel) {
        $this->rv = UTF8::substr($this->word, 3);
        $this->rvIndex = 3;

        return true;
    }

    // Remaining cases scan from the third letter on:
    //  - second letter is a consonant -> stop at the next vowel
    //  - first two letters are vowels -> stop at the next consonant
    $lookingForVowel = !$secondIsVowel;
    for ($i = 2; $i < $length; $i++) {
        $isVowel = in_array(UTF8::substr($this->word, $i, 1), static::$vowels);

        if ($isVowel === $lookingForVowel) {
            $this->rvIndex = $i + 1;
            $this->rv = UTF8::substr($this->word, $i + 1);

            return true;
        }
    }

    // Nothing found: RV stays at the end of the word (no explicit result, as before).
    return null;
}
}

View File

@ -0,0 +1,19 @@
<?php
namespace Wamania\Snowball\Stemmer;
/**
* Contract for a stemmer: a single stem() method that turns a word into its stem.
*
* @author Luís Cobucci <lcobucci@gmail.com>
*/
interface Stemmer
{
/**
* Main function to get the STEM of a word
*
* @param string $word A valid UTF-8 word
*
* @return string The stem computed for $word
*
* @throws \Exception Implementations may throw, e.g. when $word is not valid UTF-8
*/
public function stem($word);
}

View File

@ -0,0 +1,130 @@
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/swedish/stemmer.html
* @author wamania
*
*/
class Swedish extends Stem
{
/**
* All swedish vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö');
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes R1 (moved forward so that at least three
 * letters precede it) and then runs steps 1 to 3 in order.
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 text only.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // Only R1 is computed here; R2 is not used by this stemmer.
    $this->r1();

    // R1 is adjusted so that the region before it contains at least 3 letters.
    $minimumStart = 3;
    if ($this->r1Index < $minimumStart) {
        $this->r1Index = $minimumStart;
        $this->r1 = UTF8::substr($this->word, $minimumStart);
    }

    // Do each of steps 1, 2 and 3.
    $this->step1();
    $this->step2();
    $this->step3();

    return $this->word;
}
/**
 * Checks whether a word ends with a valid s-ending, i.e. one of
 *      b c d f g h j k l m n o p r t v y
 *
 * @param string $word word whose last letter is inspected
 * @return bool true when the last letter of $word is a valid s-ending
 */
private function hasValidSEnding($word)
{
    $validSEndings = array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y');

    return in_array(UTF8::substr($word, -1, 1), $validSEndings);
}
/**
 * Step 1
 *
 * Searches R1 for one of the noun/verb endings listed below and deletes it.
 * When none matches, a final "s" in R1 is deleted provided the letter in
 * front of it is a valid s-ending.
 *
 * @return bool|null true when an ending of the first list was deleted, null otherwise
 */
private function step1()
{
    // a arna erna heterna orna ad e ade ande arne are aste en anden aren heten
    // ern ar er heter or as arnas ernas ornas es ades andes ens arens hetens
    // erns at andet het ast
    //      delete
    $endings = array(
        'heterna', 'hetens', 'ornas', 'andes', 'arnas', 'heter', 'ernas', 'anden', 'heten', 'andet', 'arens',
        'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het',
        'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e'
    );

    $start = $this->searchIfInR1($endings);
    if ($start !== false) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // s
    //      delete if preceded by a valid s-ending
    $start = $this->searchIfInR1(array('s'));
    if ($start !== false) {
        $withoutS = UTF8::substr($this->word, 0, $start);
        if ($this->hasValidSEnding($withoutS)) {
            $this->word = $withoutS;
        }
    }
}
/**
 * Step 2
 *
 * When the word ends, in R1, with one of dd gd nn dt gt kt tt,
 * the last letter is deleted.
 */
private function step2()
{
    $doubles = array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt');

    if ($this->searchIfInR1($doubles) === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, -1);
}
/**
 * Step 3:
 *
 * Searches R1 for lig / ig / els and deletes it; failing that, a final
 * "löst" becomes "lös" and a final "fullt" becomes "full" (in both cases the
 * trailing t is dropped).
 *
 * @return bool|null true when the word was modified, null otherwise
 */
private function step3()
{
    // lig ig els
    //      delete
    $start = $this->searchIfInR1(array('lig', 'ig', 'els'));
    if ($start !== false) {
        $this->word = UTF8::substr($this->word, 0, $start);

        return true;
    }

    // löst  -> replace with lös
    // fullt -> replace with full
    // Both replacements amount to removing the last letter.
    if ($this->searchIfInR1(array('löst', 'fullt')) !== false) {
        $this->word = UTF8::substr($this->word, 0, -1);

        return true;
    }
}
}

View File

@ -0,0 +1,56 @@
<?php
namespace Wamania\Snowball;
use voku\helper\UTF8;
use Wamania\Snowball\Stemmer\Catalan;
use Wamania\Snowball\Stemmer\Danish;
use Wamania\Snowball\Stemmer\Dutch;
use Wamania\Snowball\Stemmer\English;
use Wamania\Snowball\Stemmer\Finnish;
use Wamania\Snowball\Stemmer\French;
use Wamania\Snowball\Stemmer\German;
use Wamania\Snowball\Stemmer\Italian;
use Wamania\Snowball\Stemmer\Norwegian;
use Wamania\Snowball\Stemmer\Portuguese;
use Wamania\Snowball\Stemmer\Romanian;
use Wamania\Snowball\Stemmer\Russian;
use Wamania\Snowball\Stemmer\Spanish;
use Wamania\Snowball\Stemmer\Stemmer;
use Wamania\Snowball\Stemmer\Swedish;
/**
 * Creates stemmer instances from a language code or language name.
 */
class StemmerFactory
{
    /**
     * Stemmer class => accepted lower-case language codes and names.
     */
    const LANGS = [
        Catalan::class => ['ca', 'cat', 'catalan'],
        Danish::class => ['da', 'dan', 'danish'],
        Dutch::class => ['nl', 'dut', 'nld', 'dutch'],
        English::class => ['en', 'eng', 'english'],
        Finnish::class => ['fi', 'fin', 'finnish'],
        French::class => ['fr', 'fre', 'fra', 'french'],
        German::class => ['de', 'deu', 'ger', 'german'],
        Italian::class => ['it', 'ita', 'italian'],
        Norwegian::class => ['no', 'nor', 'norwegian'],
        Portuguese::class => ['pt', 'por', 'portuguese'],
        Romanian::class => ['ro', 'rum', 'ron', 'romanian'],
        Russian::class => ['ru', 'rus', 'russian'],
        Spanish::class => ['es', 'spa', 'spanish'],
        Swedish::class => ['sv', 'swe', 'swedish']
    ];

    /**
     * Returns a new stemmer for the given language.
     *
     * The code is lower-cased before being compared with the entries of LANGS.
     *
     * @param string $code language code or name, in any letter case
     *
     * @throws NotFoundException when no stemmer is registered for $code
     */
    public static function create(string $code): Stemmer
    {
        $normalized = UTF8::strtolower($code);

        foreach (self::LANGS as $stemmerClass => $aliases) {
            if (!in_array($normalized, $aliases, true)) {
                continue;
            }

            return new $stemmerClass();
        }

        throw new NotFoundException(sprintf('Stemmer not found for %s', $normalized));
    }
}

View File

@ -0,0 +1,26 @@
<?php
namespace Wamania\Snowball;
/**
 * Stems words while reusing one stemmer instance per requested code.
 */
class StemmerManager
{
    /** @var array stemmer instances, keyed by the code they were requested with */
    private $stemmers;

    public function __construct()
    {
        $this->stemmers = [];
    }

    /**
     * Stems $word with the stemmer matching $isoCode.
     *
     * The stemmer is created through StemmerFactory on first use and then
     * reused for every later call made with the same $isoCode string.
     *
     * @throws NotFoundException
     */
    public function stem(string $word, string $isoCode): string
    {
        if (isset($this->stemmers[$isoCode])) {
            return $this->stemmers[$isoCode]->stem($word);
        }

        $stemmer = StemmerFactory::create($isoCode);
        $this->stemmers[$isoCode] = $stemmer;

        return $stemmer->stem($word);
    }
}