Your IP : 18.118.227.42


Current Path : /home/bitrix/ext_www/klimatlend.ua/bitrix/modules/search/tools/
Upload File :
Current File : /home/bitrix/ext_www/klimatlend.ua/bitrix/modules/search/tools/stemming.php

<?
function stemming_init($sLang="ru")
{
	static $arStemFunc = false;

	//Init all languages
	if($arStemFunc === false)
	{
		$arStemFunc = array();
		$rsLanguages = CLanguage::GetList(($b=""), ($o=""));
		while($arLanguage = $rsLanguages->Fetch())
			stemming_init($arLanguage["LID"]);
	}

	//Check if language was not used
	if($sLang !== false && !isset($arStemFunc[$sLang]))
	{
		$stemming_function_suf = $sLang;

		if(!function_exists("stemming_".$sLang))
		{
			$strFileName=$_SERVER["DOCUMENT_ROOT"].BX_PERSONAL_ROOT."/php_interface/".$sLang."/search/stemming.php";
			if(file_exists($strFileName))
				@include($strFileName);
			if(!function_exists("stemming_".$sLang))
			{
				$strFileName=$_SERVER["DOCUMENT_ROOT"]."/bitrix/modules/search/tools/".$sLang."/stemming.php";
				if(file_exists($strFileName))
					@include($strFileName);
				if(!function_exists("stemming_".$sLang))
				{
					$stemming_function_suf = "default";
				}
			}
		}

		$stemming_stop_function = "stemming_stop_".$sLang;
		if(!function_exists($stemming_stop_function))
			$stemming_stop_function = "stemming_stop_default";

		$stemming_upper_function = "stemming_upper_".$sLang;
		if(!function_exists($stemming_upper_function))
			$stemming_upper_function = "stemming_upper_default";

		$letters = stemming_letter_default();
		$stemming_letter_function = "stemming_letter_".$sLang;
		if(function_exists($stemming_letter_function))
			$letters .= $stemming_letter_function();
		$letters .= COption::GetOptionString("search", "letters");

		if(function_exists($stemming_letter_function))
			$abc = $stemming_letter_function();
		else
			$abc = "";

		if(strlen($abc) <= 0)
			$abc = stemming_letter_default();

		$arStemFunc[$sLang] = array(
			"stem" => "stemming_".$stemming_function_suf,
			"stop" => $stemming_stop_function,
			"upper" => $stemming_upper_function,
			"letters" => $letters,
			"pcre_letters" => "\\w\\d".str_replace(
				array("\\"  , "-"  , "^"  , "]"  , "/"),
				array("\\\\", "\\-", "\\^", "\\]", "\\/"),
				$letters
			),
			"abc" => $abc,
			"pcre_abc" => "\\w\\d".str_replace(
				array("\\"  , "-"  , "^"  , "]"  , "/"),
				array("\\\\", "\\-", "\\^", "\\]", "\\/"),
				$abc
			),
		);
	}

	if($sLang === false)
		return $arStemFunc;
	else
		return $arStemFunc[$sLang];
}

function stemming_upper($sText, $sLang="ru")
{
	$arStemFunc = stemming_init($sLang);
	$upper_function = $arStemFunc["upper"];
	return $upper_function($sText);
}

function stemming_split($sText, $sLang="ru")
{
	$arStemFunc = stemming_init($sLang);

	$words = array();

	$tok = " ";
	$sText = preg_replace("/[^".$arStemFunc["pcre_letters"]."]/".BX_UTF_PCRE_MODIFIER, $tok, ToUpper($sText));

	$word = strtok($sText, $tok);
	while($word !== false)
	{
		$word = substr($word, 0, 100);

		if(!isset($words[$word]))
			$words[$word] = strpos($sText, $word);

		$word = strtok($tok);
	}

	return $words;
}

function stemming($sText, $sLang="ru", $bIgnoreStopWords = false, $bReturnPositions = false)
{
	static $STOP_CACHE=array();
	if(!isset($STOP_CACHE[$sLang]))
		$STOP_CACHE[$sLang] = array();
	$stop_cache = &$STOP_CACHE[$sLang];

	//Result
	$stems = array();

	//Get info about all languages
	$arStemInfo = stemming_init(false);
	//Add default functions if language was not defined
	if(!isset($arStemInfo[$sLang]))
		$arStemInfo[$sLang] = stemming_init($sLang);

	$stem_func = $arStemInfo[$sLang]["stem"];
	$pcre_abc = "/[^".$arStemInfo[$sLang]["pcre_abc"]."]+/".BX_UTF_PCRE_MODIFIER;

	//Delimiter of the words
	$tok = " ";

	if($bReturnPositions)
	{
		$sText = preg_replace("/[^".$arStemInfo[$sLang]["pcre_letters"].".!?]+/".BX_UTF_PCRE_MODIFIER, $tok, ToUpper($sText));
		$sText = preg_replace("/[!?]+/".BX_UTF_PCRE_MODIFIER, ".", $sText);
	}
	else
	{
		$sText = preg_replace("/[^".$arStemInfo[$sLang]["pcre_letters"]."]+/".BX_UTF_PCRE_MODIFIER, $tok, ToUpper($sText));
	}

	//Parse text
	$words = strtok($sText, $tok);
	$pos = 1;
	while($words !== false)
	{
		if($bReturnPositions)
			$words = explode(".", $words);
		else
			$words = array($words);

		foreach($words as $i => $word)
		{
			$word = substr($word, 0, 50);

			if($bReturnPositions)
			{
				if($i > 0)
					$pos += 5; //Sentence distance
				if(!strlen($word))
					continue;
			}

			//Try to stem starting with desired language
			//1 - stemming may return more than one word
			$stem = $stem_func($word, 1);
			$stop_lang = $sLang;

			//If word equals it's stemming
			//and has letters not from ABC
			if(
				!is_array($stem)
				&& $stem === $word
				&& preg_match($pcre_abc, $word)
			)
			{
				//Do the best to detect correct one
				$guess = stemming_detect($word, $arStemInfo, $sLang);
				if(strlen($guess[0]))
				{
					$stem = $guess[0];
					$stop_lang = $guess[1];
				}
			}

			if($bIgnoreStopWords)
			{
				if(is_array($stem))
				{
					foreach($stem as $st)
						$stems[$st] = isset($stems[$st])? $stems[$st] + $pos: $pos;
				}
				else
				{
					$stems[$stem] = isset($stems[$stem])? $stems[$stem] + $pos: $pos;
				}
			}
			else
			{
				$stop_func = $arStemInfo[$stop_lang]["stop"];
				if(is_array($stem))
				{
					foreach($stem as $st)
					{
						if(!isset($stop_cache[$st]))
							$stop_cache[$st] = $stop_func($st);

						if($stop_cache[$st])
							$stems[$st] = isset($stems[$st])? $stems[$st] + $pos: $pos;
					}
				}
				else
				{
					if(!isset($stop_cache[$stem]))
						$stop_cache[$stem] = $stop_func($stem);

					if($stop_cache[$stem])
						$stems[$stem] = isset($stems[$stem])? $stems[$stem] + $pos: $pos;
				}
			}

			if($bReturnPositions)
				$pos++;
		}
		//Next word
		$words = strtok($tok);
	}

	return $stems;
}

function stemming_detect($word, $arStemInfo, $skipLang)
{
	$stem = "";
	$lang = "";

	foreach($arStemInfo as $sGuessLang => $arInfo)
	{
		if($sGuessLang === $skipLang)
			continue;

		//Word has letters not from ABC, so skip to next language
		if(preg_match("/[^".$arInfo["pcre_abc"]."]+/".BX_UTF_PCRE_MODIFIER, $word))
			continue;

		$stem = $arInfo["stem"]($word);
		$lang = $sGuessLang;

		//It looks like stemming succseeded
		if($stem !== $word)
			break;

		//Check if stop function flag word as stop
		$stop_func = $arInfo["stop"];
		if(!$stop_func($stem))
			break;
	}

	//It' s the best we can do
	//return word and lang to use as stop
	return array($stem, $lang);
}

function stemming_upper_default($sText)
{
	return ToUpper($sText);
}

function stemming_default($sText)
{
	return $sText;
}
function stemming_stop_default($sWord)
{
	if(strlen($sWord) < 2)
		return false;
	else
		return true;
}
function stemming_letter_default()
{
	return "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM0123456789";
}
?>