@@ -11,18 +11,18 @@ class NGramFactory
1111{
1212 const BIGRAM = 2 ;
1313 const TRIGRAM = 3 ;
14-
14+
1515 /**
16- * Protect the constructor
17- */
16+ * Protect the constructor
17+ */
protected function __construct()
{
    // Intentionally empty: the protected visibility blocks `new NGramFactory()`
    // from outside the class hierarchy.
}
19-
19+
2020 /**
21- * Generate Ngrams from the tokens
22- * @param array $tokens
23- * @param int $nGramSize
24- * @return array return an array of the ngrams
25- */
21+ * Generate Ngrams from the tokens
22+ * @param array $tokens
23+ * @param int $nGramSize
24+ * @return array return an array of the ngrams
25+ */
2626 static public function create (array $ tokens , $ nGramSize = self ::BIGRAM , $ separator = ' ' ) : array
2727 {
2828 $ separatorLength = strlen ($ separator );
@@ -31,17 +31,100 @@ static public function create(array $tokens, $nGramSize = self::BIGRAM, $separat
3131 return [];
3232 }
3333 $ ngrams = array_fill (0 , $ length , '' ); // initialize the array
34-
34+
3535 for ($ index = 0 ; $ index < $ length ; $ index ++)
3636 {
3737 for ($ jindex = 0 ; $ jindex < $ nGramSize ; $ jindex ++)
3838 {
39- $ ngrams [$ index ] .= $ tokens [$ index + $ jindex ];
40- if ($ jindex < $ nGramSize - $ separatorLength ) {
39+ $ ngrams [$ index ] .= $ tokens [$ index + $ jindex ];
40+ // Condition changed: what matters is the position of the index relative to the n-gram size, not the length of the separator
41+ if ($ jindex < $ nGramSize - 1 ) {
4142 $ ngrams [$ index ] .= $ separator ;
4243 }
4344 }
4445 }
45- return $ ngrams ;
46+ return $ ngrams ;
47+ }
48+
/**
 * Compute the frequency of every distinct ngram together with the
 * positional frequencies of its tokens.
 *
 * For each distinct ngram string the result holds an array where:
 *  - index 0 is the number of times the ngram occurs in $ngrams;
 *  - indexes 1..N (N = number of tokens in the ngram) are the number of
 *    ngrams that carry the same token at the same position;
 *  - when N > 2, indexes N+1 onwards are the number of ngrams that share
 *    a pair of tokens at the same two positions, one index per pair
 *    (p, q) with p < q, numbered in the order (0,1), (0,2), ..., (1,2), ...
 *
 * Entries are inserted token by token (a token frequency followed by the
 * pair frequencies that start at that token), so the iteration order of
 * the inner array is not sorted by index.
 *
 * @param array  $ngrams ngram strings, tokens joined by $sep
 * @param string $sep    separator used to split each ngram into tokens
 * @return array map of ngram string => array of frequencies
 */
static public function getFreq(array $ngrams, string $sep = ' ') : array
{
    // Occurrences of each distinct ngram; the keys are the distinct ngrams.
    $ngramsUnique = array_count_values($ngrams);

    // Every ngram (duplicates included) split into its tokens.
    $ngramsArray = self::ngramsAsArray($sep, $ngrams);

    $ngramsFinal = [];

    foreach ($ngramsUnique as $ngramString => $ngramFrequency) {
        // array_count_values() converts numeric strings into integer keys,
        // so cast back to string before splitting.
        $ngramArray = explode($sep, (string) $ngramString);
        $ngramSize = count($ngramArray);

        // Slot 0 always holds the frequency of the ngram itself.
        $frequencies = [$ngramFrequency];

        // Pair frequencies are numbered sequentially right after the token
        // slots. A running counter keeps every pair on its own index; the
        // former "$ngramSize + $kToken + $i" formula collided for ngrams of
        // four or more tokens. For trigrams both schemes yield 4, 5, 6.
        $pairIndex = $ngramSize + 1;

        foreach ($ngramArray as $kToken => $token) {
            // Frequency of this token at this position.
            $frequencies[$kToken + 1] = self::countFreq($ngramsArray, $token, $kToken);

            if ($ngramSize > 2) {
                // Combined frequency of this token with each later token.
                for ($i = $kToken + 1; $i < $ngramSize; $i++) {
                    $frequencies[$pairIndex++] = self::countFreq($ngramsArray, $token, $kToken, $ngramArray[$i], $i);
                }
            }
        }

        $ngramsFinal[$ngramString] = $frequencies;
    }

    return $ngramsFinal;
}
87+
/**
 * Count how many ngrams carry the given token(s) at the given position(s).
 *
 * With only $str1/$pos1, an ngram matches when its token at $pos1 is
 * identical to $str1. When both $str2 and $pos2 are supplied, the ngram
 * must additionally have $str2 at $pos2. Ngrams that are too short to have
 * a token at a requested position never match.
 *
 * @param array       $ngramsArray list of ngrams, each an array of tokens
 * @param string      $str1        first token to look for
 * @param int         $pos1        position of the first token
 * @param string|null $str2        optional second token
 * @param int|null    $pos2        position of the second token
 * @return int number of matching ngrams
 */
static private function countFreq(array $ngramsArray, string $str1, int $pos1, string $str2 = null, int $pos2 = null) : int
{
    // The pair check applies only when both the second token and its
    // position are given; decide once instead of on every iteration.
    $matchPair = isset($str2, $pos2);

    $count = 0;

    foreach ($ngramsArray as $ngramArray) {
        // "?? null" avoids an undefined-offset notice/warning for ngrams
        // with fewer tokens than the requested position.
        if ($str1 !== ($ngramArray[$pos1] ?? null)) {
            continue;
        }

        if (!$matchPair || $str2 === ($ngramArray[$pos2] ?? null)) {
            $count++;
        }
    }

    return $count;
}
116+
/**
 * Split every ngram string into the list of its tokens.
 *
 * The keys of $ngrams are discarded; the result is indexed from 0 in the
 * same order as the input.
 *
 * @param string $sep    separator the tokens of each ngram are joined with
 * @param array  $ngrams ngram strings
 * @return array list of token arrays, one per ngram
 */
static private function ngramsAsArray(string $sep, array $ngrams) : array {
    $splitNgram = function ($ngram) use ($sep) {
        return explode($sep, $ngram);
    };

    // array_values() drops the input keys, so the result is a plain list.
    return array_map($splitNgram, array_values($ngrams));
}
47130}
0 commit comments