Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
54 / 54 |
|
100.00% |
7 / 7 |
CRAP | |
100.00% |
1 / 1 |
SearchEngineService | |
100.00% |
54 / 54 |
|
100.00% |
7 / 7 |
32 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
match | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
6 | |||
getSearchEngines | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
matchVariant | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
6 | |||
matchHostPattern | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
matchesHiddenKeyword | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
7 | |||
extractKeyword | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
8 |
1 | <?php |
2 | |
3 | declare(strict_types=1); |
4 | |
5 | namespace Drupal\visitors\Service; |
6 | |
7 | use Drupal\Core\Config\ConfigFactoryInterface; |
8 | use Drupal\visitors\VisitorsSearchEngineInterface; |
9 | |
/**
 * Service for matching URLs to search engines.
 *
 * Search engine definitions are read from the 'sites' key of the
 * 'visitors.search_engines' configuration object. Each definition provides a
 * 'label' and a list of 'variants'. A variant may define:
 * - 'urls': host patterns, optionally containing the '{}' wildcard.
 * - 'params': query parameter names or delimited regexes used to extract the
 *   search keyword.
 * - 'hiddenkeyword': literal values or delimited regexes that exclude a URL
 *   from matching.
 */
final class SearchEngineService implements VisitorsSearchEngineInterface {

  /**
   * The config factory service.
   *
   * @var \Drupal\Core\Config\ConfigFactoryInterface
   */
  private ConfigFactoryInterface $configFactory;

  /**
   * Cached search engines configuration.
   *
   * NULL until the configuration has been loaded for the first time.
   *
   * @var array|null
   */
  private ?array $searchEngines = NULL;

  /**
   * Constructs a new SearchEngineService.
   *
   * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
   *   The config factory service.
   */
  public function __construct(ConfigFactoryInterface $config_factory) {
    $this->configFactory = $config_factory;
  }

  /**
   * {@inheritdoc}
   */
  public function match(string $url): ?array {
    $parsed_url = parse_url($url);

    // parse_url() returns FALSE for seriously malformed URLs; relative URLs
    // parse fine but have no host and can never belong to a search engine.
    if (!is_array($parsed_url) || !isset($parsed_url['host'])) {
      return NULL;
    }

    $host = strtolower($parsed_url['host']);
    $query_string = $parsed_url['query'] ?? '';
    $path = $parsed_url['path'] ?? '';

    foreach ($this->getSearchEngines() as $engine) {
      // Tolerate incomplete definitions instead of emitting warnings.
      $variants = $engine['variants'] ?? [];
      if (!is_array($variants)) {
        continue;
      }

      foreach ($variants as $variant) {
        if (!is_array($variant)) {
          continue;
        }

        if ($this->matchVariant($host, $query_string, $path, $variant)) {
          // The first matching variant wins.
          return [
            'name' => $engine['label'] ?? NULL,
            'variant' => $variant,
            'keyword' => $this->extractKeyword($query_string, $path, $variant),
          ];
        }
      }
    }

    return NULL;
  }

  /**
   * Gets the search engines configuration.
   *
   * The configuration is loaded once and cached on the service instance.
   *
   * @return array
   *   The search engines configuration, or an empty array when the 'sites'
   *   key is missing or is not an array.
   */
  private function getSearchEngines(): array {
    if ($this->searchEngines === NULL) {
      $sites = $this->configFactory->get('visitors.search_engines')->get('sites');
      // Guard against malformed configuration so the typed property never
      // receives a non-array value.
      $this->searchEngines = is_array($sites) ? $sites : [];
    }

    return $this->searchEngines;
  }

  /**
   * Matches a URL against a search engine variant.
   *
   * A variant matches when at least one of its host patterns matches and none
   * of its hidden keyword patterns match the path or query string.
   *
   * @param string $host
   *   The hostname from the URL.
   * @param string $query_string
   *   The query string from the URL.
   * @param string $path
   *   The path from the URL.
   * @param array $variant
   *   The search engine variant configuration.
   *
   * @return bool
   *   TRUE if the variant matches, FALSE otherwise.
   */
  private function matchVariant(string $host, string $query_string, string $path, array $variant): bool {
    $host_matches = FALSE;

    foreach ($variant['urls'] ?? [] as $pattern) {
      if ($this->matchHostPattern($host, $pattern)) {
        $host_matches = TRUE;
        break;
      }
    }

    if (!$host_matches) {
      return FALSE;
    }

    // Hidden keywords exclude an otherwise matching URL.
    foreach ($variant['hiddenkeyword'] ?? [] as $hidden_pattern) {
      if ($this->matchesHiddenKeyword($query_string, $path, $hidden_pattern)) {
        return FALSE;
      }
    }

    return TRUE;
  }

  /**
   * Matches a hostname against a pattern.
   *
   * Patterns are compared case-insensitively by lowercasing the pattern; the
   * caller is expected to pass an already lowercased host. A '{}' placeholder
   * such as in 'google.{}' or '{}.google.com' matches one or more characters
   * from the set a-z, 0-9, '.' and '-'.
   *
   * @param string $host
   *   The hostname to match.
   * @param string $pattern
   *   The pattern to match against.
   *
   * @return bool
   *   TRUE if the pattern matches, FALSE otherwise.
   */
  private function matchHostPattern(string $host, string $pattern): bool {
    $pattern = strtolower($pattern);

    if (strpos($pattern, '{}') === FALSE) {
      // Exact match.
      return $host === $pattern;
    }

    // Quote every literal piece separately and join the pieces with the
    // wildcard expression, so no placeholder marker has to survive quoting.
    $quoted_parts = array_map(
      static function (string $part): string {
        return preg_quote($part, '/');
      },
      explode('{}', $pattern)
    );

    // The D modifier makes '$' match only at the very end of the host, so a
    // trailing newline cannot slip through.
    $regex = '/^' . implode('[a-z0-9.-]+', $quoted_parts) . '$/D';

    return preg_match($regex, $host) === 1;
  }

  /**
   * Checks if the URL matches a hidden keyword pattern.
   *
   * @param string $query_string
   *   The query string from the URL.
   * @param string $path
   *   The path from the URL.
   * @param string $pattern
   *   The hidden keyword pattern: either a '/'-delimited regex that is tested
   *   against the path plus query string, or a literal compared to the query
   *   string and to the path.
   *
   * @return bool
   *   TRUE if matches hidden keyword, FALSE otherwise.
   */
  private function matchesHiddenKeyword(string $query_string, string $path, string $pattern): bool {
    if (!$this->isRegexPattern($pattern)) {
      // Handle exact matches.
      return $query_string === $pattern || $path === $pattern;
    }

    // Special handling for Google search patterns: do not exclude /search
    // URLs that carry an actual 'q' parameter. The parameter name is anchored
    // so that 'oq=', 'aq=' and similar do not count as 'q='.
    if ($pattern === '/\/search(\?.*)?/' && $path === '/search' && preg_match('/(?:^|&)q=/', $query_string) === 1) {
      return FALSE;
    }

    return preg_match($pattern, $this->buildUrlPart($path, $query_string)) === 1;
  }

  /**
   * Extracts the search keyword from the URL.
   *
   * The variant's 'params' are tried in order. A '/'-delimited regex is run
   * against the path plus query string and its first capture group is
   * returned URL-decoded; the first regex that matches ends the search even
   * when it has no capture group. Any other entry is treated as a query
   * parameter name.
   *
   * @param string $query_string
   *   The query string from the URL.
   * @param string $path
   *   The path from the URL.
   * @param array $variant
   *   The search engine variant configuration.
   *
   * @return string|null
   *   The extracted keyword, or NULL if not found.
   */
  private function extractKeyword(string $query_string, string $path, array $variant): ?string {
    // Parsed lazily and only once; the query string does not change between
    // iterations.
    $query_params = NULL;

    foreach ($variant['params'] ?? [] as $param) {
      // Handle regex parameter patterns.
      if ($this->isRegexPattern($param)) {
        if (preg_match($param, $this->buildUrlPart($path, $query_string), $matches) === 1) {
          // The raw URL part is still encoded, so decode the capture here.
          return isset($matches[1]) ? urldecode($matches[1]) : NULL;
        }
        continue;
      }

      // Handle query parameters.
      if ($query_params === NULL) {
        parse_str($query_string, $query_params);
      }

      $value = $query_params[$param] ?? NULL;
      // parse_str() has already URL-decoded the value; decoding again would
      // corrupt keywords containing '+' or '%'. Array values (e.g. 'q[]=a')
      // are not keywords and are skipped. '0' is a valid keyword, so only the
      // empty string is rejected.
      if (is_string($value) && $value !== '') {
        return $value;
      }
    }

    return NULL;
  }

  /**
   * Tells whether a configured value is written as a '/'-delimited regex.
   *
   * @param string $value
   *   The configured pattern or parameter name.
   *
   * @return bool
   *   TRUE if the value starts and ends with '/' (at least two characters).
   */
  private function isRegexPattern(string $value): bool {
    return preg_match('/^\/.*\/$/', $value) === 1;
  }

  /**
   * Builds the "path?query" string that regex patterns are tested against.
   *
   * @param string $path
   *   The path from the URL.
   * @param string $query_string
   *   The query string from the URL.
   *
   * @return string
   *   The path, followed by '?' and the query string when the query string is
   *   not empty. A query string of '0' is kept.
   */
  private function buildUrlPart(string $path, string $query_string): string {
    return $query_string !== '' ? $path . '?' . $query_string : $path;
  }

}