aboutsummaryrefslogtreecommitdiff
path: root/mod/search/start.php
diff options
context:
space:
mode:
Diffstat (limited to 'mod/search/start.php')
-rw-r--r--mod/search/start.php522
1 files changed, 333 insertions, 189 deletions
diff --git a/mod/search/start.php b/mod/search/start.php
index 47405450a..8a112a3a3 100644
--- a/mod/search/start.php
+++ b/mod/search/start.php
@@ -1,54 +1,64 @@
<?php
/**
- * Elgg core search.
- *
- * @package Elgg
- * @subpackage Core
- * @author Curverider Ltd <info@elgg.com>, The MITRE Corporation <http://www.mitre.org>
- * @link http://elgg.org/
- */
+ * Elgg search plugin
+ *
+ */
+
+elgg_register_event_handler('init','system','search_init');
/**
- * Initialise search helper functions.
- *
+ * Initialize search plugin
*/
function search_init() {
global $CONFIG;
require_once 'search_hooks.php';
// page handler for search actions and results
- register_page_handler('search','search_page_handler');
+ elgg_register_page_handler('search', 'search_page_handler');
// register some default search hooks
- register_plugin_hook('search', 'object', 'search_objects_hook');
- register_plugin_hook('search', 'user', 'search_users_hook');
+ elgg_register_plugin_hook_handler('search', 'object', 'search_objects_hook');
+ elgg_register_plugin_hook_handler('search', 'user', 'search_users_hook');
+ elgg_register_plugin_hook_handler('search', 'group', 'search_groups_hook');
- // @todo pull this out into groups
- register_plugin_hook('search', 'group', 'search_groups_hook');
+ // tags and comments are a bit different.
+ // register search types and hooks for them.
+ elgg_register_plugin_hook_handler('search_types', 'get_types', 'search_custom_types_tags_hook');
+ elgg_register_plugin_hook_handler('search', 'tags', 'search_tags_hook');
- // tags are a bit different.
- // register a custom search type and a hook for that.
- register_plugin_hook('search_types', 'get_types', 'search_custom_types_tags_hook');
- register_plugin_hook('search', 'tags', 'search_tags_hook');
+ elgg_register_plugin_hook_handler('search_types', 'get_types', 'search_custom_types_comments_hook');
+ elgg_register_plugin_hook_handler('search', 'comments', 'search_comments_hook');
// get server min and max allowed chars for ft searching
- $word_lens = get_data('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max');
-
$CONFIG->search_info = array();
- $CONFIG->search_info['min_chars'] = $word_lens[0]->min;
- $CONFIG->search_info['max_chars'] = $word_lens[0]->max;
+
+ // can't use get_data() here because some servers don't have these globals set,
+ // which throws a db exception.
+ $dblink = get_db_link('read');
+ $r = mysql_query('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max', $dblink);
+ if ($r && ($word_lens = mysql_fetch_assoc($r))) {
+ $CONFIG->search_info['min_chars'] = $word_lens['min'];
+ $CONFIG->search_info['max_chars'] = $word_lens['max'];
+ } else {
+ // uhhh these are good numbers.
+ $CONFIG->search_info['min_chars'] = 4;
+ $CONFIG->search_info['max_chars'] = 90;
+ }
// add in CSS for search elements
- extend_view('css', 'search/css');
+ elgg_extend_view('css/elgg', 'search/css');
+
+ // extend view for elgg topbar search box
+ elgg_extend_view('page/elements/header', 'search/header');
}
/**
* Page handler for search
*
- * @param array $page Page elements from pain page handler
+ * @param array $page Page elements from core page handler
+ * @return bool
*/
function search_page_handler($page) {
- global $CONFIG;
// if there is no q set, we're being called from a legacy installation
// it expects a search by tags.
@@ -59,242 +69,315 @@ function search_page_handler($page) {
//set_input('search_type', 'tags');
}
- include_once('index.php');
+ $base_dir = elgg_get_plugins_path() . 'search/pages/search';
+
+ include_once("$base_dir/index.php");
+ return true;
}
/**
- * Return a string with highlighted matched elements.
- * Checks for "s
- * Provides context for matched elements.
- * Will not return more than $max_length of full context.
- * Only highlights words
+ * Return a string with highlighted matched queries and relevant context
+ * Determines context based upon occurrence and distance of words from each other.
*
- * @param unknown_type $haystack
- * @param unknown_type $need
- * @param unknown_type $context
- * @param unknown_type $max_length
- * @return unknown_type
+ * @param string $haystack
+ * @param string $query
+ * @param int $min_match_context = 30
+ * @param int $max_length = 300
+ * @param bool $tag_match Search is for tags. Don't ignore words.
+ * @return string
*/
-function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 250) {
+function search_get_highlighted_relevant_substrings($haystack, $query, $min_match_context = 30, $max_length = 300, $tag_match = false) {
+
$haystack = strip_tags($haystack);
- $haystack_lc = strtolower($haystack);
+ $haystack_length = elgg_strlen($haystack);
+ $haystack_lc = elgg_strtolower($haystack);
- // for now don't worry about "s or boolean operators
- $needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle)));
- $words = explode(' ', $needle);
+ if (!$tag_match) {
+ $words = search_remove_ignored_words($query, 'array');
+ } else {
+ $words = array();
+ }
- $min_chars = $CONFIG->search_info['min_chars'];
- // if > ft_min_word == not running in literal mode.
- if ($needle >= $min_chars) {
- // clean out any words that are ignored by mysql
- foreach ($words as $i => $word) {
- if (strlen($word) < $min_chars) {
- unset ($words[$i]);
- }
- }
+ // if haystack <= $max_length return the entire haystack w/formatting immediately
+ if ($haystack_length <= $max_length) {
+ $return = search_highlight_words($words, $haystack);
+
+ return $return;
}
- $substr_counts = array();
- $str_pos = array();
- // get the full count of matches.
+ // get the starting positions and lengths for all matching words
+ $starts = array();
+ $lengths = array();
foreach ($words as $word) {
- $word = strtolower($word);
- $count = substr_count($haystack, $word);
- $word_len = strlen($word);
+ $word = elgg_strtolower($word);
+ $count = elgg_substr_count($haystack_lc, $word);
+ $word_len = elgg_strlen($word);
+ $haystack_len = elgg_strlen($haystack_lc);
// find the start positions for the words
- // get the context for words based upon
if ($count > 1) {
- $str_pos[$word] = array();
$offset = 0;
- while (FALSE !== $pos = strpos($haystack, $word, $offset)) {
- $str_pos[$word][] = $pos;
+ while (FALSE !== $pos = elgg_strpos($haystack_lc, $word, $offset)) {
+ $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+ $starts[] = $start;
+ $stop = $pos + $word_len + $min_match_context;
+ $lengths[] = $stop - $start;
$offset += $pos + $word_len;
+
+ if ($offset >= $haystack_len) {
+ break;
+ }
}
} else {
- $str_pos[$word] = array(strpos($haystack, $word));
+ $pos = elgg_strpos($haystack_lc, $word);
+ $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+ $starts[] = $start;
+ $stop = $pos + $word_len + $min_match_context;
+ $lengths[] = $stop - $start;
}
- $substr_counts[$word] = $count;
}
-//A test with multiple words and now more in the subject too because words need to be everywhere
-
- // sort by order of occurence
- krsort($substr_counts);
- $full_count = array_sum($substr_counts);
+ $offsets = search_consolidate_substrings($starts, $lengths);
+ // figure out if we can adjust the offsets and lengths
+ // in order to return more context
+ $total_length = array_sum($offsets);
+ $add_length = 0;
+ if ($total_length < $max_length && $offsets) {
+ $add_length = floor((($max_length - $total_length) / count($offsets)) / 2);
+ $starts = array();
+ $lengths = array();
+ foreach ($offsets as $offset => $length) {
+ $start = ($offset - $add_length > 0) ? $offset - $add_length : 0;
+ $length = $length + $add_length;
+ $starts[] = $start;
+ $lengths[] = $length;
+ }
+ $offsets = search_consolidate_substrings($starts, $lengths);
+ }
- // get full number of matches against all words to see how many we actually want to look at.
+ // sort by order of string size descending (which is roughly
+ // the proximity of matched terms) so we can keep the
+ // substrings with terms closest together and discard
+ // the others as needed to fit within $max_length.
+ arsort($offsets);
+ $return_strs = array();
+ $total_length = 0;
+ foreach ($offsets as $start => $length) {
+ $string = trim(elgg_substr($haystack, $start, $length));
+ // continue past if adding this substring exceeds max length
+ if ($total_length + $length > $max_length) {
+ continue;
+ }
+ $total_length += $length;
+ $return_strs[$start] = $string;
+ }
-// $desc = search_get_relevant_substring($entity->description, $params['query'], '<strong class="searchMatch">', '</strong>');
+ // put the strings in order of occurrence
+ ksort($return_strs);
+ // add ...s where needed
+ $return = implode('...', $return_strs);
+ if (!array_key_exists(0, $return_strs)) {
+ $return = "...$return";
+ }
- $params['query'];
- // "this is"just a test "silly person"
+ // add to end of string if last substring doesn't hit the end.
+ $starts = array_keys($return_strs);
+ $last_pos = $starts[count($starts)-1];
+ if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
+ $return .= '...';
+ }
- // check for "s
- $words_quotes = explode('"', $needle);
+ $return = search_highlight_words($words, $return);
- $words_orig = explode(' ', $needle);
- $words = array();
+ return $return;
+}
- foreach ($words_orig as $i => $word) {
- // figure out if we have a special operand
- $operand = substr($word, 0, 1);
- switch($operand) {
- case '"':
- // find the matching " if any. else, remove the "
- if (substr_count($query, '"') < 2) {
- $words[] = substr($word, 1);
- } else {
- $word = substr($word, 1);
- $word_i = $i;
- while ('"' != strpos($words_orig[$word_i], '"')) {
- $word .= " {$words_orig[$word_i]}";
- unset($words_orig[$word_i]);
- }
+/**
+ * Takes an array of offsets and lengths and consolidates any
+ * overlapping entries, returning an array of new offsets and lengths
+ *
+ * Offsets and lengths are specified in separate arrays because of possible
+ * index collisions with the offsets.
+ *
+ * @param array $offsets
+ * @param array $lengths
+ * @return array
+ */
+function search_consolidate_substrings($offsets, $lengths) {
+ // sort offsets by occurrence
+ asort($offsets, SORT_NUMERIC);
- }
+ // reset the indexes maintaining association with the original offsets.
+ $offsets = array_merge($offsets);
- break;
+ $new_lengths = array();
+ foreach ($offsets as $i => $offset) {
+ $new_lengths[] = $lengths[$i];
+ }
- case '+':
- // remove +
- $words[] = substr($word, 1);
- break;
+ $lengths = $new_lengths;
- case '~':
- case '-':
- // remove this from highlighted list.
+ $return = array();
+ $count = count($offsets);
+ for ($i=0; $i<$count; $i++) {
+ $offset = $offsets[$i];
+ $length = $lengths[$i];
+ $end_pos = $offset + $length;
+ // find the next entry that doesn't overlap
+ while (array_key_exists($i+1, $offsets) && $end_pos > $offsets[$i+1]) {
+ $i++;
+ if (!array_key_exists($i, $offsets)) {
break;
+ }
+ $end_pos = $lengths[$i] + $offsets[$i];
}
- }
- // pick out " queries
- if (substr_count($query, '"') >= 2) {
+ $length = $end_pos - $offset;
+ // will never have a colliding offset, so can return as a single array
+ $return[$offset] = $length;
}
- // ignore queries starting with -
-
-
- // @todo figure out a way to "center" the matches within the max_length.
- // if only one match, its context is $context + $max_length / 2
- // if 2 matches, its context is $context + $max_length / 4
- // if 3 matches, its context is $context + $max_length / 6
- // $context per match = $min_match_context + ($max_length / $num_count_match)
-
- // if $max_length / ($matched_count * 2) < $context
- // only match against the first X matches where $context >= $context
+ return $return;
}
/**
- * Returns a matching string with $context amount of context, optionally
- * surrounded by $before and $after.
- *
- * If no match is found, restricts string to $context*2 starting from strpos 0.
+ * Safely highlights the words in $words found in $string avoiding recursion
*
- * @param str $haystack
- * @param str $needle
- * @param str $before
- * @param str $after
- * @param int $context
- * @return str
+ * @param array $words
+ * @param string $string
+ * @return string
*/
-function search_get_relevant_substring($haystack, $needle, $before = '', $after = '', $context = 75) {
- $haystack = strip_tags($haystack);
- $needle = strip_tags($needle);
-
- $pos = strpos(strtolower($haystack), strtolower($needle));
-
- if ($pos === FALSE) {
- $str = substr($haystack, 0, $context*2);
- if (strlen($haystack) > $context*2) {
- $str .= '...';
- }
+function search_highlight_words($words, $string) {
+ $i = 1;
+ $replace_html = array(
+ 'strong' => rand(10000, 99999),
+ 'class' => rand(10000, 99999),
+ 'search-highlight' => rand(10000, 99999),
+ 'search-highlight-color' => rand(10000, 99999)
+ );
- return $str;
+ foreach ($words as $word) {
+ // remove any boolean mode operators
+ $word = preg_replace("/([\-\+~])([\w]+)/i", '$2', $word);
+
+ // escape the delimiter and any other regexp special chars
+ $word = preg_quote($word, '/');
+
+ $search = "/($word)/i";
+
+ // @todo
+ // must replace with placeholders in case one of the search terms is
+ // in the html string.
+ // later, will replace the placeholders with the actual html.
+ // Yeah this is hacky. I'm tired.
+ $strong = $replace_html['strong'];
+ $class = $replace_html['class'];
+ $highlight = $replace_html['search-highlight'];
+ $color = $replace_html['search-highlight-color'];
+
+ $replace = "<$strong $class=\"$highlight $color{$i}\">$1</$strong>";
+ $string = preg_replace($search, $replace, $string);
+ $i++;
}
- $start_pos = $pos - $context;
-
- if ($start_pos < 0) {
- $start_pos = 0;
+ foreach ($replace_html as $replace => $search) {
+ $string = str_replace($search, $replace, $string);
}
- // get string from -context to +context
- $matched = substr($haystack, $start_pos, $context*2);
+ return $string;
+}
- // add elipses to front.
- if ($start_pos > 0) {
- $matched = "...$matched";
- }
+/**
+ * Returns a query with stop and too short words removed.
+ * (Unless the entire query is < ft_min_word_chars, in which case
+ * it's taken literally.)
+ *
+ * @param string $query
+ * @param str $format Return as an array or a string
+ * @return mixed
+ */
+function search_remove_ignored_words($query, $format = 'array') {
+ global $CONFIG;
+
+ // don't worry about "s or boolean operators
+ //$query = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($query)));
+ $query = stripslashes(strip_tags($query));
+
+ $words = explode(' ', $query);
- // add elipses to end.
- if ($pos + strlen($needle) + $context*2 < strlen($haystack)) {
- $matched = "$matched...";
+ $min_chars = $CONFIG->search_info['min_chars'];
+ // if > ft_min_word we're not running in literal mode.
+ if (elgg_strlen($query) >= $min_chars) {
+ // clean out any words that are ignored by mysql
+ foreach ($words as $i => $word) {
+ if (elgg_strlen($word) < $min_chars) {
+ unset ($words[$i]);
+ }
+ }
}
- // surround if needed
- if ($before || $after) {
- $matched = str_ireplace($needle, $before . $needle . $after, $matched);
+ if ($format == 'string') {
+ return implode(' ', $words);
}
- return $matched;
+ return $words;
}
/**
- * Passes entities, count, and original params to the view functions for
+ * Passes results, and original params to the view functions for
* search type.
*
- * @param array $entities
- * @param int $count
+ * @param array $results
* @param array $params
+ * @param string $view_type = list, entity or layout
* @return string
*/
-function search_get_listing_html($entities, $count, $params) {
- if (!is_array($entities) || !$count) {
- return FALSE;
+function search_get_search_view($params, $view_type) {
+ switch ($view_type) {
+ case 'list':
+ case 'entity':
+ case 'layout':
+ break;
+
+ default:
+ return FALSE;
}
$view_order = array();
- // check if there's a special search view for this type:subtype
+ // check if there's a special search list view for this type:subtype
if (isset($params['type']) && $params['type'] && isset($params['subtype']) && $params['subtype']) {
- $view_order[] = "search/{$params['type']}/{$params['subtype']}/listing";
+ $view_order[] = "search/{$params['type']}/{$params['subtype']}/$view_type";
}
// also check for the default type
if (isset($params['type']) && $params['type']) {
- $view_order[] = "search/{$params['type']}/listing";
+ $view_order[] = "search/{$params['type']}/$view_type";
}
// check search types
if (isset($params['search_type']) && $params['search_type']) {
- $view_order[] = "search/{$params['search_type']}/listing";
+ $view_order[] = "search/{$params['search_type']}/$view_type";
}
- // finally default to a search listing default
- $view_order[] = "search/listing";
-
- $vars = array(
- 'entities' => $entities,
- 'count' => $count,
- 'params' => $params
- );
+ // finally default to a search list default
+ $view_order[] = "search/$view_type";
foreach ($view_order as $view) {
if (elgg_view_exists($view)) {
- return elgg_view($view, $vars);
+ return $view;
}
}
@@ -309,50 +392,111 @@ function search_get_listing_html($entities, $count, $params) {
* @param array $params Original search params
* @return str
*/
-function search_get_where_sql($table, $fields, $params) {
+function search_get_where_sql($table, $fields, $params, $use_fulltext = TRUE) {
global $CONFIG;
$query = $params['query'];
// add the table prefix to the fields
foreach ($fields as $i => $field) {
- $fields[$i] = "$table.$field";
+ if ($table) {
+ $fields[$i] = "$table.$field";
+ }
}
+
+ $where = '';
// if query is shorter than the min for fts words
// it's likely a single acronym or similar
// switch to literal mode
- if (strlen($query) < $CONFIG->search_info['min_chars']) {
+ if (elgg_strlen($query) < $CONFIG->search_info['min_chars']) {
$likes = array();
+ $query = sanitise_string($query);
foreach ($fields as $field) {
$likes[] = "$field LIKE '%$query%'";
}
$likes_str = implode(' OR ', $likes);
- $where = "($table.guid = e.guid AND ($likes_str))";
+ $where = "($likes_str)";
} else {
- // if using advanced or paired "s, switch into boolean mode
- if ((isset($params['advanced_search']) && $params['advanced_search']) || substr_count($query, '"') >= 2 ) {
+ // if we're not using full text, rewrite the query for bool mode.
+ // exploiting a feature(ish) of bool mode where +-word is the same as -word
+ if (!$use_fulltext) {
+ $query = '+' . str_replace(' ', ' +', $query);
+ }
+
+ // if using advanced, boolean operators, or paired "s, switch into boolean mode
+ $booleans_used = preg_match("/([\-\+~])([\w]+)/i", $query);
+ $advanced_search = (isset($params['advanced_search']) && $params['advanced_search']);
+ $quotes_used = (elgg_substr_count($query, '"') >= 2);
+
+ if (!$use_fulltext || $booleans_used || $advanced_search || $quotes_used) {
$options = 'IN BOOLEAN MODE';
} else {
- $options = 'IN NATURAL LANGUAGE MODE';
+ // natural language mode is default and this keyword isn't supported in < 5.1
+ //$options = 'IN NATURAL LANGUAGE MODE';
+ $options = '';
}
-
+
// if short query, use query expansion.
- if (strlen($query) < 6) {
- $options .= ' WITH QUERY EXPANSION';
- }
- // if query is shorter than the ft_min_word_len switch to literal mode.
+ // @todo doesn't seem to be working well.
+// if (elgg_strlen($query) < 5) {
+// $options .= ' WITH QUERY EXPANSION';
+// }
+ $query = sanitise_string($query);
+
$fields_str = implode(',', $fields);
- $where = "($table.guid = e.guid AND (MATCH ($fields_str) AGAINST ('$query' $options)))";
+ $where = "(MATCH ($fields_str) AGAINST ('$query' $options))";
}
return $where;
}
-function search_get_query_where_sql($table, $query) {
- // if there are multiple "s or 's it's a literal string.
-}
+/**
+ * Returns ORDER BY sql for insertion into elgg_get_entities().
+ *
+ * @param str $entities_table Prefix for entities table.
+ * @param str $type_table Prefix for the type table.
+ * @param str $sort ORDER BY part
+ * @param str $order ASC or DESC
+ * @return str
+ */
+function search_get_order_by_sql($entities_table, $type_table, $sort, $order) {
+
+ $on = NULL;
+
+ switch ($sort) {
+ default:
+ case 'relevance':
+ // default is relevance descending.
+ // ascending relevancy is silly and complicated.
+ $on = '';
+ break;
+ case 'created':
+ $on = "$entities_table.time_created";
+ break;
+ case 'updated':
+ $on = "$entities_table.time_updated";
+ break;
+ case 'action_on':
+ // @todo not supported yet in core
+ $on = '';
+ break;
+ case 'alpha':
+ // @todo not supported yet because both title
+ // and name columns are used for this depending
+ // on the entity, which we don't always know. >:O
+ break;
+ }
+ $order = strtolower($order);
+ if ($order != 'asc' && $order != 'desc') {
+ $order = 'DESC';
+ }
-/** Register init system event **/
+ if ($on) {
+ $order_by = "$on $order";
+ } else {
+ $order_by = '';
+ }
-register_elgg_event_handler('init','system','search_init'); \ No newline at end of file
+ return $order_by;
+}