39 #include "fts0types.ic"
47 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
49 #define RANK_DOWNGRADE (-1.0F)
50 #define RANK_UPGRADE (1.0F)
55 #define MAX_PROXIMITY_ITEM 128
58 #define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
59 #define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
62 #define RANKING_WORDS_INIT_LEN 4
65 static const double FTS_NORMALIZE_COEFF = 0.0115F;
69 typedef std::map<std::string, ulint> word_map_t;
70 typedef std::vector<std::string> word_vector_t;
112 que_t* read_nodes_graph;
118 ibool collect_positions;
248 fts_query_index_fetch_nodes(
258 fts_query_filter_doc_ids(
267 ibool calc_doc_count);
276 fts_query_find_doc_id(
297 __attribute__((nonnull, warn_unused_result));
306 fts_phrase_or_proximity_search(
321 fts_proximity_get_positions(
336 fts_query_terms_in_document(
374 for (i = 0; i < n_rows; ++
i) {
379 for (j = 0; j < n_cols; ++j) {
381 printf(
"%2lu ", FTS_ELEM(table, n_cols, i, j));
404 ulint
size = (r + 1) * (c + 1) *
sizeof(ulint);
411 for (i = r; i >= 0; --
i) {
414 for (j = c; j >= 0; --j) {
416 if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
418 FTS_ELEM(table, c, i, j) = 0;
420 }
else if (p1[i] == p2[j]) {
422 FTS_ELEM(table, c, i, j) = FTS_ELEM(
423 table, c, i + 1, j + 1) + 1;
430 FTS_ELEM(table, c, i + 1, j),
431 FTS_ELEM(table, c, i, j + 1));
433 FTS_ELEM(table, c, i, j) = value;
438 len = FTS_ELEM(table, c, 0, 0);
440 fts_print_lcs_table(table, r, c);
441 printf(
"\nLen=%lu\n", len);
455 fts_query_compare_rank(
479 #ifdef FTS_UTF8_DEBUG
497 memcpy(str.
f_str, src, len);
500 str.
f_str[len] =
'\0';
544 fts_ranking_words_create(
549 ranking->
words =
static_cast<byte*
>(
551 ranking->
words_len = RANKING_WORDS_INIT_LEN;
576 fts_ranking_words_add(
585 word_map_t::iterator it;
589 it = query->
word_map->lower_bound(word);
596 std::pair<std::string, ulint>(word, pos));
602 byte_offset = pos / CHAR_BIT;
604 byte* words = ranking->
words;
607 while (byte_offset >= words_len) {
611 ranking->
words =
static_cast<byte*
>(
618 ut_ad(byte_offset < ranking->words_len);
619 bit_offset = pos % CHAR_BIT;
620 ranking->
words[byte_offset] |= 1 << bit_offset;
628 fts_ranking_words_get_next(
636 ulint max_pos = ranking->
words_len * CHAR_BIT;
639 while (*pos < max_pos) {
640 ulint byte_offset = *pos / CHAR_BIT;
641 ulint bit_offset = *pos % CHAR_BIT;
643 if (ranking->
words[byte_offset] & (1 << bit_offset)) {
653 ut_ad(*pos < query->word_vector->size());
654 *word = (byte*)query->
word_vector->at((
size_t)*pos).c_str();
667 fts_query_add_word_freq(
679 memset(&word_freq, 0,
sizeof(word_freq));
681 word_freq.
word =
static_cast<byte*
>(
685 memcpy(word_freq.
word, word, len);
697 + SIZEOF_RBT_NODE_ADD
709 fts_query_add_doc_freq(
718 if (
rbt_search(doc_freqs, &parent, &doc_id) != 0) {
721 memset(&doc_freq, 0,
sizeof(doc_freq));
726 parent.last =
rbt_add_node(doc_freqs, &parent, &doc_freq);
740 fts_query_union_doc_id(
759 fts_ranking_words_create(query, &ranking);
773 fts_query_remove_doc_id(
802 fts_query_change_ranking(
820 ranking->
rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
824 if (ranking->
rank >= 1.0F) {
825 ranking->
rank = 1.0F;
826 }
else if (ranking->
rank <= -1.0F) {
827 ranking->
rank = -1.0F;
838 fts_query_intersect_doc_id(
866 new_ranking.
words = NULL;
872 if (ranking->
words == NULL) {
879 rank += ranking->
rank;
882 }
else if (rank <= -1.0F) {
891 new_ranking.
rank = rank;
892 new_ranking.
doc_id = doc_id;
895 &new_ranking) != 0) {
896 if (new_ranking.
words == NULL) {
897 fts_ranking_words_create(query, &new_ranking);
903 ranking->
words = NULL;
907 &parent, &new_ranking);
919 fts_query_free_doc_ids(
932 if (ranking->
words) {
933 ranking->
words = NULL;
955 fts_query_add_word_to_document(
964 if (query->
flags == FTS_OPT_RANKING) {
982 if (ranking != NULL) {
983 fts_ranking_words_add(query, ranking, (
char*)word);
991 fts_query_check_node(
1018 query->
error = fts_query_filter_doc_ids(
1019 query, token->
f_str, word_freqs, node,
1020 node->
ilist, ilist_size, TRUE);
1029 fts_cache_find_wildcard(
1045 strncpy((
char*) term, (
char*) token->
f_str, srch_text.
f_len);
1046 term[srch_text.
f_len] =
'\0';
1047 srch_text.
f_str = term;
1055 ibool forward = FALSE;
1058 cur_node = parent.last;
1061 index_cache->
charset, &srch_text, &word->
text) == 0) {
1063 nodes = word->
nodes;
1065 for (i = 0; nodes && i < ib_vector_size(nodes); ++
i) {
1072 ib_vector_get_const(nodes, i));
1080 word_freqs = rbt_value(
1084 query->
error = fts_query_filter_doc_ids(
1085 query, srch_text.
f_str,
1089 if (query->
error != DB_SUCCESS) {
1098 index_cache->
words, cur_node);
1102 index_cache->
words, cur_node);
1114 cur_node = parent.last;
1125 static __attribute__((nonnull, warn_unused_result))
1127 fts_query_difference(
1138 #ifdef FTS_INTERNAL_DIAG_PRINT
1139 fprintf(stderr,
"DIFFERENCE: Searching: '%.*s'\n",
1140 (
int) token->f_len, token->f_str);
1143 if (query->doc_ids) {
1144 n_doc_ids = rbt_size(query->doc_ids);
1148 if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1153 que_t* graph = NULL;
1157 rw_lock_x_lock(&cache->
lock);
1162 ut_a(index_cache != NULL);
1165 if (query->cur_node->term.wildcard
1166 && query->flags != FTS_PROXIMITY
1167 && query->flags != FTS_PHRASE) {
1168 fts_cache_find_wildcard(query, index_cache, token);
1172 for (i = 0; nodes && i < ib_vector_size(nodes)
1173 && query->error == DB_SUCCESS; ++
i) {
1177 ib_vector_get_const(nodes, i));
1179 fts_query_check_node(query, token, node);
1183 rw_lock_x_unlock(&cache->
lock);
1186 if (query->error != DB_SUCCESS) {
1188 return(query->error);
1197 trx, &graph, &query->fts_index_table, token, &fetch);
1200 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1201 if (error != DB_SUCCESS) {
1202 query->error = error;
1205 fts_que_graph_free(graph);
1209 ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1211 return(query->error);
1217 static __attribute__((nonnull, warn_unused_result))
1219 fts_query_intersect(
1224 trx_t* trx = query->trx;
1229 #ifdef FTS_INTERNAL_DIAG_PRINT
1230 fprintf(stderr,
"INTERSECT: Searching: '%.*s'\n",
1231 (
int) token->f_len, token->f_str);
1236 if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1237 ulint n_doc_ids = 0;
1242 que_t* graph = NULL;
1246 ut_a(!query->intersection);
1248 n_doc_ids = rbt_size(query->doc_ids);
1255 query->total_size += SIZEOF_RBT_CREATE;
1259 if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1264 doc_id = rbt_value(
doc_id_t, node);
1265 query->lower_doc_id = *doc_id;
1268 doc_id = rbt_value(
doc_id_t, node);
1269 query->upper_doc_id = *doc_id;
1272 query->lower_doc_id = 0;
1273 query->upper_doc_id = 0;
1278 rw_lock_x_lock(&cache->
lock);
1284 ut_a(index_cache != NULL);
1286 if (query->cur_node->term.wildcard) {
1288 fts_cache_find_wildcard(query, index_cache, token);
1292 for (i = 0; nodes && i < ib_vector_size(nodes)
1293 && query->error == DB_SUCCESS; ++
i) {
1297 ib_vector_get_const(nodes, i));
1299 fts_query_check_node(query, token, node);
1303 rw_lock_x_unlock(&cache->
lock);
1306 if (query->error != DB_SUCCESS) {
1308 return(query->error);
1317 trx, &graph, &query->fts_index_table, token, &fetch);
1320 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1321 if (error != DB_SUCCESS) {
1322 query->error = error;
1325 fts_que_graph_free(graph);
1327 if (query->error == DB_SUCCESS) {
1330 fts_query_free_doc_ids(query, query->doc_ids);
1331 query->doc_ids = query->intersection;
1332 query->intersection = NULL;
1334 ut_a(!query->multi_exist || (query->multi_exist
1335 && rbt_size(query->doc_ids) <= n_doc_ids));
1339 return(query->error);
1357 rw_lock_x_lock(&cache->
lock);
1363 ut_a(index_cache != NULL);
1366 && query->
flags != FTS_PROXIMITY
1367 && query->
flags != FTS_PHRASE) {
1369 fts_cache_find_wildcard(query, index_cache, token);
1376 for (i = 0; nodes && i < ib_vector_size(nodes)
1377 && query->
error == DB_SUCCESS; ++
i) {
1381 ib_vector_get_const(nodes, i));
1383 fts_query_check_node(query, token, node);
1387 rw_lock_x_unlock(&cache->
lock);
1389 return(query->
error);
1395 static __attribute__((nonnull, warn_unused_result))
1403 ulint n_doc_ids = 0;
1404 trx_t* trx = query->trx;
1405 que_t* graph = NULL;
1411 #ifdef FTS_INTERNAL_DIAG_PRINT
1412 fprintf(stderr,
"UNION: Searching: '%.*s'\n",
1413 (
int) token->f_len, token->f_str);
1416 if (query->doc_ids) {
1417 n_doc_ids = rbt_size(query->doc_ids);
1420 if (token->f_len == 0) {
1421 return(query->error);
1426 ut_ad(*token->f_str !=
'%');
1428 fts_query_cache(query, token);
1437 trx, &graph, &query->fts_index_table, token, &fetch);
1440 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1441 if (error != DB_SUCCESS) {
1442 query->error = error;
1445 fts_que_graph_free(graph);
1447 if (query->error == DB_SUCCESS) {
1450 ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1454 if (query->doc_ids) {
1455 n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1459 return(query->error);
1468 fts_query_process_doc_id(
1475 if (query->
flags == FTS_OPT_RANKING) {
1479 switch (query->
oper) {
1481 fts_query_union_doc_id(query, doc_id, rank);
1485 fts_query_intersect_doc_id(query, doc_id, rank);
1489 fts_query_remove_doc_id(query, doc_id);
1493 fts_query_change_ranking(query, doc_id, TRUE);
1497 fts_query_union_doc_id(query, doc_id, rank);
1498 fts_query_change_ranking(query, doc_id, TRUE);
1502 fts_query_union_doc_id(query, doc_id, rank);
1503 fts_query_change_ranking(query, doc_id, FALSE);
1528 ut_a(!rbt_empty(doc_ids));
1549 query->
error = fts_query_process_doc_id(
1552 if (query->
error != DB_SUCCESS) {
1553 return(query->
error);
1558 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1559 fts_query_add_word_to_document(query, ranking->
doc_id,
1567 fts_query_free_doc_ids(query, query->
doc_ids);
1580 fts_query_skip_word(
1586 while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1598 fts_query_match_phrase_terms(
1616 for (i = 1; ptr < end && i < ib_vector_size(tokens); ) {
1625 phrase->
charset, ptr, (byte*) end,
1628 if (match.
f_len > 0) {
1631 ib_vector_get_const(tokens, i));
1636 phrase->
charset, token, &cmp_str);
1641 && (distance == ULINT_UNDEFINED
1642 || distance == 0)) {
1657 ut_a(distance != ULINT_UNDEFINED);
1659 ptr = fts_query_skip_word(ptr, end);
1663 if (distance != ULINT_UNDEFINED && distance > 0) {
1674 ut_a(i <= ib_vector_size(tokens));
1677 if (i == ib_vector_size(tokens)) {
1678 phrase->
found = TRUE;
1681 return(phrase->
found);
1690 fts_proximity_is_word_in_range(
1701 for (ulint i = 0; i < proximity_pos->
n_pos; i++) {
1702 ulint cur_pos = proximity_pos->
min_pos[
i];
1708 while (cur_pos <= proximity_pos->max_pos[i]) {
1716 start + total_len, &str, &offset);
1737 if (n_word && n_word <= phrase->distance) {
1750 fts_query_match_phrase(
1763 const byte* end = start + cur_len;
1769 ut_a(ib_vector_size(tokens) > 0);
1770 ut_a(ib_vector_size(positions) > 0);
1773 ib_vector_get_const(tokens, 0));
1777 for (i = phrase->
match->
start; i < ib_vector_size(positions); ++
i) {
1785 pos = *(ulint*) ib_vector_get_const(positions, i);
1787 if (pos == ULINT_UNDEFINED) {
1791 if (pos < prev_len) {
1800 ptr = match.
f_str = start + pos;
1808 phrase->
charset, start + pos, (byte*) end,
1811 if (match.
f_len == 0) {
1818 phrase->
charset, first, &cmp_str) == 0) {
1822 if (ib_vector_size(phrase->
tokens) == 1) {
1823 phrase->
found = TRUE;
1830 if (fts_query_match_phrase_terms(phrase, &ptr,
1837 return(phrase->
found);
1845 fts_query_fetch_document(
1855 ulint total_len = 0;
1856 byte* document_text = NULL;
1860 phrase->
found = FALSE;
1869 byte* data =
static_cast<byte*
>(
1870 dfield_get_data(dfield));
1883 if (field_len != UNIV_SQL_NULL) {
1884 total_len += field_len + 1;
1891 phrase->
heap, total_len));
1893 if (!document_text) {
1902 byte* data =
static_cast<byte*
>(
1903 dfield_get_data(dfield));
1914 if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
1916 memcpy(document_text + prev_len, data, cur_len);
1920 fts_query_match_phrase(
1922 static_cast<byte*>(data),
1928 if (phrase->
found) {
1936 prev_len += cur_len + 1;
1941 ut_ad(prev_len <= total_len);
1943 phrase->
found = fts_proximity_is_word_in_range(
1944 phrase, document_text, total_len);
1947 return(phrase->
found);
1970 for (i = 0; exp && !select->
found; ++
i) {
1972 void* data = dfield_get_data(dfield);
1977 if (len != UNIV_SQL_NULL && len != 0) {
1985 if (len != UNIV_SQL_NULL && len != 0) {
1987 fts_query_find_doc_id(select, data, len);
2005 static __attribute__((nonnull, warn_unused_result))
2007 fts_query_find_term(
2021 trx_t* trx = query->trx;
2023 trx->
op_info =
"fetching FTS index matching nodes";
2026 info = (*graph)->info;
2031 select.
found = FALSE;
2034 select.
word_freq = fts_query_add_word_freq(query, word->f_str);
2054 &query->fts_index_table,
2056 "DECLARE FUNCTION my_func;\n"
2057 "DECLARE CURSOR c IS"
2058 " SELECT doc_count, ilist\n"
2060 " WHERE word LIKE :word AND "
2061 " first_doc_id <= :min_doc_id AND "
2062 " last_doc_id >= :max_doc_id\n"
2063 " ORDER BY first_doc_id;\n"
2067 "WHILE 1 = 1 LOOP\n"
2068 " FETCH c INTO my_func();\n"
2069 " IF c % NOTFOUND THEN\n"
2079 if (error == DB_SUCCESS) {
2086 fprintf(stderr,
" InnoDB: Warning: lock wait "
2087 "timeout reading FTS index. "
2092 fprintf(stderr,
" InnoDB: Error: %lu "
2093 "while reading FTS index.\n", error);
2101 *found = select.
found;
2123 ulint* total = user_arg;
2129 void* data = dfield_get_data(dfield);
2132 if (len != UNIV_SQL_NULL && len != 0) {
2145 static __attribute__((nonnull, warn_unused_result))
2147 fts_query_total_docs_containing_term(
2157 trx_t* trx = query->trx;
2159 trx->
op_info =
"fetching FTS index document count";
2173 &query->fts_index_table,
2175 "DECLARE FUNCTION my_func;\n"
2176 "DECLARE CURSOR c IS"
2177 " SELECT doc_count\n"
2179 " WHERE word = :word "
2180 " ORDER BY first_doc_id;\n"
2184 "WHILE 1 = 1 LOOP\n"
2185 " FETCH c INTO my_func();\n"
2186 " IF c % NOTFOUND THEN\n"
2195 if (error == DB_SUCCESS) {
2202 fprintf(stderr,
" InnoDB: Warning: lock wait "
2203 "timeout reading FTS index. "
2208 fprintf(stderr,
" InnoDB: Error: %lu "
2209 "while reading FTS index.\n", error);
2216 fts_que_graph_free(graph);
2224 static __attribute__((nonnull, warn_unused_result))
2226 fts_query_terms_in_document(
2236 trx_t* trx = query->trx;
2238 trx->
op_info =
"fetching FTS document term count";
2250 query->fts_index_table.suffix =
"DOC_ID";
2253 &query->fts_index_table,
2255 "DECLARE FUNCTION my_func;\n"
2256 "DECLARE CURSOR c IS"
2259 " WHERE doc_id = :doc_id "
2263 "WHILE 1 = 1 LOOP\n"
2264 " FETCH c INTO my_func();\n"
2265 " IF c % NOTFOUND THEN\n"
2274 if (error == DB_SUCCESS) {
2281 fprintf(stderr,
" InnoDB: Warning: lock wait "
2282 "timeout reading FTS doc id table. "
2287 fprintf(stderr,
" InnoDB: Error: %lu "
2288 "while reading FTS doc id table.\n",
2296 fts_que_graph_free(graph);
2305 static __attribute__((nonnull, warn_unused_result))
2307 fts_query_match_document(
2318 memset(&phrase, 0x0,
sizeof(phrase));
2320 phrase.
match = match;
2323 phrase.
charset = get_doc->index_cache->charset;
2325 get_doc->index_cache->index->table);
2328 *found = phrase.
found = FALSE;
2332 fts_query_fetch_document, &phrase);
2334 if (error != DB_SUCCESS) {
2336 fprintf(stderr,
"InnoDB: Error: (%s) matching document.\n",
2339 *found = phrase.
found;
2351 static __attribute__((nonnull, warn_unused_result))
2353 fts_query_is_in_proximity_range(
2361 fts_cache_t* cache = query->index->table->fts->cache;
2365 memset(&get_doc, 0x0,
sizeof(get_doc));
2366 memset(&phrase, 0x0,
sizeof(phrase));
2368 rw_lock_x_lock(&cache->
lock);
2370 rw_lock_x_unlock(&cache->
lock);
2379 phrase.
found = FALSE;
2383 fts_query_fetch_document, &phrase);
2385 if (err != DB_SUCCESS) {
2387 "Error: (%s) in verification phase of proximity "
2392 if (get_doc.get_document_graph) {
2393 fts_que_graph_free(get_doc.get_document_graph);
2394 get_doc.get_document_graph = NULL;
2399 return(err == DB_SUCCESS && phrase.
found);
2406 static __attribute__((nonnull, warn_unused_result))
2408 fts_query_search_phrase(
2422 fts_cache_t* cache = query->index->table->fts->cache;
2424 n_matched = ib_vector_size(query->matched);
2427 memset(&get_doc, 0x0,
sizeof(get_doc));
2429 rw_lock_x_lock(&cache->
lock);
2436 rw_lock_x_unlock(&cache->
lock);
2438 #ifdef FTS_INTERNAL_DIAG_PRINT
2440 fprintf(stderr,
" Start phrase search\n");
2446 for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++
i) {
2448 ibool found = FALSE;
2455 if (match->
doc_id != 0) {
2457 query->error = fts_query_match_document(
2458 orig_tokens, &get_doc,
2459 match, query->distance, &found);
2461 if (query->error == DB_SUCCESS && found) {
2464 query->error = fts_query_process_doc_id(query,
2466 if (query->error != DB_SUCCESS) {
2470 for (z = 0; z < ib_vector_size(tokens); z++) {
2474 fts_query_add_word_to_document(
2484 if (get_doc.get_document_graph) {
2485 fts_que_graph_free(get_doc.get_document_graph);
2486 get_doc.get_document_graph = NULL;
2489 return(query->error);
2495 static __attribute__((nonnull, warn_unused_result))
2497 fts_query_phrase_search(
2505 ulint len = phrase->f_len;
2511 charset = query->fts_index_table.charset;
2513 heap_alloc = ib_heap_allocator_create(heap);
2515 tokens = ib_vector_create(heap_alloc,
sizeof(
fts_string_t), 4);
2516 orig_tokens = ib_vector_create(heap_alloc,
sizeof(
fts_string_t), 4);
2518 if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2519 query->flags = FTS_PROXIMITY;
2521 query->flags = FTS_PHRASE;
2525 while (cur_pos < len) {
2526 fts_cache_t* cache = query->index->table->fts->cache;
2534 reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
2535 reinterpret_cast<const byte*>(phrase->f_str) + len,
2536 &result_str, &offset);
2549 ib_vector_push(tokens, NULL));
2551 token->
f_str =
static_cast<byte*
>(
2560 &parent, token) != 0
2565 fts_query_add_word_freq(query, token->
f_str);
2567 ib_vector_pop(tokens);
2575 ib_vector_push(orig_tokens, NULL));
2582 num_token = ib_vector_size(tokens);
2583 ut_ad(ib_vector_size(orig_tokens) >= num_token);
2586 if (num_token > 0) {
2589 trx_t* trx = query->trx;
2591 que_t* graph = NULL;
2597 if (!query->matched) {
2600 heap_alloc = ib_heap_allocator_create(heap);
2602 if (!(query->flags & FTS_PROXIMITY)
2603 && !(query->flags & FTS_PHRASE)) {
2604 query->matched = ib_vector_create(
2608 ut_a(num_token < MAX_PROXIMITY_ITEM);
2609 query->match_array =
2613 sizeof(query->matched));
2615 for (i = 0; i < num_token; i++) {
2616 query->match_array[
i] =
2622 query->matched = query->match_array[0];
2631 for (i = 0; i < num_token; i++) {
2636 if (query->flags & FTS_PROXIMITY
2637 || query->flags & FTS_PHRASE) {
2638 query->matched = query->match_array[
i];
2642 trx, &graph, &query->fts_index_table,
2646 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2647 if (error != DB_SUCCESS) {
2648 query->error = error;
2651 fts_que_graph_free(graph);
2654 fts_query_cache(query, token);
2656 if (!(query->flags & FTS_PHRASE)
2657 && !(query->flags & FTS_PROXIMITY)) {
2664 || query->error != DB_SUCCESS) {
2671 if (ib_vector_size(orig_tokens) == 1
2676 n_matched = ib_vector_size(query->match_array[0]);
2678 for (i = 0; i < n_matched; i++) {
2681 query->match_array[0], i));
2683 query->error = fts_query_process_doc_id(
2684 query, match->
doc_id, 0);
2685 if (query->error != DB_SUCCESS) {
2689 fts_query_add_word_to_document(
2698 if (query->flags & FTS_PROXIMITY) {
2699 fts_phrase_or_proximity_search(query, tokens);
2710 matched = fts_phrase_or_proximity_search(query, tokens);
2711 query->matched = query->match_array[0];
2715 ut_ad(query->error == DB_SUCCESS);
2716 query->error = fts_query_search_phrase(
2717 query, orig_tokens, tokens);
2724 if (query->error != DB_SUCCESS) {
2733 query->matched = NULL;
2735 return(query->error);
2741 static __attribute__((nonnull, warn_unused_result))
2748 switch (query->oper) {
2753 query->error = fts_query_union(query, token);
2757 query->error = fts_query_intersect(query, token);
2761 query->error = fts_query_difference(query, token);
2768 return(query->error);
2777 fts_query_get_token(
2783 byte* new_ptr = NULL;
2789 token->
f_len = str_len;
2795 token->
f_len = str_len + 1;
2800 token->
f_str[str_len] =
'%';
2803 new_ptr = token->
f_str;
2831 switch (node->
type) {
2846 query->collect_positions = TRUE;
2848 query->
error = fts_query_phrase_search(query, &token);
2850 query->collect_positions = FALSE;
2853 fts_query_free_doc_ids(query, query->
doc_ids);
2864 fts_query_add_word_freq(query, node->
term.
ptr);
2866 ptr = fts_query_get_token(node, &token);
2867 query->
error = fts_query_execute(query, &token);
2882 return(query->
error);
2895 fts_ast_callback visitor,
2903 bool will_be_ignored =
false;
2910 if (!node || !node->
next) {
2914 cur_oper = node->
oper;
2917 parent_doc_ids = query->
doc_ids;
2931 arg, &will_be_ignored);
2935 query->
oper = cur_oper;
2936 subexpr_doc_ids = query->
doc_ids;
2939 query->
doc_ids = parent_doc_ids;
2942 if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) {
2943 error = fts_merge_doc_ids(query, subexpr_doc_ids);
2951 fts_query_free_doc_ids(query, subexpr_doc_ids);
2962 fts_query_find_doc_id(
2976 while (decoded < len && !select->found) {
2992 if (min_pos == 0 && last_pos > select->
min_pos) {
3001 decoded = ptr - (byte*) data;
3006 if (doc_id == select->
doc_id && min_pos > 0) {
3011 doc_freq = fts_query_add_doc_freq(
3015 if (doc_freq->
freq == 0) {
3016 doc_freq->
freq = freq;
3019 select->
found = TRUE;
3024 return(select->
found);
3034 fts_query_filter_doc_ids(
3043 ibool calc_doc_count)
3045 byte* ptr =
static_cast<byte*
>(data);
3051 while (decoded < len) {
3066 if (calc_doc_count) {
3071 if (query->collect_positions) {
3076 ib_vector_push(query->
matched, NULL));
3080 heap_alloc = ib_vector_allocator(query->
matched);
3085 heap_alloc,
sizeof(ulint), 64);
3089 +
sizeof(ulint) * 64;
3098 if (query->collect_positions) {
3099 ib_vector_push(match->
positions, &last_pos);
3106 last_pos = (ulint) -1;
3108 if (query->collect_positions) {
3109 ut_a(match != NULL);
3110 ib_vector_push(match->
positions, &last_pos);
3115 doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3118 if (doc_freq->
freq == 0) {
3119 doc_freq->
freq = freq;
3126 decoded = ptr - (byte*) data;
3130 if (!query->collect_positions) {
3132 fts_query_process_doc_id(query, doc_id, 0);
3135 fts_query_add_word_to_document(query, doc_id, word);
3154 fts_query_read_node(
3172 memset(&node, 0,
sizeof(node));
3186 term[word->
f_len] = 0;
3204 byte* data =
static_cast<byte*
>(
3205 dfield_get_data(dfield));
3208 ut_a(len != UNIV_SQL_NULL);
3241 error = fts_query_filter_doc_ids(
3242 query, word_freq->
word, word_freq,
3243 &node, data, len, FALSE);
3266 fts_query_index_fetch_nodes(
3277 void* data = dfield_get_data(dfield);
3280 key.
f_str =
static_cast<byte*
>(data);
3281 key.
f_len = dfield_len;
3288 if (query->
error != DB_SUCCESS) {
3300 fts_query_calculate_idf(
3318 if (total_docs == word_freq->
doc_count) {
3324 word_freq->
idf = log10(1.0001);
3326 word_freq->
idf = log10(
3333 fprintf(stderr,
"'%s' -> " UINT64PF
"/" UINT64PF
3346 fts_query_calculate_ranking(
3359 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3366 ut_ad(word != NULL);
3382 weight = (double) doc_freq->
freq * word_freq->
idf;
3392 fts_query_add_ranking(
3401 if (
rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3406 ranking->
rank += new_ranking->
rank;
3434 new_ranking.
doc_id = doc_id;
3442 return(ranking->
rank);
3452 fts_query_prepare_result(
3460 bool result_is_null =
false;
3462 if (result == NULL) {
3465 memset(result, 0x0,
sizeof(*result));
3471 result_is_null =
true;
3474 if (query->
flags == FTS_OPT_RANKING) {
3501 ranking.
words = NULL;
3525 fts_query_calculate_ranking(query, ranking);
3532 ranking->
words = NULL;
3534 if (!result_is_null) {
3545 if (result_is_null) {
3559 fts_query_get_result(
3564 if (rbt_size(query->
doc_ids) > 0 || query->
flags == FTS_OPT_RANKING) {
3566 result = fts_query_prepare_result(query, result);
3570 memset(result, 0,
sizeof(*result));
3585 if (query->read_nodes_graph) {
3586 fts_que_graph_free(query->read_nodes_graph);
3598 fts_query_free_doc_ids(query, query->
doc_ids);
3635 memset(query, 0,
sizeof(*query));
3652 memset(&state, 0x0,
sizeof(state));
3677 fts_query_can_optimize(
3684 if (flags & FTS_EXPAND) {
3692 query->
flags = FTS_OPT_RANKING;
3704 fts_query_str_preprocess(
3706 const byte* query_str,
3715 bool in_phrase =
false;
3720 str_len = query_len * charset->casedn_multiply + 1;
3721 str_ptr =
static_cast<byte*
>(
ut_malloc(str_len));
3724 charset, const_cast<char*>(reinterpret_cast<const char*>(
3725 query_str)), query_len,
3726 reinterpret_cast<char*>(str_ptr), str_len);
3728 ut_ad(*result_len < str_len);
3730 str_ptr[*result_len] = 0;
3733 if (!boolean_mode) {
3741 while (cur_pos < *result_len) {
3747 charset, str_ptr + cur_pos, str_ptr + *result_len,
3756 for (byte* ptr = str_ptr + cur_pos; ptr < str.
f_str; ptr++) {
3757 if ((
char) (*ptr) ==
'"' ) {
3758 in_phrase = !in_phrase;
3763 if (cur_pos > 0 && str.
f_str - str_ptr - cur_pos == 1
3765 char* last_op =
reinterpret_cast<char*
>(
3768 if (*last_op ==
'-' || *last_op ==
'+') {
3789 const byte* query_str,
3801 ulint start_time_ms;
3802 bool will_be_ignored =
false;
3804 boolean_mode = flags & FTS_BOOL;
3807 memset(&query, 0x0,
sizeof(query));
3809 query_trx->
op_info =
"FTS query";
3813 query.
trx = query_trx;
3833 query.
error = DB_SUCCESS;
3844 #ifdef FTS_DOC_STATS_DEBUG
3845 if (ft_enable_diag_print) {
3846 error = fts_get_total_word_count(
3849 if (error != DB_SUCCESS) {
3853 fprintf(stderr,
"Total docs: " UINT64PF
" Total words: %lu\n",
3858 query.fts_common_table.
suffix =
"DELETED";
3862 NULL, &query.fts_common_table, query.
deleted);
3864 if (error != DB_SUCCESS) {
3868 query.fts_common_table.
suffix =
"DELETED_CACHE";
3871 NULL, &query.fts_common_table, query.
deleted);
3873 if (error != DB_SUCCESS) {
3888 lc_query_str_len = query_len * charset->casedn_multiply + 1;
3889 lc_query_str =
static_cast<byte*
>(
ut_malloc(lc_query_str_len));
3892 charset, (
char*) query_str, query_len,
3893 (
char*) lc_query_str, lc_query_str_len);
3895 ut_ad(result_len < lc_query_str_len);
3897 lc_query_str[result_len] = 0;
3901 lc_query_str = fts_query_str_preprocess(
3902 query_str, query_len, &result_len, charset, boolean_mode);
3913 if (fts_query_parse(&query, lc_query_str, result_len)) {
3917 fts_query_can_optimize(&query, flags);
3919 DBUG_EXECUTE_IF(
"fts_instrument_result_cache_limit",
3927 &query, &will_be_ignored);
3931 if (query.
error == DB_SUCCESS && (flags & FTS_EXPAND)) {
3932 query.
error = fts_expand_query(index, &query);
3936 if (query.
error == DB_SUCCESS) {
3937 fts_query_calculate_idf(&query);
3942 if (query.
error == DB_SUCCESS) {
3943 *result = fts_query_get_result(&query, *result);
3946 error = query.
error;
3951 memset(*result, 0,
sizeof(**result));
3957 ulint diff_time =
ut_time_ms() - start_time_ms;
3958 fprintf(stderr,
"FTS Search Processing time: %ld secs:"
3959 " %ld millisec: row(s) %d \n",
3960 diff_time / 1000, diff_time % 1000,
3961 (*result)->rankings_by_id
3962 ? (
int) rbt_size((*result)->rankings_by_id)
3967 "Full Search Memory: "
3968 "%lu (bytes), Row: %lu .",
3970 (*result)->rankings_by_id
3971 ? rbt_size((*result)->rankings_by_id)
3976 fts_query_free(&query);
4062 fprintf(stderr,
"doc_ids info, doc_id: %ld \n",
4063 (ulint) ranking->
doc_id);
4067 while (fts_ranking_words_get_next(query, ranking, &pos, &value)) {
4068 fprintf(stderr,
"doc_ids info, value: %s \n", value);
4080 static __attribute__((nonnull, warn_unused_result))
4094 if (!rbt_size(query->doc_ids)) {
4101 rw_lock_x_lock(&index->table->fts->cache->lock);
4103 rw_lock_x_unlock(&index->table->fts->cache->lock);
4113 query->total_size += SIZEOF_RBT_CREATE;
4115 fts_print_doc_id(query);
4120 node =
rbt_next(query->doc_ids, node)) {
4125 ulint prev_token_size;
4126 ulint estimate_size;
4128 prev_token_size = rbt_size(result_doc.
tokens);
4147 while (fts_ranking_words_get_next(query, ranking, &pos,
4159 fprintf(stderr,
" InnoDB: Error: Did not "
4160 "find word %s in doc %ld for query "
4161 "expansion search.\n", str.
f_str,
4162 (ulint) ranking->
doc_id);
4168 estimate_size = (rbt_size(result_doc.
tokens) - prev_token_size)
4171 query->total_size += estimate_size;
4186 fts_query_add_word_freq(query, mytoken->
text.
f_str);
4187 error = fts_query_union(query, &mytoken->
text);
4189 if (error != DB_SUCCESS) {
4207 fts_phrase_or_proximity_search(
4216 ibool matched = FALSE;
4217 ulint num_token = ib_vector_size(tokens);
4219 ibool end_list = FALSE;
4222 n_matched = ib_vector_size(query->
match_array[0]);
4227 for (i = 0; i < n_matched; i++) {
4231 ulint qualified_pos_buf[MAX_PROXIMITY_ITEM * 2];
4233 qualified_pos.
min_pos = &qualified_pos_buf[0];
4234 qualified_pos.
max_pos = &qualified_pos_buf[MAX_PROXIMITY_ITEM];
4242 for (j = 1; j < num_token; j++) {
4246 while (match[j]->doc_id < match[0]->doc_id
4254 if (match[j]->doc_id > match[0]->doc_id) {
4256 if (query->
flags & FTS_PHRASE) {
4265 if (match[j]->doc_id != match[0]->doc_id) {
4267 if (query->
flags & FTS_PHRASE) {
4272 for (s = i + 1; s < n_matched;
4274 match[0] =
static_cast<
4293 if (j != num_token) {
4301 if (query->
flags & FTS_PHRASE) {
4303 }
else if (fts_proximity_get_positions(
4304 match, num_token, ULINT_MAX, &qualified_pos)) {
4309 if (fts_query_is_in_proximity_range(
4310 query, match, &qualified_pos)) {
4312 query->
error = fts_query_process_doc_id(
4313 query, match[0]->doc_id, 0);
4314 if (query->
error != DB_SUCCESS) {
4320 for (ulint z = 0; z < num_token; z++) {
4324 fts_query_add_word_to_document(
4325 query, match[0]->doc_id,
4349 fts_proximity_get_positions(
4361 ulint idx[MAX_PROXIMITY_ITEM];
4362 ulint num_pos[MAX_PROXIMITY_ITEM];
4365 qualified_pos->
n_pos = 0;
4367 ut_a(num_match < MAX_PROXIMITY_ITEM);
4377 for (i = 0; i < num_match; i++) {
4383 num_pos[
i] = ib_vector_size(match[i]->positions);
4389 while (idx[min_idx] < num_pos[min_idx]) {
4390 ulint position[MAX_PROXIMITY_ITEM];
4391 ulint min_pos = ULINT_MAX;
4396 for (i = 0; i < num_match; i++) {
4397 position[
i] = *(ulint*) ib_vector_get_const(
4398 match[i]->positions, idx[i]);
4400 if (position[i] == ULINT_UNDEFINED) {
4404 if (position[i] < min_pos) {
4405 min_pos = position[
i];
4409 if (position[i] > max_pos) {
4410 max_pos = position[
i];
4416 if (max_pos - min_pos <= distance
4417 && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4422 qualified_pos->
min_pos[qualified_pos->
n_pos] = min_pos;
4423 qualified_pos->
max_pos[qualified_pos->
n_pos] = max_pos;
4424 qualified_pos->
n_pos++;
4432 ut_ad(qualified_pos->
n_pos <= MAX_PROXIMITY_ITEM);
4434 return(qualified_pos->
n_pos != 0);