Sphinx简介
Sphinx全文检索方案架构图
Sphinx工作流程图
Sphinx工作原理
Sphinx中文分词
GBK编码的数据源支持
采用Chih-Hao Tsai MMSEG算法的中文分词器
Sphinx安装步骤
1. 下载源码
https://github.com/zwxhenu/coreseek
git clone https://github.com/zwxhenu/coreseek
2. 安装mmseg3中文分词
yum -y install gcc gcc-c++ autoconf python python-devel libiconv libtool
cd mmseg-3.2.14
./configure --prefix=/usr/local/mmseg3
make
make install
yum -y install libtool
aclocal
libtoolize --force
automake --add-missing
autoconf
autoheader
make clean
./configure --prefix=/usr/local/mmseg3
make
make install
删除Makefile.am中的data/uni.lib
automake
./configure --prefix=/usr/local/mmseg3
make
make install
cd /usr/local/mmseg3/etc/
/usr/local/mmseg3/bin/mmseg -u unigram.txt
cp unigram.txt.uni uni.lib
3. 安装coreseek
yum -y install expat expat-devel
sh buildconf.sh
&& automake --add-missing \
AM_INIT_AUTOMAKE([-Wall foreign])
m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
将 src/sphinxexpr.cpp 中的 T val = ExprEval ( this->m_pArg, tMatch ); 修改为 T val = this->ExprEval ( this->m_pArg, tMatch );
yum install mysql-community-embedded-devel.x86_64 mysql-community-devel.x86_64 mysql++-devel.x86_64
将LIBS = -lm -lz -lexpat -L/usr/local/lib -lpthread修改成:
LIBS = -lm -lz -lexpat -liconv -L/usr/local/lib -lpthread
make;
make install
4. 命令行测试mmseg分词,coreseek搜索
cd testpack
cat var/test/test.xml #此时应该正确显示中文
/usr/local/mmseg3/bin/mmseg -d /usr/local/mmseg3/etc var/test/test.xml
/usr/local/coreseek/bin/indexer -c etc/csft.conf --all
/usr/local/coreseek/bin/search -c etc/csft.conf 网络搜索
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2011,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)
using config file 'etc/csft.conf'...
index 'xml': query '网络搜索 ': returned 1 matches of 1 total in 0.003 sec
displaying matches:
1. document=1, weight=1, published=Thu Apr 1 22:20:07 2010, author_id=1
words:
1. '网络': 1 documents, 1 hits
2. '搜索': 2 documents, 5 hits
5. 配置sphinx与mysql
source threads
{
type = mysql
sql_host = localhost
sql_user = root
sql_pass = mysql57@fangstar
sql_db = discuz
sql_port = 3306 # optional, default is 3306
sql_sock = /var/lib/mysql/mysql.sock
sql_query_pre = SET NAMES utf8
#sql_query_pre = SET SESSION query_cache_type=OFF
sql_query_pre = CREATE TABLE IF NOT EXISTS pre_common_sphinxcounter ( indexid INTEGER PRIMARY KEY NOT NULL,maxid INTEGER NOT NULL)
sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 1, MAX(tid)-10 FROM pre_forum_thread
sql_query = SELECT t.tid AS id,t.tid,t.subject,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
FROM pre_forum_thread AS t \
WHERE t.tid>=$start AND t.tid<=$end
sql_query_range = SELECT (SELECT MIN(tid) FROM pre_forum_thread),maxid FROM pre_common_sphinxcounter WHERE indexid=1
sql_range_step = 4096
sql_attr_uint = tid
sql_attr_uint = digest
sql_attr_uint = displayorder
sql_attr_uint = authorid
sql_attr_uint = special
sql_attr_timestamp = lastpost
sql_query_info = SELECT * FROM pre_forum_thread WHERE tid=$id
}
index threads
{
source = threads
path = /usr/local/coreseek/var/data/threads
docinfo = extern
mlock = 0
morphology = none
min_word_len = 1
charset_type = zh_cn.utf-8
charset_dictpath = /usr/local/mmseg3/etc/
min_prefix_len = 0
min_infix_len = 1
ngram_len = 0
html_strip = 0
}
source threads_minute: threads
{
sql_query_pre =
sql_query_pre = SET NAMES utf8
sql_query_pre = SET SESSION query_cache_type=OFF
sql_query_range = SELECT maxid+1,(SELECT MAX(tid) FROM pre_forum_thread) FROM pre_common_sphinxcounter WHERE indexid=1
}
#threads_minute
index threads_minute : threads
{
source = threads_minute
path = /usr/local/coreseek/var/data/threads_minute #windows下最好用全路径
}
#posts
source posts
{
type = mysql
sql_host = localhost
sql_user = root
sql_pass = mysql57@fangstar
sql_db = discuz
sql_port = 3306
sql_query_pre = SET NAMES utf8
# sql_query_pre = SET SESSION query_cache_type=OFF
sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 2, MAX(pid)-2 FROM pre_forum_post
sql_query = SELECT p.pid AS id,p.tid,p.subject,p.message,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
FROM pre_forum_post AS p LEFT JOIN pre_forum_thread AS t USING(tid) \
WHERE p.pid>=$start AND p.pid<=$end
sql_query_range = SELECT (SELECT MIN(pid) FROM pre_forum_post),maxid FROM pre_common_sphinxcounter WHERE indexid=2
sql_range_step = 4096
sql_attr_uint = tid
sql_attr_uint = digest
sql_attr_uint = displayorder
sql_attr_uint = authorid
sql_attr_uint = special
sql_attr_timestamp =lastpost
sql_query_info = SELECT * FROM pre_forum_post WHERE pid=$id
}
#posts
index posts
{
source = posts
path = /usr/local/coreseek/var/data/posts #windows下最好用全路径
docinfo = extern
mlock = 0
morphology = none
min_word_len = 1
html_strip = 0
charset_dictpath = /usr/local/mmseg3/etc/ #BSD、Linux环境下设置,/符号结尾
charset_type = zh_cn.utf-8
#charset_debug = 0
ngram_len = 0
}
#posts_minute
source posts_minute : posts
{
sql_query_pre =
sql_query_pre = SET NAMES utf8
# sql_query_pre = SET SESSION query_cache_type=OFF
sql_query_range = SELECT maxid+1,(SELECT MAX(pid) FROM pre_forum_post) FROM pre_common_sphinxcounter WHERE indexid=2
}
#posts_minute
index posts_minute : posts
{
source = posts_minute
path = /usr/local/coreseek/var/data/posts_minute #windows下最好用全路径
}
indexer
{
mem_limit = 256M
}
searchd
{
listen = 9312
listen = /tmp/sphinx.sock
log = /var/log/sphinx/searchd.log
query_log = /var/log/sphinx/query.log
read_timeout = 5
client_timeout = 300
max_children = 30
pid_file = /var/run/searchd.pid
max_matches = 1000
seamless_rotate = 1
preopen_indexes = 0
unlink_old = 1
mva_updates_pool = 1M
max_packet_size = 8M
max_filters = 256
max_filter_values = 4096
}
6. 启动服务,创建索引
/usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/sphinx.conf --all
/usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/sphinx.conf
/usr/local/coreseek/bin/search -c /usr/local/coreseek/etc/sphinx.conf aaa
crontab -e
0 */4 * * * /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/sphinx.conf --all --rotate
/usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/sphinx.conf --stop
discuz 后台配置
参考文献
discuz论坛配置开启Sphinx全文搜索
coreseek sphinx mmseg mysql 全文检索 安装 配置
Coreseek/Sphinx安装测试配置指南
Discuzx3 使用sphinx实现全文搜索功能
千万级Discuz!数据全文检索方案(Sphinx)
centos安装coreseek




