簡體   English   中英

如何在Nutch中用Solr索引插件字段?

[英]How to index the plugin field in nutch with solr?

我集成了nutch / solr / hbase來構建搜索引擎,它工作得很好,除了schema.xml中的某些字段未索引到solr。 schema.xml如下所示:

<!-- Solr schema used by the Nutch indexer: declares the field types and the
     fields that the Nutch indexing plugins may populate. -->
<schema name="nutch" version="1.5">
    <types>
        <!-- Simple, non-analysed types -->
        <fieldType name="string" class="solr.StrField" sortMissingLast="true"
            omitNorms="true"/>
        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>
        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>
        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>

        <!-- Free text: whitespace tokens, stop words removed, word parts
             split and catenated, lower-cased, duplicates dropped -->
        <fieldType name="text" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.StopFilterFactory"
                    ignoreCase="true" words="stopwords.txt"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"
                    catenateWords="1" catenateNumbers="1" catenateAll="0"
                    splitOnCaseChange="1"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldType>

        <!-- URL-like text. The original snippet opened a second <analyzer>
             inside the first and never closed it, repeating the filter chain;
             a fieldType needs exactly one well-formed analyzer here. -->
        <fieldType name="url" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"/>
            </analyzer>
        </fieldType>
    </types>
    <fields>
        <!-- unique key of every document (see uniqueKey below) -->
        <field name="id" type="string" stored="true" indexed="true"/>

        <!-- core fields -->
        <field name="batchId" type="string" stored="true" indexed="false"/>
        <field name="digest" type="string" stored="true" indexed="false"/>
        <field name="boost" type="float" stored="true" indexed="false"/>

        <!-- fields for index-basic plugin -->
        <field name="host" type="url" stored="false" indexed="true"/>
        <field name="url" type="url" stored="true" indexed="true"
            required="true"/>
        <field name="content" type="text" stored="true" indexed="true"/>
        <field name="title" type="text" stored="true" indexed="true"/>
        <field name="cache" type="string" stored="true" indexed="false"/>
        <field name="tstamp" type="date" stored="true" indexed="true"/>

        <field name="_version_" type="long" indexed="true" stored="true"/>

        <!-- fields for index-anchor plugin -->
        <field name="anchor" type="string" stored="true" indexed="true"
            multiValued="true"/>

        <!-- fields for index-more plugin -->
        <field name="type" type="string" stored="true" indexed="true"
            multiValued="true"/>
        <field name="contentLength" type="long" stored="true"
            indexed="true"/>
        <field name="lastModified" type="date" stored="true"
            indexed="true"/>
        <field name="date" type="date" stored="true" indexed="true"/>

        <!-- fields for languageidentifier plugin -->
        <field name="lang" type="string" stored="true" indexed="true"/>

        <!-- fields for subcollection plugin -->
        <field name="subcollection" type="string" stored="true"
            indexed="true" multiValued="true"/>

        <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
        <field name="author" type="string" stored="true" indexed="true"/>
        <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
        <field name="feed" type="string" stored="true" indexed="true"/>
        <field name="publishedDate" type="date" stored="true"
            indexed="true"/>
        <field name="updatedDate" type="date" stored="true"
            indexed="true"/>

        <!-- fields for creativecommons plugin -->
        <field name="cc" type="string" stored="true" indexed="true"
            multiValued="true"/>

        <!-- fields for tld plugin -->
        <field name="tld" type="string" stored="false" indexed="false"/>
    </fields>
    <uniqueKey>id</uniqueKey>
    <!-- queries without an explicit field search "content"; terms are OR-ed -->
    <defaultSearchField>content</defaultSearchField>
    <solrQueryParser defaultOperator="OR"/>
</schema>

“core fields”和“fields for index-basic plugin”註釋下的字段已索引到solr,但其他字段,例如“fields for index-anchor plugin”和“fields for index-more plugin”註釋下的字段,則沒有被索引。

這是什麼問題?

也許您忘了激活nutch-default或nutch-site文件中的那些插件。

<!-- plugin.includes: pattern naming the Nutch plugins to load.
     index-(basic|anchor|more) enables the index-basic, index-anchor and
     index-more indexing plugins. The pattern must not contain spaces:
     "scoring- opic" would not match the plugin id "scoring-opic". -->
<property>
 <name>plugin.includes</name>
 <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
</property>

然后,您可能還要將它們添加到solrindex-mapping.xml文件中。

<!-- solrindex-mapping.xml fragment: each entry names a source field and the
     destination field it is written to. -->
<fields>
 <field source="content" dest="content"/>
 <field source="title" dest="title"/>
 <field source="host" dest="host"/>
 <field source="segment" dest="segment"/>
 <field source="boost" dest="boost"/>
 <field source="digest" dest="digest"/>
 <field source="tstamp" dest="tstamp"/>
 <!-- entries for the index-anchor and index-more fields -->
 <field source="anchor" dest="anchor"/>
 <field source="type" dest="type"/>
 <!-- the url value is used as the document id and also kept as url -->
 <field source="url" dest="id"/>
 <copyField source="url" dest="url"/>
</fields>
<uniqueKey>id</uniqueKey>

編譯Nutch並進行新的爬網,您應該能夠在solr中看到index-more和index-anchor字段。

在您的情況下,在nutch-site.xml或nutch-default.xml中進行此配置

<!-- plugin.includes as proposed by this answer: enables index-basic and
     index-anchor and adds indexer-solr.
     NOTE(review): index-more is not part of this pattern; presumably it has
     to read index-(basic|anchor|more) if the index-more fields are wanted
     too. Confirm against the Nutch version in use. -->
<property>
<name>plugin.includes</name>
<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
</property>

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM