问题
I'm going nuts trying to figure out how to get the Data Import Handler's splitBy
construct to work. I was expecting it to split the input column into a multivalued field. Here's a test case to reproduce the problem:
import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import static org.junit.Assert.*;
import javax.sql.DataSource;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.core.CoreContainer;
import org.hsqldb.jdbc.JDBCDataSource;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
 * Verifies that the Data Import Handler's {@code splitBy} attribute (RegexTransformer)
 * turns a comma-separated database column into a multivalued Solr field.
 *
 * <p>The test writes a throw-away Solr home into the temp directory, populates an
 * in-memory HSQLDB table with a single row, runs a full import through an embedded
 * Solr server and checks that the {@code type} field ends up with three values.
 */
public class TestSplitBy {
    /** Embedded Solr instance; created in {@link #setup()} and closed in {@link #tearDown()}. */
    SolrServer server;

    /** Unique, per-instance Solr home directory below the system temp directory. */
    File configPath = new File(FileUtils.getTempDirectory(), Long.toString(System.nanoTime()));

    /** Minimal solrconfig.xml: a default search handler plus the /dataimport handler. */
    String solrconfig_xml = "<config>" +
        "<luceneMatchVersion>LUCENE_41</luceneMatchVersion>" +
        "<requestHandler name=\"search\" class=\"solr.SearchHandler\" default=\"true\">" +
        "<lst name=\"defaults\"><str name=\"fl\">*</str><str name=\"df\">id</str></lst>" +
        "</requestHandler>" +
        "<requestHandler name=\"/dataimport\" class=\"org.apache.solr.handler.dataimport.DataImportHandler\">" +
        "<lst name=\"defaults\"><str name=\"config\">data-config.xml</str></lst>" +
        "</requestHandler>" +
        "</config>";

    /**
     * DIH configuration.
     *
     * <p>When {@code splitBy} is used, {@code column} names the target Solr field and
     * {@code sourceColName} names the database column. HSQL reports unquoted column
     * names in upper case, so the source column is referenced as {@code TYPE_CSV}.
     * The column is aliased to a name that differs from the Solr field because an
     * identically named column was reported to add the whole unsplit value as an
     * extra token.
     */
    String data_config_xml = "<dataConfig>" +
        "<dataSource url=\"jdbc:hsqldb:mem:testdb\" user=\"SA\" driver=\"org.hsqldb.jdbc.JDBCDriver\" />" +
        "<document>" +
        "<entity name=\"item\" transformer=\"RegexTransformer\" query=\"SELECT id, type AS type_csv FROM test\">" +
        "<field column=\"ID\" name=\"id\" />" +
        "<field column=\"type\" splitBy=\",\" sourceColName=\"TYPE_CSV\" />" +
        "</entity>" +
        "</document>" +
        "</dataConfig>";

    /** Minimal schema: a string {@code id} key and a multivalued string {@code type} field. */
    String schema_xml = "<schema version=\"1.3\" name=\"test\">" +
        "<types>" +
        "<fieldType name=\"string\" class=\"solr.StrField\" sortMissingLast=\"true\" omitNorms=\"true\" />" +
        "</types>" +
        "<fields>" +
        "<field stored=\"true\" name=\"id\" type=\"string\" />" +
        "<field stored=\"true\" name=\"type\" type=\"string\" multiValued=\"true\"/>" +
        "</fields>" +
        "<uniqueKey>id</uniqueKey>" +
        "</schema>";

    /**
     * Builds a data source for the in-memory HSQLDB database used by the test.
     *
     * @return a data source for the {@code testdb} in-memory database, user {@code SA}
     */
    DataSource getDataSource() {
        JDBCDataSource ds = new JDBCDataSource();
        ds.setUser("SA");
        ds.setUrl("mem:testdb");
        return ds;
    }

    /**
     * (Re)creates the {@code test} table and inserts the single fixture row.
     *
     * @param ds data source of the database to populate
     * @throws IllegalStateException if any SQL statement fails; a broken fixture must
     *         abort the test instead of letting it run against an empty database
     */
    void populateDb(DataSource ds) {
        QueryRunner runner = new QueryRunner(ds);
        try {
            runner.update("DROP TABLE test IF EXISTS");
            runner.update("CREATE TABLE test(id INTEGER, type VARCHAR(256));");
            runner.update("INSERT INTO test VALUES 1, 'foo,bar,baz'");
        } catch (SQLException e) {
            throw new IllegalStateException("Failed to populate the in-memory test database", e);
        }
    }

    /**
     * Writes the three configuration files into {@code <configPath>/collection1/conf}.
     *
     * @throws IOException if the directory cannot be created or a file cannot be written
     */
    void writeSolrConfig() throws IOException {
        File confPath = new File(new File(configPath, "collection1"), "conf");
        // forceMkdir creates missing parents and fails loudly, unlike File.mkdir().
        FileUtils.forceMkdir(confPath);
        FileUtils.write(new File(confPath, "data-config.xml"), data_config_xml);
        FileUtils.write(new File(confPath, "schema.xml"), schema_xml);
        FileUtils.write(new File(confPath, "solrconfig.xml"), solrconfig_xml);
    }

    /** Points {@code solr.solr.home} at the generated config and starts the embedded server. */
    void startSolr() {
        System.setProperty("solr.solr.home", configPath.getAbsolutePath());
        CoreContainer.Initializer initializer = new CoreContainer.Initializer();
        CoreContainer coreContainer = initializer.initialize();
        server = new EmbeddedSolrServer(coreContainer, "collection1");
    }

    /**
     * Prepares the database, the Solr configuration and the embedded server.
     *
     * @throws IOException if the Solr configuration cannot be written
     */
    @Before
    public void setup() throws IOException {
        populateDb(getDataSource());
        writeSolrConfig();
        startSolr();
    }

    /** Shuts the embedded server down (if it was started) and removes the temporary Solr home. */
    @After
    public void tearDown() {
        // setup() may have failed before the server was created.
        if (server != null) {
            server.shutdown();
        }
        FileUtils.deleteQuietly(configPath);
    }

    /**
     * Runs a full import and expects the single row's {@code type} column to be
     * indexed as three separate values.
     *
     * @throws Exception if the import or the verification query fails
     */
    @Test
    public void testSplitBy() throws Exception {
        SolrQuery importRequest = new SolrQuery();
        importRequest.set("qt", "/dataimport");
        importRequest.setParam("command", "full-import");
        // Block until the import has finished instead of sleeping and hoping.
        importRequest.setParam("synchronous", "true");
        server.query(importRequest);

        QueryResponse response = server.query(new SolrQuery("*:*"));
        // Without this check an empty index would make the loop below pass vacuously.
        assertEquals(1, response.getResults().size());
        for (SolrDocument doc : response.getResults()) {
            assertNotNull(doc.getFieldValues("type"));
            assertEquals(3, doc.getFieldValues("type").size());
        }
    }
}
And the POM for the test case:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>org.example</groupId>
  <artifactId>solr</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Solr Sanity</name>

  <build>
    <plugins>
      <!-- Compile for Java 6 source and bytecode level -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <!-- Solr 4.1.0: web application, Data Import Handler and SolrJ client -->
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr</artifactId>
      <version>4.1.0</version>
      <type>war</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-dataimporthandler</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>

    <!-- JDBC helper used to populate the test database -->
    <dependency>
      <groupId>commons-dbutils</groupId>
      <artifactId>commons-dbutils</artifactId>
      <version>1.5</version>
      <type>jar</type>
    </dependency>

    <!-- HSQLDB database engine and JDBC driver -->
    <dependency>
      <groupId>org.hsqldb</groupId>
      <artifactId>hsqldb</artifactId>
      <version>2.2.9</version>
      <type>jar</type>
      <scope>runtime</scope>
    </dependency>

    <!-- Test framework -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>

    <!-- Servlet API -->
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>servlet-api</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>
Any insight on how to get those types to split correctly into multiple values?
回答1:
Turns out there were a couple of issues with this unit test:
1. HSQL's column names are case-sensitive (and default to upper case).
2. If the Solr field name and the DB column name are identical, an extra token containing the entire DB value is also added.
The field definition should look like:
<field column="solrField" splitBy="," sourceColName="TYPE" />
And in general, when using the RegexTransformer to mix single-valued fields from a DB with multivalued fields:
- If using splitBy, then the column attribute is the name of the Solr field and the sourceColName attribute is the database column.
- If not using splitBy, then the column attribute is the database column name and the name attribute is the Solr field.
来源:https://stackoverflow.com/questions/14512055/splitting-database-column-into-multivalued-solr-field