
[英]Error running spark and cassandra using datastax spark-cassandra java connector
[英]Spark-Cassandra Maven project with java source making scala-lib calls
我对Spark和Cassandra都是新手,需要一些指导。 我正在设置一个使用Spark v1.3.1和Cassandra v2.0.14的Maven项目。 我正在尝试以下操作:
1)使用以下方法与Oracle DB建立连接以进行数据输入; 利用Spark 1.3.0的新DataFrames: http : //www.sparkexpert.com/2015/03/28/loading-database-data-into-spark-using-data-sources-api/
2)使用spark-cassandra-connector在后者之间建立连接; 在github上找到。
3)一旦我在DataFrame中拥有DB数据,我就应该能够转换为JavaRDD类型并推送到Cassandra键空间,如下所示: http ://www.datastax.com/dev/blog/accessing-cassandra-from-spark- 在Java的
4)简而言之:[Oracle DB] <--- [Spark] --- [spark-cassandra-connector] ---> [Cassandra]
我的问题是在Java代码中的Scala-lib调用过程中(上面的第1步)。 更具体地说,在加载函数调用期间:DataFrame jdbcDF = sqlContext.load(“ jdbc”,options);
运行时错误:java.lang.ClassNotFoundException:scala.collection.GenTraversableOnce $ class”
尽管在我的pom.xml文件中尝试了推荐的2.10.X Scala的多个不同版本,但仍出现上述错误。 根据我先前的研究,我认为这可能是Spark-Scala兼容性问题。 我还读到我需要在类路径中包含scala-lib.jar,但是我不确定如何使用Maven做到这一点。 有任何想法吗? 我在下面包含了pom.xml和Java代码:
的pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.dev</groupId> <artifactId>spark-cassandra</artifactId> <version>0.0.1-SPARK-CASSANDRA</version> <dependencies> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.11</artifactId> <version>1.3.1</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-sql_2.11</artifactId> <version>1.3.1</version> </dependency> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.35</version> </dependency> <dependency> <groupId>com.oracle</groupId> <artifactId>ojdbc6</artifactId> <version>11.2.0</version> </dependency> <dependency> <groupId>com.datastax.spark</groupId> <artifactId>spark-cassandra-connector_2.10</artifactId> <version>1.0.0-rc4</version> </dependency> <dependency> <groupId>com.datastax.spark</groupId> <artifactId>spark-cassandra-connector-java_2.10</artifactId> <version>1.0.0-rc4</version> </dependency> <dependency> <groupId>com.datastax.cassandra</groupId> <artifactId>cassandra-driver-core</artifactId> <version>2.1.5</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming_2.10</artifactId> <version>1.3.1</version> </dependency> <dependency> <groupId>com.dev.cassandra</groupId> <artifactId>spark-cassandra</artifactId> <version>1.0</version> </dependency> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> <version>2.10.3</version> </dependency> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-compiler</artifactId> <version>2.10.3</version> </dependency> <!-- <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-reflect</artifactId> <version>2.10.0-M1</version> </dependency> --> <!-- <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-swing</artifactId> <version>2.10.0-M1</version> </dependency> --> </dependencies> <build> <pluginManagement> <plugins> <plugin> <groupId>net.alchim31.maven</groupId> <artifactId>scala-maven-plugin</artifactId> <version>3.1.5</version> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>3.3</version> <configuration> <source>1.7</source> <target>1.7</target> <mainClass>com.dev.cassandra.Main</mainClass> <cleanupDaemonThreads>false</cleanupDaemonThreads> <compilerArgument>-Xlint:all</compilerArgument> <showWarnings>true</showWarnings> <showDeprecation>true</showDeprecation> </configuration> </plugin> </plugins> </pluginManagement> <plugins> <plugin> <groupId>net.alchim31.maven</groupId> <artifactId>scala-maven-plugin</artifactId> <executions> <execution> <id>scala-compile-first</id> <phase>process-resources</phase> <goals> <goal>add-source</goal> <goal>compile</goal> </goals> </execution> <execution> <id>scala-test-compile</id> <phase>process-test-resources</phase> <goals> <goal>testCompile</goal> </goals> </execution> </executions> </plugin> <!-- Plugin to create a single jar that includes all dependencies <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-assembly-plugin</artifactId> <version>2.4</version> <configuration> <descriptorRefs> <descriptorRef>jar-with-dependencies</descriptorRef> </descriptorRefs> <archive> <manifest> <mainClass>com.dev.cassandra.Main</mainClass> </manifest> </archive> </configuration> <executions> <execution> <id>make-assembly</id> <phase>package</phase> <goals> <goal>single</goal> </goals> </execution> </executions> </plugin> --> </plugins> </build> </project>
JAVA代码:
package com.dev.cassandra; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.sql.*; import org.apache.spark.*; import org.apache.spark.SparkConf; import org.apache.spark.api.java.*; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import oracle.jdbc.*; import com.datastax.spark.connector.cql.CassandraConnector; import static com.datastax.spark.connector.CassandraJavaUtil.*; public class Main implements Serializable { private static final org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(Main.class); private static final String JDBC_DRIVER = "oracle.jdbc.driver.OracleDriver"; private static final String JDBC_USERNAME = "XXXXXO01"; private static final String JDBC_PWD = "XXXXXO01"; private static final String JDBC_CONNECTION_URL = "jdbc:oracle:thin:" + JDBC_USERNAME + "/" + JDBC_PWD + "@CONNECTION VALUES"; private transient SparkConf conf; private Main(SparkConf conf) { this.conf = conf; } private void run() { JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(sc); generateData(sc); compute(sc); showResults(sc); sc.stop(); } private void generateData(JavaSparkContext sc) { SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); System.out.println("AFTER SQL CONTEXT"); //Data source options Map<String, String> options = new HashMap<>(); options.put("driver", JDBC_DRIVER); options.put("url", JDBC_CONNECTION_URL); options.put("dbtable","(SELECT * FROM XXX_SAMPLE_TABLE WHERE ROWNUM <=5)"); CassandraConnector connector = CassandraConnector.apply(sc.getConf()); try{ Class.forName(JDBC_DRIVER); System.out.println("BEFORE jdbcDF"); //Load JDBC query result as DataFrame DataFrame jdbcDF = sqlContext.load("jdbc", options); System.out.println("AFTER jdbcDF"); List<Row> tableRows = jdbcDF.collectAsList(); System.out.println("AFTER tableRows"); for (Row tableRow : tableRows) { System.out.println(); LOGGER.info(tableRow); System.out.println(); } }catch(Exception e){ //Handle errors for Class.forName e.printStackTrace(); } } private void compute(JavaSparkContext sc) { } private void showResults(JavaSparkContext sc) { } public static void main(String[] args) throws InterruptedException { if (args.length != 2) { System.err.println("Syntax: com.datastax.spark.dev.cassandra <Spark Master URL> <Cassandra contact point>"); System.exit(1); } //JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]")); SparkConf conf = new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"); //SparkConf conf = new SparkConf(); //conf.setAppName("SparkJdbcDs"); //conf.setMaster(args[0]); //conf.set("spark.cassandra.connection.host", args[1]); Main app = new Main(conf); app.run(); } }
提前致谢!
pom.xml正在请求某些Spark JAR的Scala 2.11版本
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-sql_2.11</artifactId>
以及另一个Spark JAR和Cassandra连接器JAR的Scala 2.10版本。
<artifactId>spark-streaming_2.10</artifactId>
<artifactId>spark-cassandra-connector_2.10</artifactId>
<artifactId>spark-cassandra-connector-java_2.10</artifactId>
(基于Scala Artifact ID的命名约定,该约定以您要为其构建的Scala版本结尾。)
它们需要(a)彼此和(b)与您实际使用的Scala版本保持一致。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.