
Extract multiple columns of data from a DB and write them to a file using Spark?

In the Spark code below, the data is saved vertically rather than horizontally; can anyone help? How do I save the ResultSet output to a file in Spark? The username, password, and DB URL come from Spring framework configuration values.

Actual output (vertical, one value per line):

1
2
3

Expected output (one row per line): 1,2,3

package com.kali.db

/** * Created by kalit_000 on 05/12/2015. */

import java.sql.{Connection, DriverManager, ResultSet}
import org.apache.log4j.{Level, Logger}
import org.apache.spark._
import org.springframework.context.support.ClassPathXmlApplicationContext

case class SqlMulti(driver: String, url: String, username: String, password: String, sql: String)

object SqlMultiExample {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setMaster("local[1]").setAppName("MultipleSqlColumns")
      .set("spark.hadoop.validateOutputSpecs", "false")
    val sc = new SparkContext(conf)

    //read the application context file
    val ctx = new ClassPathXmlApplicationContext("multiplecolumns.xml")
    val DBinfo = ctx.getBean("SqlTest").asInstanceOf[SqlMulti]

    /*assign class values to variables*/
    val driver = DBinfo.driver
    val url = DBinfo.url
    val username = DBinfo.username
    val password = DBinfo.password
    val query = DBinfo.sql
    var connection: Connection = null

    println("DB Driver:-%s".format(driver))
    println("DB Url:-%s".format(url))
    println("Username:-%s".format(username))
    println("Password:-%s".format(password))
    println("Query:-%s".format(query))

    try {
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val statement = connection.createStatement()
      val resultSet = statement.executeQuery(query)
      resultSet.setFetchSize(10)
      val columnnumber = resultSet.getMetaData().getColumnCount

      /*OP COLUMN NAMES*/
      for (i <- 1 to columnnumber) {
        val columnname = resultSet.getMetaData().getColumnName(i)
        println("Column Names are:- %s".format(columnname))
      }

      /*OP DATA*/
      while (resultSet.next()) {
        var list = new java.util.ArrayList[String]()
        for (i <- 1 to columnnumber) {
          list.add(resultSet.getString(i))
          //println(list)
          sc.parallelize(list.toString.replace("null", "N/A"))
            .saveAsTextFile("C:\\Users\\kalit_000\\Desktop\\typesafe\\scaladbop\\op.txt")
        }
      }
    } catch {
      case e: Exception => e.printStackTrace
    }
    connection.close()
    sc.stop()
  }
}

I rewrote the code to use read.jdbc, which meets all my requirements:

package com.kali.db

/** * Created by kalit_000 on 06/12/2015. */

import java.util.Properties
import org.apache.log4j.{Level, Logger}
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.springframework.context.support.ClassPathXmlApplicationContext

case class SparkSqlValueClassMPP(driver: String, url: String, username: String, password: String,
                                 table: String, opdelimeter: String, lowerbound: String, upperbound: String,
                                 numberofparitions: String, parallelizecolumn: String)

object SparkDBExtractorMPP {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setMaster("local[*]").setAppName("SparkDBExtractorMPP")
      .set("spark.hadoop.validateOutputSpecs", "false")
    val sc = new SparkContext(conf)

    // strip the Row brackets and replace the comma separator with the configured output delimiter
    def opfile(value: DataFrame, delimeter: String): RDD[String] = {
      value.map(x => x.toString.replace("[", "").replace("]", "").replace(",", delimeter))
    }

    //read the application context file
    val ctx = new ClassPathXmlApplicationContext("sparkDBExtractorMpp.xml")
    val DBinfo = ctx.getBean("SparkSQLDBExtractorMPP").asInstanceOf[SparkSqlValueClassMPP]

    val driver = DBinfo.driver
    val url = DBinfo.url
    val username = DBinfo.username
    val password = DBinfo.password
    val table = DBinfo.table
    val opdelimeter = DBinfo.opdelimeter
    val lowerbound = DBinfo.lowerbound.toInt
    val upperbound = DBinfo.upperbound.toInt
    val numberofpartitions = DBinfo.numberofparitions.toInt
    val parallelizecolumn = DBinfo.parallelizecolumn

    println("DB Driver:-%s".format(driver))
    println("DB Url:-%s".format(url))
    println("Username:-%s".format(username))
    println("Password:-%s".format(password))
    println("Table:-%s".format(table))
    println("Opdelimeter:-%s".format(opdelimeter))
    println("Lowerbound:-%s".format(lowerbound))
    println("Upperbound:-%s".format(upperbound))
    println("Numberofpartitions:-%s".format(numberofpartitions))
    println("Parallelizecolumn:-%s".format(parallelizecolumn))

    try {
      val props = new Properties()
      props.put("user", username)
      props.put("password", password)
      props.put("driver", driver)

      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      // partitioned JDBC read: splits the table on parallelizecolumn between lowerbound and upperbound
      val df = sqlContext.read.jdbc(url, table, parallelizecolumn, lowerbound, upperbound, numberofpartitions, props)
      df.show(10)

      opfile(df, opdelimeter).saveAsTextFile("C:\\Users\\kalit_000\\Desktop\\typesafe\\scaladbop\\op.txt")
    } catch {
      case e: Exception => e.printStackTrace
    }
    sc.stop()
  }
}
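As a side note on the partitioned read: with parallelizecolumn = CustomerID, lowerbound = 1, upperbound = 100, and 8 partitions, Spark splits the column range into per-partition predicates. The following is only an illustration of the stride arithmetic, not Spark's internal code:

// Illustration of how the bounds are split across partitions (approximate;
// Spark's JDBC relation computes the stride internally).
val lower = 1L
val upper = 100L
val numPartitions = 8
val stride = (upper - lower) / numPartitions // 12 here
val bounds = (0 until numPartitions).map(i => lower + i * stride)
// Each of the 8 tasks then reads the table with a predicate such as
// "CustomerID >= 13 AND CustomerID < 25" (the first and last partitions
// are unbounded below/above), so the ranges are scanned in parallel.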

To make this code highly configurable, I am using the Java Spring framework:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN"
    "http://www.springframework.org/dtd/spring-beans.dtd">
<beans>
    <bean id="queryProps" class="org.springframework.beans.factory.config.PropertiesFactoryBean">
    </bean>
    <bean id="SparkSQLDBExtractorMPP" class="com.kali.db.SparkSqlValueClassMPP">
        <constructor-arg value="com.microsoft.sqlserver.jdbc.SQLServerDriver" />
        <constructor-arg value="jdbc:sqlserver://localhost;user=admin;password=oracle;database=AdventureWorks2014" />
        <constructor-arg value="admin" />
        <constructor-arg value="oracle" />
        <constructor-arg value="(select top 100 CustomerID,StoreID,TerritoryID,AccountNumber,ModifiedDate from customer ) as customer" />
        <constructor-arg value="~" />
        <constructor-arg value="1" />
        <constructor-arg value="100" />
        <constructor-arg value="8" />
        <constructor-arg value="CustomerID" />
    </bean>
</beans>
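Note that the constructor-arg entries are positional, so they must appear in the same order as the SparkSqlValueClassMPP case-class parameters: driver, url, username, password, table, opdelimeter, lowerbound, upperbound, numberofparitions, parallelizecolumn.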

The project is now on GitHub:

https://github.com/kali786516/ScalaDB

Your output data is saved vertically because of "list.add(resultSet.getString(i))", which simply appends every column to the list one after another.

If you want each database row to be output on a single line, you should build a list in which each element is the string representation of a database row rather than of a single column, and each database row needs to be converted into a string with the proper formatting, as in the sketch below.
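A minimal sketch of that idea, reusing the resultSet, columnnumber, and sc values from the question's code (the rows buffer and the "N/A" null placeholder are illustrative choices, not from the original answer):

// Build one delimited string per database row, then save once.
val rows = scala.collection.mutable.ListBuffer[String]()
while (resultSet.next()) {
  // Join the current row's columns into a single comma-delimited string,
  // substituting "N/A" for SQL NULLs.
  val row = (1 to columnnumber)
    .map(i => Option(resultSet.getString(i)).getOrElse("N/A"))
    .mkString(",")
  rows += row
}
// Parallelize once, outside the loop, so each line of the output file
// is one database row rather than one column.
sc.parallelize(rows).saveAsTextFile("C:\\Users\\kalit_000\\Desktop\\typesafe\\scaladbop\\op.txt")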
