
How to extract multiple columns of data from a DB and write them to a file using Spark?

The Spark code below saves the data vertically instead of horizontally. Can anyone help? How do I save a result set's output to a file in Spark? username, password, and dburl come from Spring framework config values.

current output:-
1,
2,
3

expected:-
1,2,3

package com.kali.db

/**
 * Created by kalit_000 on 05/12/2015.
 */

import java.util
import org.apache.spark.SparkConf
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark._
import java.sql.{ResultSet, DriverManager, Connection}
import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.springframework.context.support.ClassPathXmlApplicationContext
import scala.collection.mutable.ListBuffer

case class SqlMulti(driver: String, url: String, username: String, password: String, sql: String)

object SqlMultiExample {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setMaster("local[1]").setAppName("MultipleSqlColumns")
      .set("spark.hadoop.validateOutputSpecs", "false")
    val sc = new SparkContext(conf)

    //read the application context file
    val ctx = new ClassPathXmlApplicationContext("multiplecolumns.xml")
    val DBinfo = ctx.getBean("SqlTest").asInstanceOf[SqlMulti]

    /*assign class values to variables*/
    val driver = DBinfo.driver
    val url = DBinfo.url
    val username = DBinfo.username
    val password = DBinfo.password
    val query = DBinfo.sql
    var connection: Connection = null
    val sqlquery = DBinfo.sql

    println("DB Driver:-%s".format(driver))
    println("DB Url:-%s".format(url))
    println("Username:-%s".format(username))
    println("Password:-%s".format(password))
    println("Query:-%s".format(query))

    try {
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val statement = connection.createStatement()
      val resultSet = statement.executeQuery(query)
      resultSet.setFetchSize(10)

      val columnnumber = resultSet.getMetaData().getColumnCount.toInt

      /*OP COLUMN NAMES*/
      for (i <- 1 to columnnumber.toInt) {
        val columnname = resultSet.getMetaData().getColumnName(i)
        println("Column Names are:- %s".format(columnname))
      }

      /*OP DATA*/
      while (resultSet.next()) {
        var list = new java.util.ArrayList[String]()
        for (i <- 1 to columnnumber.toInt) {
          list.add(resultSet.getString(i))
          //println(list)
          sc.parallelize(list.toString.replace("null", "N/A"))
            .saveAsTextFile("C:\\Users\\kalit_000\\Desktop\\typesafe\\scaladbop\\op.txt")
        }
      }
    } catch {
      case e: Exception => e.printStackTrace
    }
    connection.close()

    sc.stop()
  }
}

I rewrote my code to use read.jdbc, which solved all my requirements:

package com.kali.db

/**
 * Created by kalit_000 on 06/12/2015.
 */

import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark._
import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.sql.DataFrame
import org.springframework.context.support.ClassPathXmlApplicationContext

case class SparkSqlValueClassMPP(driver: String, url: String, username: String, password: String,
                                 table: String, opdelimeter: String, lowerbound: String, upperbound: String,
                                 numberofparitions: String, parallelizecolumn: String)

object SparkDBExtractorMPP {

  def main(args: Array[String]) {

    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setMaster("local[*]").setAppName("SparkDBExtractorMPP")
      .set("spark.hadoop.validateOutputSpecs", "false")
    val sc = new SparkContext(conf)

    def opfile(value: DataFrame, delimeter: String): RDD[String] = {
      value.map(x => x.toString.replace("[", "").replace("]", "").replace(",", delimeter))
    }

    //read the application context file
    val ctx = new ClassPathXmlApplicationContext("sparkDBExtractorMpp.xml")
    val DBinfo = ctx.getBean("SparkSQLDBExtractorMPP").asInstanceOf[SparkSqlValueClassMPP]

    val driver = DBinfo.driver
    val url = DBinfo.url
    val username = DBinfo.username
    val password = DBinfo.password
    val table = DBinfo.table
    val opdelimeter = DBinfo.opdelimeter
    val lowerbound = DBinfo.lowerbound.toInt
    val upperbound = DBinfo.upperbound.toInt
    val numberofpartitions = DBinfo.numberofparitions.toInt
    val parallelizecolumn = DBinfo.parallelizecolumn

    println("DB Driver:-%s".format(driver))
    println("DB Url:-%s".format(url))
    println("Username:-%s".format(username))
    println("Password:-%s".format(password))
    println("Table:-%s".format(table))
    println("Opdelimeter:-%s".format(opdelimeter))
    println("Lowerbound:-%s".format(lowerbound))
    println("Upperbound:-%s".format(upperbound))
    println("Numberofpartitions:-%s".format(numberofpartitions))
    println("Parallelizecolumn:-%s".format(parallelizecolumn))

    try {
      val props = new Properties()
      props.put("user", username)
      props.put("password", password)
      props.put("driver", driver)

      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      val df = sqlContext.read.jdbc(url, table, parallelizecolumn, lowerbound, upperbound, numberofpartitions, props)

      df.show(10)

      opfile(df, opdelimeter).saveAsTextFile("C:\\Users\\kalit_000\\Desktop\\typesafe\\scaladbop\\op.txt")

    } catch {
      case e: Exception => e.printStackTrace
    }
    sc.stop()
  }
}
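To show what opfile produces (a minimal illustration, not from the original post): Spark's default Row.toString renders a row as "[col1,col2,...]", so stripping the brackets and swapping the comma for the configured delimiter yields one horizontal record per DB row:

    // hypothetical illustration of the opfile transformation with the "~" delimiter
    val rendered = "[1,2,3]"   // what Row.toString produces for a sample row
    val line = rendered.replace("[", "").replace("]", "").replace(",", "~")
    // line == "1~2~3" -- one delimited record per line in the output file

Note that this string-level replace will mangle any column value that itself contains a comma or square bracket, so it is only safe for simple numeric or identifier columns like the ones selected here.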

To make this code highly configurable, I am using the Java Spring framework:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN"
    "http://www.springframework.org/dtd/spring-beans.dtd">
<beans>
    <bean id="queryProps" class="org.springframework.beans.factory.config.PropertiesFactoryBean">
    </bean>

    <bean id="SparkSQLDBExtractorMPP" class="com.kali.db.SparkSqlValueClassMPP">
        <constructor-arg value="com.microsoft.sqlserver.jdbc.SQLServerDriver" />
        <constructor-arg value="jdbc:sqlserver://localhost;user=admin;password=oracle;database=AdventureWorks2014" />
        <constructor-arg value="admin" />
        <constructor-arg value="oracle" />
        <constructor-arg value="(select top 100 CustomerID,StoreID,TerritoryID,AccountNumber,ModifiedDate from customer ) as customer" />
        <constructor-arg value="~" />
        <constructor-arg value="1" />
        <constructor-arg value="100" />
        <constructor-arg value="8" />
        <constructor-arg value="CustomerID" />
    </bean>
</beans>
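Since the constructor-arg entries are positional, their order must line up with the SparkSqlValueClassMPP parameters. As a sanity check, the bean above is equivalent to constructing (hypothetically, in plain Scala, using the case class from the post):

    // hypothetical equivalent of the Spring bean definition above
    val dbInfo = SparkSqlValueClassMPP(
      driver            = "com.microsoft.sqlserver.jdbc.SQLServerDriver",
      url               = "jdbc:sqlserver://localhost;user=admin;password=oracle;database=AdventureWorks2014",
      username          = "admin",
      password          = "oracle",
      table             = "(select top 100 CustomerID,StoreID,TerritoryID,AccountNumber,ModifiedDate from customer ) as customer",
      opdelimeter       = "~",
      lowerbound        = "1",
      upperbound        = "100",
      numberofparitions = "8",
      parallelizecolumn = "CustomerID"
    )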

The project is on GitHub now:

https://github.com/kali786516/ScalaDB

The output data is saved vertically because list.add(resultSet.getString(i)) just inserts each column value into the list one at a time. On top of that, sc.parallelize(...).saveAsTextFile(...) runs inside the column loop, so the same output path is rewritten on every iteration, and because parallelize is handed a String, Spark treats it as a collection of characters and writes each character on its own line.

If you expect to output one DB row per line, you should build a list whose elements are string representations of whole DB rows, not of individual columns, and each row needs to be converted to a string with the proper format; see the sketch below.
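For example, a minimal sketch of that fix against the question's code (assuming the same resultSet, columnnumber, sc, and output path, and keeping the original null-to-"N/A" substitution):

    import scala.collection.mutable.ListBuffer

    val rows = new ListBuffer[String]()
    while (resultSet.next()) {
      // build ONE string per DB row: join all column values with commas
      val row = (1 to columnnumber)
        .map(i => Option(resultSet.getString(i)).getOrElse("N/A"))
        .mkString(",")
      rows += row
    }
    // parallelize the row strings once, outside the loop, so each row becomes
    // one horizontal line in the output instead of one value (or character) per line
    sc.parallelize(rows).saveAsTextFile("C:\\Users\\kalit_000\\Desktop\\typesafe\\scaladbop\\op.txt")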
