package sparkscala2.test
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions.regexp_extract
object example1 {
  // Schema for one parsed log line. Name and field order are kept exactly as
  // before so any existing code referencing `click` keeps working.
  case class click(rowtime: String, key: String, ip: String, userid: String,
                   remote_user: String, time: String, _time: String, request: String,
                   status: String, bytes: String, referrer: String, agent: String)

  // Matches one whole input line:  rowtime: <ts> Z, key: <ip>, value: {<json>}
  // The timestamp itself contains ':' and the agent string contains ',', so we
  // anchor on the literal labels instead of splitting on ':' / ',' — that naive
  // splitting is what truncated the rowtime and agent columns.
  private val lineRegex =
    """rowtime:\s*(.+?\sZ),\s*key:\s*([^,]+),\s*value:\s*\{(.*)\}\s*""".r

  // Extracts a single field from the JSON payload. Tries the quoted form first
  // (string values, which may contain commas/colons), then the bare form
  // (numeric values such as userid/_time). Returns "" if the field is absent.
  private def jsonField(json: String, name: String): String = {
    val quoted = ("\"" + name + "\":\"(.*?)\"").r
    val bare = ("\"" + name + "\":([^,\"]+)").r
    quoted.findFirstMatchIn(json).map(_.group(1))
      .orElse(bare.findFirstMatchIn(json).map(_.group(1)))
      .getOrElse("")
  }

  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "C:\\hadoop\\")
    val spark = SparkSession.builder().appName("test").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("Error")
    import spark.implicits._

    val rdd = spark.sparkContext.textFile("file:///C://Users//User//Desktop//test1.txt")

    // Parse each line with the anchored regex instead of split(",")/split(":"),
    // so values containing ':' (rowtime) or ',' (agent) stay intact.
    val schema_rdd = rdd.flatMap {
      case lineRegex(rowtime, key, json) =>
        Some(click(rowtime, key,
          jsonField(json, "ip"), jsonField(json, "userid"), jsonField(json, "remote_user"),
          jsonField(json, "time"), jsonField(json, "_time"), jsonField(json, "request"),
          jsonField(json, "status"), jsonField(json, "bytes"), jsonField(json, "referrer"),
          jsonField(json, "agent")))
      case _ => None // skip malformed lines instead of throwing ArrayIndexOutOfBounds
    }

    val final_df = schema_rdd.toDF()
    final_df.show(false)
  }
}
input file : test1.txt
rowtime: 2020/06/11 10:38:42.449 Z, key: 222.90.225.227, value: {"ip":"222.90.225.227","userid":12,"remote_user":"-","time":"1","_time":1,"request":"GET /images/logo-small.png HTTP/1.1","status":"302","bytes":"1289","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
rowtime: 2020/06/11 10:38:42.528 Z, key: 111.245.174.248, value: {"ip":"111.245.174.248","userid":30,"remote_user":"-","time":"11","_time":11,"request":"GET /site/login.html HTTP/1.1","status":"302","bytes":"14096","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
rowtime: 2020/06/11 10:38:42.705 Z, key: 122.152.45.245, value: {"ip":"122.152.45.245","userid":11,"remote_user":"-","time":"21","_time":21,"request":"GET /images/logo-small.png HTTP/1.1","status":"407","bytes":"4196","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
output:
+--------------+----------------+---------------+------+-----------+----+-----+-----------------------------------+------+-----+--------+-------------------------------------------------------------------+
|rowtime |key |ip |userid|remote_user|time|_time|request |status|bytes|referrer|agent |
+--------------+----------------+---------------+------+-----------+----+-----+-----------------------------------+------+-----+--------+-------------------------------------------------------------------+
| 2020/06/11 10| 222.90.225.227 |222.90.225.227 |12 |- |1 |1 |GET /images/logo-small.png HTTP/1.1|302 |1289 |- |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML|
| 2020/06/11 10| 111.245.174.248|111.245.174.248|30 |- |11 |11 |GET /site/login.html HTTP/1.1 |302 |14096|- |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML|
| 2020/06/11 10| 122.152.45.245 |122.152.45.245 |11 |- |21 |21 |GET /images/logo-small.png HTTP/1.1|407 |4196 |- |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML|
+--------------+----------------+---------------+------+-----------+----+-----+-----------------------------------+------+-----+--------+-------------------------------------------------------------------+
I have tried the above code, but in the output the rowtime and agent columns are truncated because of the colons and commas inside their data. The rowtime value contains colons, and since I split on ":" to separate keys from values, the rest of that field was lost. Likewise, the agent value contains commas, and since I first split the line on ",", the rest of that field was lost as well.
Is there any way that I can use the regex function while mapping with the case class schema? Or any other way?
Instead of using case classes and RDDs, you can read the file as a Dataset/DataFrame and then apply transformations to get the desired result.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._

// One pattern for the fixed line structure; improvise the groups as needed.
//   group 2 -> rowtime value, e.g. "2020/06/11 10:38:42.449 Z"
//   group 4 -> key value, e.g. "222.90.225.227"
//   group 6 -> the JSON payload that follows "value:"
// (odd-numbered groups merely consume the "rowtime:", ", key:", ", value:" labels)
val pattern = "([a-z-A-Z]+:\\s+)([\\s+\\d:\\./]+Z)([,\\s+a-z-A-Z]+:\\s+)([\\d\\.]+)([,\\s+]+[a-z-A-Z]+:)(.*)"

// Read the raw file as a Dataset[String] (single column named "value") and
// project the three interesting capture groups into named columns.
val extracted = spark.read.textFile("sample.txt")
  .select(
    regexp_extract('value, pattern, 2).as("rowtime"),
    regexp_extract('value, pattern, 4).as("key"),
    regexp_extract('value, pattern, 6).as("value"))

// The "value" column is JSON: infer its schema from the first row, then parse
// it into a proper struct column with from_json.
val sampleJson = extracted.select("value").first().getString(0)
val parsed = extracted.withColumn("value", from_json('value, schema_of_json(sampleJson)))

// Flatten the struct so every nested JSON field becomes a top-level column.
parsed.select("rowtime", "key", "value.*").show(false)
+-------------------------+---------------+-----+-------------------------------------------------------------------------------------------------------------------+-----+---------------+--------+-----------+-----------------------------------+------+----+------+
|rowtime |key |_time|agent |bytes|ip |referrer|remote_user|request |status|time|userid|
+-------------------------+---------------+-----+-------------------------------------------------------------------------------------------------------------------+-----+---------------+--------+-----------+-----------------------------------+------+----+------+
|2020/06/11 10:38:42.449 Z|222.90.225.227 |1 |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36|1289 |222.90.225.227 |- |- |GET /images/logo-small.png HTTP/1.1|302 |1 |12 |
|2020/06/11 10:38:42.528 Z|111.245.174.248|11 |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36|14096|111.245.174.248|- |- |GET /site/login.html HTTP/1.1 |302 |11 |30 |
|2020/06/11 10:38:42.705 Z|122.152.45.245 |21 |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36|4196 |122.152.45.245 |- |- |GET /images/logo-small.png HTTP/1.1|407 |21 |11 |
+-------------------------+---------------+-----+-------------------------------------------------------------------------------------------------------------------+-----+---------------+--------+-----------+-----------------------------------+------+----+------+
Your data almost looks like JSON, but it is missing the double-quotes around a few fields and the final curly-brace wrapper.
Try the below - more of spark-sql approach
Mimic the dataframe using hardcoded string. You can get this from file.
// Mimic the input file with three hardcoded raw lines (the first two are
// duplicates). In practice these rows would come from reading the file.
val df = Seq("""rowtime: 2020/06/11 10:38:42.528 Z, key: 111.245.174.248, value: {"ip":"111.245.174.248","userid":30,"remote_user":"-","time":"11","_time":11,"request":"GET /site/login.html HTTP/1.1","status":"302","bytes":"14096","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"} """,
"""rowtime: 2020/06/11 10:38:42.528 Z, key: 111.245.174.248, value: {"ip":"111.245.174.248","userid":30,"remote_user":"-","time":"11","_time":11,"request":"GET /site/login.html HTTP/1.1","status":"302","bytes":"14096","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}""",
"""rowtime: 2020/06/11 10:38:42.705 Z, key: 122.152.45.245, value: {"ip":"122.152.45.245","userid":11,"remote_user":"-","time":"21","_time":21,"request":"GET /images/logo-small.png HTTP/1.1","status":"407","bytes":"4196","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}"""
).toDF("x")
Create a temp view
// Show the raw single-column input, then register it so it can be queried
// from Spark SQL as table "df".
df.show(false)
df.createOrReplaceTempView("df")
Input data:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|x |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|rowtime: 2020/06/11 10:38:42.528 Z, key: 111.245.174.248, value: {"ip":"111.245.174.248","userid":30,"remote_user":"-","time":"11","_time":11,"request":"GET /site/login.html HTTP/1.1","status":"302","bytes":"14096","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"} |
|rowtime: 2020/06/11 10:38:42.528 Z, key: 111.245.174.248, value: {"ip":"111.245.174.248","userid":30,"remote_user":"-","time":"11","_time":11,"request":"GET /site/login.html HTTP/1.1","status":"302","bytes":"14096","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"} |
|rowtime: 2020/06/11 10:38:42.705 Z, key: 122.152.45.245, value: {"ip":"122.152.45.245","userid":11,"remote_user":"-","time":"21","_time":21,"request":"GET /images/logo-small.png HTTP/1.1","status":"407","bytes":"4196","referrer":"-","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Now transform the single column "x" with regex functions to add the missing double-quotes and wrap each row in curly braces, making it a valid JSON literal.
// Chain of CTEs that turns each raw line into a valid JSON literal:
//   t2: quote the bare top-level labels  -> "rowtime":, "key":, "value":
//   t3: quote the rowtime value          -> "rowtime": "2020/06/11 ... Z",
//   t4: quote the key value              -> "key": "111.245.174.248",
// Finally wrap the whole row in { ... } so it parses as one JSON object (column y).
val df1 = spark.sql("""
with t1 ( select x from df ),
t2 ( select regexp_replace(x,"(rowtime|key|value):","\"$1\":") x from t1 ),
t3 ( select regexp_replace(x,"(\"rowtime\":)\\s+([^,]+),","$1 \"$2\",") x from t2 ),
t4 ( select regexp_replace(x,"(\"key\":)\\s+([^,]+),","$1 \"$2\",") x from t3 )
select '{'||x||'}' y from t4
""")
df1.printSchema()
Now each row of column "y" is a valid JSON literal. Convert this single-column dataframe of JSON strings into a structured dataframe with the step below.
import spark.implicits._
// Every row of df1 is now a JSON string, so Spark can infer the full schema
// (including the nested "value" struct) directly from the data.
val df2 = spark.read.json(df1.as[String])
df2.printSchema
root
|-- key: string (nullable = true)
|-- rowtime: string (nullable = true)
|-- value: struct (nullable = true)
| |-- _time: long (nullable = true)
| |-- agent: string (nullable = true)
| |-- bytes: string (nullable = true)
| |-- ip: string (nullable = true)
| |-- referrer: string (nullable = true)
| |-- remote_user: string (nullable = true)
| |-- request: string (nullable = true)
| |-- status: string (nullable = true)
| |-- time: string (nullable = true)
| |-- userid: long (nullable = true)
Create a view on top of this
df2.createOrReplaceTempView("df2")
Now, use the spark-sql and get the needed elements for the output.
// Flatten the nested "value" struct into the original twelve-column layout,
// keeping the same column names and order as the asker's expected output.
spark.table("df2")
  .select('rowtime, 'key,
    $"value.ip".as("ip"), $"value.userid".as("userid"),
    $"value.remote_user".as("remote_user"), $"value.time".as("time"),
    $"value._time".as("_time"), $"value.request".as("request"),
    $"value.status".as("status"), $"value.bytes".as("bytes"),
    $"value.referrer".as("referrer"), $"value.agent".as("agent"))
  .show(false)
Output:
+-------------------------+---------------+---------------+------+-----------+----+-----+-----------------------------------+------+-----+--------+-------------------------------------------------------------------------------------------------------------------+
|rowtime |key |ip |userid|remote_user|time|_time|request |status|bytes|referrer|agent |
+-------------------------+---------------+---------------+------+-----------+----+-----+-----------------------------------+------+-----+--------+-------------------------------------------------------------------------------------------------------------------+
|2020/06/11 10:38:42.528 Z|111.245.174.248|111.245.174.248|30 |- |11 |11 |GET /site/login.html HTTP/1.1 |302 |14096|- |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36|
|2020/06/11 10:38:42.528 Z|111.245.174.248|111.245.174.248|30 |- |11 |11 |GET /site/login.html HTTP/1.1 |302 |14096|- |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36|
|2020/06/11 10:38:42.705 Z|122.152.45.245 |122.152.45.245 |11 |- |21 |21 |GET /images/logo-small.png HTTP/1.1|407 |4196 |- |Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36|
+-------------------------+---------------+---------------+------+-----------+----+-----+-----------------------------------+------+-----+--------+-------------------------------------------------------------------------------------------------------------------+
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.