博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
40、JSON数据源综合案例实战
阅读量:5220 次
发布时间:2019-06-14

本文共 7605 字,大约阅读时间需要 25 分钟。

一、JSON数据源综合案例实战

1、概述

Spark SQL可以自动推断JSON文件的元数据,并且加载其数据,创建一个DataFrame。可以使用SQLContext.read.json()方法,针对一个元素类型为String的RDD,或者是一个JSON文件。但是要注意的是,这里使用的JSON文件与传统意义上的JSON文件是不一样的:每行都必须、也只能包含一个单独的、自包含的、有效的JSON对象,不能让一个JSON对象分散在多行,否则会报错。

综合性复杂案例:查询成绩为80分及以上的学生的基本信息与成绩信息

students.json

{"name":"Leo", "score":85}
{"name":"Marry", "score":99}
{"name":"Jack", "score":74}

2、java案例实现

package cn.spark.study.sql;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * JSON data source example.
 *
 * Loads student scores from a line-delimited JSON file, selects the students
 * whose score is at least 80, looks up their age in a second DataFrame built
 * from an in-memory RDD of JSON strings, joins both data sets by name and
 * writes the combined (name, score, age) rows back out as JSON.
 *
 * @author Administrator
 */
public class JSONDataSource {

    /**
     * Builds the query that selects name and age of the given students from
     * the "student_infos" temp table.
     *
     * An empty name list would otherwise produce the invalid SQL
     * "name in ()", so in that case a predicate that matches no row is used.
     *
     * NOTE(review): names are embedded verbatim between single quotes; a name
     * containing a quote would break the statement. Confirm the input is
     * trusted or add escaping suited to the SQL dialect in use.
     *
     * @param names names of the students to look up
     * @return the SQL text to run against the "student_infos" temp table
     */
    private static String buildStudentInfoQuery(List<String> names) {
        StringBuilder sql = new StringBuilder("select name,age from student_infos where ");
        if (names.isEmpty()) {
            return sql.append("1 = 0").toString();
        }
        sql.append("name in (");
        for (int i = 0; i < names.size(); i++) {
            if (i > 0) {
                sql.append(",");
            }
            sql.append("'").append(names.get(i)).append("'");
        }
        return sql.append(")").toString();
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("JSONDataSource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Create a DataFrame directly from the JSON file.
        DataFrame studentScoresDF = sqlContext.read().json(
                "hdfs://spark1:9000/spark-study/students.json");

        // Register the scores as a temp table and select the students whose
        // score is at least 80.
        studentScoresDF.registerTempTable("student_scores");
        DataFrame goodStudentScoresDF = sqlContext.sql(
                "select name,score from student_scores where score>=80");

        // Convert the DataFrame to an RDD and collect the student names.
        List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(
                new Function<Row, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(Row row) throws Exception {
                        return row.getString(0);
                    }

                }).collect();

        // Create a DataFrame from a JavaRDD<String> whose elements are JSON strings.
        List<String> studentInfoJSONs = new ArrayList<String>();
        studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");
        studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");
        studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
        JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
        DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);

        // Register the basic info as a temp table and query the info of the
        // students selected above.
        studentInfosDF.registerTempTable("student_infos");
        DataFrame goodStudentInfosDF = sqlContext.sql(buildStudentInfoQuery(goodStudentNames));

        // Turn both DataFrames into pair RDDs keyed by name and join them.
        // The numeric columns are read with getLong and narrowed to Integer,
        // exactly as in the original listing.
        JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
                goodStudentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(row.getString(0),
                                Integer.valueOf(String.valueOf(row.getLong(1))));
                    }

                }).join(goodStudentInfosDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(row.getString(0),
                                Integer.valueOf(String.valueOf(row.getLong(1))));
                    }

                }));

        // Convert the joined tuples (name, (score, age)) into a JavaRDD<Row>.
        JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Row call(
                            Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }

                });

        // Describe the schema and turn the JavaRDD<Row> into a DataFrame.
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);

        DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);

        // Save the combined data of the selected students as JSON.
        goodStudentsDF.write().format("json").save("hdfs://spark1:9000/spark-study/good-students");
    }

}

// Input file students.json (one self-contained JSON object per line):
// {"name":"Leo", "score":85}
// {"name":"Marry", "score":99}
// {"name":"Jack", "score":74}

3、scala案例实现

package cn.spark.study.sql

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.LongType

/**
 * JSON data source example.
 *
 * Loads student scores from a line-delimited JSON file, selects the students
 * whose score is at least 80, looks up their age in a second DataFrame built
 * from an in-memory RDD of JSON strings, joins both data sets by name and
 * writes the combined (name, score, age) rows back out as JSON.
 *
 * @author Administrator
 */
object JSONDataSource {

  /**
   * Builds the query that selects name and age of the given students from
   * the "student_infos" temp table.
   *
   * An empty name list would otherwise produce the invalid SQL `name in ()`,
   * so in that case a predicate that matches no row is used.
   *
   * NOTE(review): names are embedded verbatim between single quotes; a name
   * containing a quote would break the statement. Confirm the input is
   * trusted or add escaping suited to the SQL dialect in use.
   *
   * @param names names of the students to look up
   * @return the SQL text to run against the "student_infos" temp table
   */
  private def buildStudentInfoQuery(names: Seq[String]): String = {
    val select = "select name,age from student_infos where "
    if (names.isEmpty) select + "1 = 0"
    else names.map(name => s"'$name'").mkString(select + "name in (", ",", ")")
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
        .setAppName("JSONDataSource")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Create the student score DataFrame from the JSON file.
    val studentScoresDF = sqlContext.read.json("hdfs://spark1:9000/spark-study/students.json")

    // Select the score rows with a score of at least 80 and collect the names.
    studentScoresDF.registerTempTable("student_scores")
    val goodStudentScoresDF = sqlContext.sql("select name,score from student_scores where score>=80")
    // Read the name as a String so the collected array is not typed as Array[Any].
    val goodStudentNames = goodStudentScoresDF.rdd.map(_.getAs[String]("name")).collect()

    // Create the student basic-info DataFrame from an RDD of JSON strings.
    val studentInfoJSONs = Array("{\"name\":\"Leo\", \"age\":18}",
        "{\"name\":\"Marry\", \"age\":17}",
        "{\"name\":\"Jack\", \"age\":19}")
    val studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs, 3)
    val studentInfosDF = sqlContext.read.json(studentInfoJSONsRDD)

    // Query the basic info of the students selected above.
    studentInfosDF.registerTempTable("student_infos")
    val goodStudentInfosDF = sqlContext.sql(buildStudentInfoQuery(goodStudentNames))

    // Join the score rows with the basic-info rows by student name.
    // Both numeric columns are read as Long, as in the original listing.
    val scoresByName = goodStudentScoresDF.rdd.map { row =>
      (row.getAs[String]("name"), row.getAs[Long]("score"))
    }
    val agesByName = goodStudentInfosDF.rdd.map { row =>
      (row.getAs[String]("name"), row.getAs[Long]("age"))
    }
    val goodStudentsRDD = scoresByName.join(agesByName)

    // Convert the joined RDD into a DataFrame, narrowing the Long values to
    // Int to match the IntegerType columns of the schema below.
    val goodStudentRowsRDD = goodStudentsRDD.map {
      case (name, (score, age)) => Row(name, score.toInt, age.toInt)
    }

    val structType = StructType(Array(
        StructField("name", StringType, true),
        StructField("score", IntegerType, true),
        StructField("age", IntegerType, true)))

    val goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType)

    // Save the DataFrame contents as JSON.
    goodStudentsDF.write.format("json").save("hdfs://spark1:9000/spark-study/good-students-scala")
  }

}

转载于:https://www.cnblogs.com/weiyiming007/p/11286837.html

你可能感兴趣的文章
单片机编程
查看>>
python应用:爬虫实例(静态网页)
查看>>
迅为iTOP-4418开发板兼容八核6818开发板介绍
查看>>
com.fasterxml.jackson.databind.JsonMappingException
查看>>
【UVa 540】Team Queue
查看>>
排序算法(二)
查看>>
Python内置函数(36)——iter
查看>>
HTML标签_1
查看>>
jsp组成元素
查看>>
排序算法(转)
查看>>
windows自带的可生成各种数据库连接字符串工具打开方法
查看>>
Python命名规范
查看>>
滚动条
查看>>
程序员的自我修养九Windows下的动态链接
查看>>
Codeforces Round #361 (Div. 2)
查看>>
细说WebSocket - Node篇
查看>>
[洛谷1485] 火枪打怪
查看>>
Extjs控件之 grid打印功能
查看>>
枚举类型(不常用)递归
查看>>
ETL
查看>>