package cn.allengao.exercise

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Small, self-contained walkthrough of `RDD.aggregate` and
 * `PairRDDFunctions.aggregateByKey` on tiny in-memory datasets.
 *
 * Most results are computed but their `println` calls are left commented out,
 * with the output the original author recorded noted next to each one.
 * Only the two `aggregateByKey` results are printed.
 */
object SparkRDDTest2 {

  /**
   * Runs the demo against a local Spark master.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDDTest2").setMaster("local")
    val sc = new SparkContext(conf)

    // Always release the SparkContext, even when one of the jobs below throws.
    try {
      // Explicitly split the data into 2 partitions.
      val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7), 2)

      // Labels every element with the index of the partition that holds it.
      val func1 = (index: Int, iter: Iterator[Int]) =>
        iter.map(x => s"[partID:$index, val: $x]")

      // Shows which partition each element ended up in.
      val res1 = rdd1.mapPartitionsWithIndex(func1)

      // aggregate(zero)(seqOp, combOp): seqOp folds each partition locally,
      // combOp then merges the per-partition results. Here: local sum, global sum.
      val res2 = rdd1.aggregate(0)(_ + _, _ + _)

      // Take the maximum of each partition, then sum those maxima.
      val res3 = rdd1.aggregate(0)(math.max(_, _), _ + _)

      // The zero value 10 seeds each of the 2 partitions and the final merge,
      // so it is counted 3 times in total.
      val res4 = rdd1.aggregate(10)(_ + _, _ + _)

      // Recorded output:
      //   ArrayBuffer([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3],
      //               [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6],
      //               [partID:1, val: 7])
      // println(res1.collect().toBuffer)

      // Recorded output: 28
      // println(res2)

      // Recorded output: 10
      // println(res3)

      // Recorded output: 58
      // println(res4)

      val rdd2 = sc.parallelize(List("a", "b", "c", "d", "e", "f"), 2)
      val res5 = rdd2.aggregate("|")(_ + _, _ + _)

      // Recorded output: ||abc|def
      // println(res5)

      val rdd3 = sc.parallelize(List("12", "23", "345", "4567"), 2)

      // Each partition tracks the longest string length seen so far (kept as a
      // String); the per-partition results are then concatenated.
      val res6 = rdd3.aggregate("")(
        (x, y) => math.max(x.length, y.length).toString,
        (x, y) => x + y
      )

      // Recorded output: 24 or 42 -- the order in which partition results are
      // merged is not fixed, which illustrates the parallel evaluation.
      // println(res6)

      val rdd4 = sc.parallelize(List("12", "23", "345", ""), 2)
      val res7 = rdd4.aggregate("")(
        (x, y) => math.min(x.length, y.length).toString,
        (x, y) => x + y
      )

      // Recorded output: 01 or 10.
      // The accumulator is a String: "" has length 0, but once it becomes "0"
      // its length is 1, which is what the next comparison sees.
      //   math.min("".length,  "12".length)  = 0,  math.min("0".length, "23".length) = 1
      //   math.min("".length,  "345".length) = 0,  math.min("0".length, "".length)   = 0
      // println(res7)

      val rdd5 = sc.parallelize(List("12", "23", "", "345"), 2)
      val res8 = rdd5.aggregate("")(
        (x, y) => math.min(x.length, y.length).toString,
        (x, y) => x + y
      )

      // Recorded output: 11
      //   math.min("".length, "12".length) = 0,  math.min("0".length, "23".length)  = 1
      //   math.min("".length, "".length)   = 0,  math.min("0".length, "345".length) = 1
      // println(res8)

      // aggregateByKey applies the local operation per key inside each
      // partition first, then merges the per-key results across partitions.
      val pairRDD = sc.parallelize(
        List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)),
        2
      )

      // Labels every key/value pair with the index of its partition.
      def func2(index: Int, iter: Iterator[(String, Int)]): Iterator[String] =
        iter.map(x => s"[partID:$index, val: $x]")

      // println(pairRDD.mapPartitionsWithIndex(func2).collect.toBuffer)

      // Per key: take the maximum count within each partition, then sum the maxima.
      // Recorded output: ArrayBuffer((dog,12), (cat,17), (mouse,6))
      println(pairRDD.aggregateByKey(0)(math.max(_, _), _ + _).collect.toBuffer)

      // With a zero value of 10, every per-partition maximum is at least 10.
      // Recorded output: ArrayBuffer((dog,12), (cat,22), (mouse,20))
      println(pairRDD.aggregateByKey(10)(math.max(_, _), _ + _).collect.toBuffer)

      // Original author's note: pairRDD.aggregateByKey(0)(_ + _, _ + _).collect and
      // pairRDD.reduceByKey(_ + _).collect give the same result; under the hood
      // both delegate to the same combineByKey implementation.
    } finally {
      sc.stop()
    }
  }
}