[spark] rdd의 stats 함수

scala 2017. 3. 24. 19:24


spark rdd에는 간단한 통계 기능(count, mean, stdev, max, min)이 있고, 이를 한 번에 계산해 StatCounter 객체로 묶어 주는 stats 함수가 있다. 단, stats()는 숫자 타입의 RDD에서만 호출할 수 있으므로, 아래 예제처럼 문자열 RDD를 먼저 숫자로 변환(map)한 뒤에 사용한다.



scala> val a = sc.parallelize(List("111", "222"))

a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:24



scala> val ints = a.map(string => string.toInt)

ints: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1] at map at <console>:26


scala> val stats = ints.stats()

stats: org.apache.spark.util.StatCounter = (count: 2, mean: 166.500000, stdev: 55.500000, max: 222.000000, min: 111.000000)


scala> stats.count

res0: Long = 2


scala> stats.mean

res1: Double = 166.5


scala> stats.stdev

res2: Double = 55.5


scala> stats.max

res3: Double = 222.0


scala> stats.min

res4: Double = 111.0

Posted by '김용환'
,