Correlation
Correlation
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row
// Four observations of a 4-dimensional feature vector; each Vector is one row.
// The third component is 0.0 in every row, and the sample output at the bottom
// shows NaN for every correlation that involves that column.
val data = Seq(
  Vectors.dense(1.0, 0.0, 0.0, -2.0), // same values as Vectors.sparse(4, Seq((0, 1.0), (3, -2.0)))
  Vectors.dense(4.0, 5.0, 0.0, 3.0),
  Vectors.dense(6.0, 7.0, 0.0, 8.0),
  Vectors.dense(9.0, 0.0, 0.0, 1.0) // same values as Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
)

// Wrap each vector in a Tuple1 so the Seq converts to a one-column DataFrame named "features".
// NOTE(review): toDF relies on the session implicits being in scope (as in spark-shell) — confirm
// when running this outside a shell/notebook.
val df = data.map(Tuple1.apply).toDF("features")

// Correlation.corr yields a DataFrame whose first row holds the correlation matrix.
// No method argument is given here; the output below labels this result as Pearson.
// The match is equivalent to the pattern binding `val Row(coeff1: Matrix) = ... .head`.
val coeff1: Matrix = Correlation.corr(df, "features").head match {
  case Row(pearson: Matrix) => pearson
}
// "\n" (not a literal "n") so the matrix starts on its own line below the label.
println(s"Pearson correlation matrix:\n $coeff1\n")

// Same computation, explicitly requesting the Spearman method.
val coeff2: Matrix = Correlation.corr(df, "features", "spearman").head match {
  case Row(spearman: Matrix) => spearman
}
println(s"Spearman correlation matrix:\n $coeff2\n")
/*
Sample output:

Pearson correlation matrix:
 1.0 0.055641488407465814 NaN 0.4004714203168137
0.055641488407465814 1.0 NaN 0.9135958615342522
NaN NaN 1.0 NaN
0.4004714203168137 0.9135958615342522 NaN 1.0

Spearman correlation matrix:
 1.0 0.10540925533894532 NaN 0.40000000000000174
0.10540925533894532 1.0 NaN 0.9486832980505141
NaN NaN 1.0 NaN
0.40000000000000174 0.9486832980505141 NaN 1.0
*/
Hypothesis testing
Last updated