A Beginner's First Attempt at Algorithms --- Dimensionality Reduction with KPCA (Kernel Principal Component Analysis) [A Worked Comparison of PCA, LDA, and KPCA]
--------------------------------------------------------------------------
The author cares about implementing the algorithm and prefers not to spend pages on theory. For the theory behind KPCA, this post is recommended:
https://blog.csdn.net/zjuPeco/article/details/77510981
For PCA dimensionality reduction, see the author's earlier post:
https://blog.csdn.net/Java_Man_China/article/details/89331554
For LDA dimensionality reduction, see the author's earlier post:
https://blog.csdn.net/Java_Man_China/article/details/89504514
--------------------------------------------------------------------------
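Before the code, a quick note on the two formulas it implements (standard KPCA results; see the theory post linked above for derivations): the RBF kernel used to build the kernel matrix, and the centering step applied to that matrix. In LaTeX notation:

k(x_i, x_j) = \exp\left(-\gamma \,\lVert x_i - x_j \rVert^2\right)

\tilde{K} = K - \mathbf{1}_N K - K \mathbf{1}_N + \mathbf{1}_N K \mathbf{1}_N, \qquad (\mathbf{1}_N)_{ij} = \tfrac{1}{N}

where N is the number of samples and \gamma corresponds to the variable gama in the code below.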
import breeze.linalg.{DenseMatrix, DenseVector, argsort, eig}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{LabeledPoint, StandardScaler, VectorAssembler}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ArrayBuffer
/** PCA and LDA can reduce the dimensionality of linearly separable datasets, but for non-linear
 * datasets we need to borrow a kernel function. This code shows how to use KPCA on non-linear data.
 * Data Source : http://archive.ics.uci.edu/ml/datasets/Wine
 * @author XiaoTangBao
 * @date 2019/4/29 14:04
 * @version 1.0
 */
object KPCA {
def main(args: Array[String]): Unit = {
//suppress Spark logging
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
val spark = SparkSession.builder().master("local[4]").appName("KPCA").getOrCreate()
val data = spark.sparkContext.textFile("G:\\mldata\\kpca_test.txt").map(str => str.split(','))
.map(arr => arr.map(str => str.toDouble)).map(arr => Row(arr(0),arr(1),arr(2)))
//define the schema and the feature columns
val schema = StructType(List(StructField("label",DoubleType,true),StructField("x1",DoubleType,true),StructField("x2",DoubleType,true)))
val featuresArray = Array("x1","x2")
val df = spark.createDataFrame(data,schema)
//define the transformer that assembles the feature columns into one vector
val va = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
val ndf = va.transform(df).select("label","features")
//parameter of the RBF kernel
val gama = 15.0
//target dimensionality after reduction
val dim = 2
val n1 = run(ndf, dim, gama)
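//collect the projected coordinates (row 0 = first kernel principal component, row 1 = second) and print them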
val arr = ArrayBuffer[(Double,Double)]()
for(i<-0 until n1.cols) arr.append((n1(0,i),n1(1,i)))
arr.foreach(tp =>println(tp._1))
println("**************")
arr.foreach(tp =>println(tp._2))
}
/**
 * This method attempts to lower the dimensionality via the RBF kernel.
 * @param df the original high-dimensional data; each column of the internal matrix holds one record
 * @param k the target number of dimensions
 * @param gama the only parameter of the RBF kernel
 */
def run(df: DataFrame, k: Int, gama: Double): DenseMatrix[Double] = {
//standardize the data; after standardization no separate mean-centering of the features is needed
val stdf = new StandardScaler().setInputCol("features").setOutputCol("Scaledfeatures")
.setWithMean(true).setWithStd(true).fit(df).transform(df)
.select("label","Scaledfeatures")
.withColumnRenamed("Scaledfeatures","features")
val trainData = stdf.select("features").rdd.map(row => row.getAs[Vector](0).toArray).collect()
val labels = stdf.select("label").rdd.map(row => row.getDouble(0)).collect()
//number of feature columns
val tzz = trainData(0).length
//build new labeled data points
val labArr = ArrayBuffer[LabeledPoint]()
for (i <- 0 until trainData.length) labArr.append(LabeledPoint(labels(i), Vectors.dense(trainData(i))))
//big matrix holding all samples
val allData = labArr.map(lab => lab.features).map(vec => vec.toArray).flatMap(x => x).toArray
val big_Matrx = new DenseMatrix[Double](tzz, trainData.length, allData)
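//Breeze matrices are column-major, so each column of big_Matrx is one sample's feature vector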
//compute the kernel (Gram) matrix of the samples
var kMatrix = DenseMatrix.zeros[Double](big_Matrx.cols,big_Matrx.cols)
for(i<-0 until kMatrix.rows){
val vi = big_Matrx(::,i)
for(j<-0 until kMatrix.cols){
kMatrix(i,j) = rbf(vi,big_Matrx(::,j),gama)
}
}
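//note: the kernel matrix is symmetric (kMatrix(i,j) == kMatrix(j,i)), so roughly half of these evaluations could be skipped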
//center the kernel matrix in feature space
val LMatrix = DenseMatrix.fill[Double](kMatrix.rows, kMatrix.cols)(1.0 / kMatrix.rows)
kMatrix = kMatrix - LMatrix * kMatrix - kMatrix * LMatrix + LMatrix * kMatrix * LMatrix
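//after this step every row sum and column sum of kMatrix is numerically close to zero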
//eigen-decomposition of the centered kernel matrix (computed once and reused)
val eigResult = eig(kMatrix)
val eigValues = eigResult.eigenvalues
//the eigenvectors returned here are already normalized to unit length
val eigVectors = eigResult.eigenvectors
//pick the eigenvectors belonging to the k largest eigenvalues
val topK = argsort(eigValues).reverse.take(k)
val rt = DenseMatrix.zeros[Double](k, kMatrix.rows)
for ((idx, i) <- topK.zipWithIndex) rt(i, ::) := eigVectors(::, idx).t
rt
}
//RBF kernel: k(v1, v2) = exp(-gama * ||v1 - v2||^2)
def rbf(v1: DenseVector[Double], v2: DenseVector[Double], gama: Double): Double = {
val sqDist = (v1 - v2) dot (v1 - v2)
math.exp(-1.0 * gama * sqDist)
}
}
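To try the code without the original kpca_test.txt, here is a minimal, hypothetical generator (not part of the original post) that writes a two-circle dataset in the expected label,x1,x2 format; the file name, sample counts, radii, and noise level are all assumptions for illustration:

import java.io.PrintWriter
import scala.util.Random
object MakeCircles {
def main(args: Array[String]): Unit = {
val rnd = new Random(42)
val out = new PrintWriter("kpca_test.txt")
for (i <- 0 until 200) {
//class 0 on an inner circle, class 1 on an outer circle, plus a little Gaussian noise
val label = if (i < 100) 0.0 else 1.0
val r = if (label == 0.0) 1.0 else 3.0
val theta = rnd.nextDouble() * 2.0 * math.Pi
val x1 = r * math.cos(theta) + rnd.nextGaussian() * 0.1
val x2 = r * math.sin(theta) + rnd.nextGaussian() * 0.1
out.println(s"$label,$x1,$x2")
}
out.close()
}
}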
KPCA was applied to the wine data; the reduced result is shown in the figure below.
Compared against the output of the corresponding Python library routine, the two results are essentially identical.
To gauge the effect of linear dimensionality reduction, PCA and LDA were also applied to the same data (a minimal Spark ML PCA sketch follows below); the results are shown in the figure below.
The experiments show that PCA and LDA handle non-linearly separable data poorly, while KPCA reduces the dimensionality of such non-linear data effectively.
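For readers who want to reproduce the linear PCA baseline, a minimal sketch using Spark ML's built-in PCA transformer on the same assembled features column (the DataFrame name ndf is assumed to be the one built in main above):

import org.apache.spark.ml.feature.PCA
//fit a 2-component linear PCA model on the assembled feature vectors
val pcaModel = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(2).fit(ndf)
val pcaResult = pcaModel.transform(ndf).select("label", "pcaFeatures")
pcaResult.show(5)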