__author__ = "Luca Puggini"
import numpy as np
import sklearn as sk
import pandas as pd
from sklearn.decomposition import SparsePCA, PCA
import random
class IterativeSPCA():
def __init__(self, Npc=None, alpha=1, max_iter=5000, tol=1e-8):
self.Npc = Npc #number of components
self.alpha = alpha # penalty value
self.max_iter = max_iter # max number of iteration
self.tol = tol # accepted tollerance error
def spca_iterate(self, u_old, v_old):
# this is the equivalent of the nipals algorithm but designed in order to
# obtain a sparse loading
v_new = np.zeros(shape=v_old.shape)
u_new = np.zeros(shape=u_old.shape)
for iteration in range(self.max_iter): #repeat the iteration non more than max iter
y = np.dot(self.X.T, u_old).squeeze()
v_new = (np.sign(y)*np.maximum(np.abs(y)-self.alpha, 0)).squeeze()
norm_v = np.linalg.norm(v_new)**2
if norm_v == 0: #if norm v is 0 it means that no components have been selected. So you should reduce the penalty lambda
break
x_times_v = np.dot(self.X, v_new)
x_times_v.shape = (self.m, 1)
u_new = x_times_v/np.linalg.norm(x_times_v)
if np.linalg.norm(v_new)==0:#check again if the norm is 0
break
if np.linalg.norm(v_new - v_old)<self.tol or np.linalg.norm(-v_new - v_old) < self.tol:#check if there is convergence
break
u_old = u_new
v_old = v_new
if iteration == self.max_iter-1:
print("No Convergence. Error!!!")
norm_v = np.linalg.norm(v_new)
v_new.shape = (self.n, 1)
v_final = sk.preprocessing.normalize(v_new, axis=0)
u_final = u_new * norm_v
return v_final, u_final
def fit(self, X):
self.m, self.n = X.shape
self.components_ = np.zeros(shape=(self.n,self.Npc))
self.sT = np.zeros(shape=(self.m,self.Npc))
self.X = X
for i in range(self.Npc):
pca = PCA(n_components=1).fit(self.X) #this can be imrpoved using a nipals version of PCA
v = pca.components_
v = sk.preprocessing.normalize(v)
u = self.X.dot(v.T)
s = np.linalg.norm(u)
u_old = u
v_old = v*s
v_final, u_final = self.spca_iterate(u_old=u_old, v_old=v_old)
self.sT[:, i] = u_final.squeeze()
self.components_[:, i] = v_final.squeeze()
u_final.shape = (self.m, 1)
self.X = self.X-np.dot(u_final, v_final.T)
return self.components_, self.sT
print("start")
# Set the dataset
from sklearn.datasets import load_boston, load_iris
boston = load_boston()
iris = load_iris()
#X = boston['data']
X = np.random.random((100, 20))
#X = iris['data']
X = sk.preprocessing.scale(X) # scale the data
alpha = 1 # set the penalty
nPCs = 2 # set the number of components
import time
start_time = time.time()
a = IterativeSPCA(Npc=nPCs, alpha=alpha)
sP, sT = a.fit(X[:])
time1 = time.time()
spca = SparsePCA(n_components=nPCs, alpha=alpha, ridge_alpha=0)
spca.fit(X[:])
time2 = time.time()
import pandas as pd
df = pd.DataFrame(columns=['it_spc1','spc1'])
df['it_spc1'] = a.components_[:,0]
df['spc1'] = spca.components_.T[:,0]/np.linalg.norm(spca.components_.T[:,0])
df['it_spc2'] = a.components_[:,1]
df['spc2'] = spca.components_.T[:,1]/np.linalg.norm(spca.components_.T[:,1])
print(df)
print("time iterative SPCA=", time1-start_time, "time SparsePCA=", time2-time1)
print("finish!")