A "prototype" is a representative point in the sample space, and prototype clustering is also known as "prototype-based clustering". Algorithms of this kind assume that the clustering structure can be characterized by a set of prototypes, and they are extremely common in real-world clustering tasks. Typically, an algorithm first initializes the prototypes and then iteratively updates them until a solution is reached. Different choices of prototype representation and update procedure yield different algorithms.
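The three algorithms below all follow this initialize-then-iterate pattern. As a minimal sketch of the shared skeleton (init_prototypes and update_prototypes are hypothetical placeholder callables, not from any library; each algorithm supplies its own versions):

# Hypothetical sketch of the generic prototype-clustering loop.
# init_prototypes and update_prototypes are placeholders; k-means, LVQ,
# and Gaussian mixture clustering each instantiate them differently.
def prototype_clustering(data, k, n_iters, init_prototypes, update_prototypes):
    prototypes = init_prototypes(data, k)  # e.g. k randomly chosen samples
    for _ in range(n_iters):
        # algorithm-specific rule for moving the prototypes
        prototypes = update_prototypes(data, prototypes)
    return prototypes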
1. k-Means Clustering
k-means clustering code example:
import numpy as np
import matplotlib.pyplot as plt

def k_means(data, k):
    # Randomly pick k data points as the initial cluster centroids
    centroids = data[np.random.choice(len(data), k, replace=False)]
    # Arrays for cluster assignments and distances to the assigned centroid
    cluster_assign = np.zeros(len(data))
    cluster_dist = np.zeros(len(data))
    # Iteratively update assignments and centroids
    for _ in range(100):
        # Assign each data point to its nearest centroid
        for i, x in enumerate(data):
            dists = np.linalg.norm(x - centroids, axis=1)
            cluster_assign[i] = np.argmin(dists)
            cluster_dist[i] = np.min(dists)
        # Recompute each centroid as the mean of its assigned points
        for j in range(k):
            if np.any(cluster_assign == j):  # skip empty clusters to avoid a NaN mean
                centroids[j] = np.mean(data[cluster_assign == j], axis=0)
    return cluster_assign, cluster_dist, centroids

# Generate random data
data = np.random.rand(100, 2)

# Cluster and plot the result
cluster_assign, _, centroids = k_means(data, 3)
plt.scatter(data[:, 0], data[:, 1], c=cluster_assign)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, linewidths=3, color='r')
plt.show()
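Since k_means also returns each point's distance to its assigned centroid, one rough way to choose k is the "elbow" heuristic. A sketch, reusing the data and k_means function above (the range of candidate k values is an arbitrary choice):

# Sketch of the elbow heuristic: plot total within-cluster distance
# against k and look for the point where the curve flattens out.
ks = range(1, 8)
total_dists = []
for k in ks:
    _, cluster_dist, _ = k_means(data, k)
    total_dists.append(cluster_dist.sum())
plt.plot(list(ks), total_dists, marker='o')
plt.xlabel('k')
plt.ylabel('total within-cluster distance')
plt.show()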
2. Learning Vector Quantization
Learning vector quantization code example:
import numpy as np

class lvq:
    def __init__(self, k, alpha):
        self.k = k           # number of prototypes
        self.alpha = alpha   # learning rate
        self.centers = None  # prototype vectors

    def fit(self, x, y, epochs):
        # Randomly pick k samples as the initial prototypes
        self.centers = x[np.random.choice(x.shape[0], self.k, replace=False)].astype(float)
        for epoch in range(epochs):
            for i, xi in enumerate(x):
                # Squared distances from the sample to all prototypes; find the nearest one
                distances = np.sum((xi - self.centers) ** 2, axis=1)
                j = np.argmin(distances)
                # Move the nearest prototype toward the sample if the labels match,
                # away from it otherwise (prototype j is assumed to carry label j)
                if y[i] == j:
                    self.centers[j] += self.alpha * (xi - self.centers[j])
                else:
                    self.centers[j] -= self.alpha * (xi - self.centers[j])

    def predict(self, x):
        # Assign each sample to its nearest prototype
        distances = np.sum((x[:, np.newaxis] - self.centers) ** 2, axis=2)
        return np.argmin(distances, axis=1)
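A quick usage sketch with made-up two-class data (LVQ is supervised, so labeled samples are needed; with k=2, prototype j is taken to represent class j, matching the update rule in fit):

# Made-up data: class 0 near the origin, class 1 shifted by 3 in both dimensions
x = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 3])
y = np.array([0] * 50 + [1] * 50)
model = lvq(k=2, alpha=0.1)
model.fit(x, y, epochs=20)
print(model.predict(x[:5]))  # prototype indices for the first five samples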
3. Gaussian Mixture Clustering
Gaussian mixture clustering code example:
import numpy as np
from scipy.stats import multivariate_normal

class gmm:
    def __init__(self, n_clusters, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def fit(self, x):
        n_samples, n_features = x.shape
        # Initialize the Gaussian components: random means, identity covariances,
        # and uniform mixing weights
        self.means = np.random.randn(self.n_clusters, n_features)
        self.covs = np.array([np.eye(n_features)] * self.n_clusters)
        self.weights = np.ones(self.n_clusters) / self.n_clusters
        for _ in range(self.max_iter):
            # E step: posterior probability of each point under each component
            probs = np.zeros((n_samples, self.n_clusters))
            for i in range(self.n_clusters):
                probs[:, i] = self.weights[i] * multivariate_normal.pdf(x, self.means[i], self.covs[i])
            probs /= probs.sum(axis=1, keepdims=True)
            # M step: re-estimate each component's weight, mean, and covariance
            for i in range(self.n_clusters):
                weight_i = probs[:, i].sum()
                self.weights[i] = weight_i / n_samples
                self.means[i] = np.sum(probs[:, i].reshape(-1, 1) * x, axis=0) / weight_i
                diff = x - self.means[i]
                self.covs[i] = np.dot(probs[:, i] * diff.T, diff) / weight_i

    def predict(self, x):
        # Assign each sample to the component with the highest weighted density
        probs = np.zeros((x.shape[0], self.n_clusters))
        for i in range(self.n_clusters):
            probs[:, i] = self.weights[i] * multivariate_normal.pdf(x, self.means[i], self.covs[i])
        return np.argmax(probs, axis=1)
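A usage sketch, again with made-up data consisting of two Gaussian blobs (since the means are initialized randomly, results can vary between runs):

# Made-up data: two Gaussian blobs, one centered at the origin, one at (4, 4)
x = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 4])
model = gmm(n_clusters=2)
model.fit(x)
labels = model.predict(x)
print(labels[:5], labels[-5:])  # component indices for a few samples from each blob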