我试图理解scikit-learn高斯混合模型实现的结果.看一下下面的例子:
#!/opt/local/bin/python import numpy as np import matplotlib.pyplot as plt from sklearn.mixture import GaussianMixture # Define simple gaussian def gauss_function(x, amp, x0, sigma): return amp * np.exp(-(x - x0) ** 2. / (2. * sigma ** 2.)) # Generate sample from three gaussian distributions samples = np.random.normal(-0.5, 0.2, 2000) samples = np.append(samples, np.random.normal(-0.1, 0.07, 5000)) samples = np.append(samples, np.random.normal(0.2, 0.13, 10000)) # Fit GMM gmm = GaussianMixture(n_components=3, covariance_type="full", tol=0.001) gmm = gmm.fit(X=np.expand_dims(samples, 1)) # Evaluate GMM gmm_x = np.linspace(-2, 1.5, 5000) gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1))) # Construct function manually as sum of gaussians gmm_y_sum = np.full_like(gmm_x, fill_value=0, dtype=np.float32) for m, c, w in zip(gmm.means_.ravel(), gmm.covariances_.ravel(), gmm.weights_.ravel()): gmm_y_sum += gauss_function(x=gmm_x, amp=w, x0=m, sigma=np.sqrt(c)) # Normalize so that integral is 1 gmm_y_sum /= np.trapz(gmm_y_sum, gmm_x) # Make regular histogram fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[8, 5]) ax.hist(samples, bins=50, normed=True, alpha=0.5, color="#0070FF") ax.plot(gmm_x, gmm_y, color="crimson", lw=4, label="GMM") ax.plot(gmm_x, gmm_y_sum, color="black", lw=4, label="Gauss_sum") # Annotate diagram ax.set_ylabel("Probability density") ax.set_xlabel("Arbitrary units") # Draw legend plt.legend() plt.show()
在这里,我首先生成由高斯构造的样本分布,然后将高斯混合模型拟合到这些数据.接下来,我想计算一些给定输入的概率.方便的是,scikit实现提供了score_samples
这样做的方法.现在我想了解这些结果.我一直认为,我可以从GMM拟合中获取高斯参数并通过对它们求和来构造相同的分布,然后将积分归一化为1.但是,正如您在图中所看到的,样本来自该score_samples
方法与原始数据(蓝色直方图)完美匹配(红线),手动构造的分布(黑线)不适合.我想了解我的想法出错了,以及为什么我不能通过将GMM拟合的高斯相加来自己构建分布!?!非常感谢任何输入!
以防万一将来有人想知道同样的事情:一个人必须规范化各个组成部分,而不是总和:
import numpy as np import matplotlib.pyplot as plt from sklearn.mixture import GaussianMixture # Define simple gaussian def gauss_function(x, amp, x0, sigma): return amp * np.exp(-(x - x0) ** 2. / (2. * sigma ** 2.)) # Generate sample from three gaussian distributions samples = np.random.normal(-0.5, 0.2, 2000) samples = np.append(samples, np.random.normal(-0.1, 0.07, 5000)) samples = np.append(samples, np.random.normal(0.2, 0.13, 10000)) # Fit GMM gmm = GaussianMixture(n_components=3, covariance_type="full", tol=0.001) gmm = gmm.fit(X=np.expand_dims(samples, 1)) # Evaluate GMM gmm_x = np.linspace(-2, 1.5, 5000) gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1))) # Construct function manually as sum of gaussians gmm_y_sum = np.full_like(gmm_x, fill_value=0, dtype=np.float32) for m, c, w in zip(gmm.means_.ravel(), gmm.covariances_.ravel(), gmm.weights_.ravel()): gauss = gauss_function(x=gmm_x, amp=1, x0=m, sigma=np.sqrt(c)) gmm_y_sum += gauss / np.trapz(gauss, gmm_x) * w # Make regular histogram fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[8, 5]) ax.hist(samples, bins=50, normed=True, alpha=0.5, color="#0070FF") ax.plot(gmm_x, gmm_y, color="crimson", lw=4, label="GMM") ax.plot(gmm_x, gmm_y_sum, color="black", lw=4, label="Gauss_sum", line) # Annotate diagram ax.set_ylabel("Probability density") ax.set_xlabel("Arbitrary units") # Make legend plt.legend() plt.show()