# Detecting outliers in data with Python

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Demo: flag outliers in a synthetic two-feature data set with an
# Isolation Forest and show the result as a scatter plot.

# Build the synthetic data set: two independent standard-normal features.
np.random.seed(42)  # fixed seed so the generated data is reproducible
N_SAMPLES = 1000
N_OUTLIERS_PER_FEATURE = 50

data = {
    'feature1': np.random.normal(0, 1, N_SAMPLES),
    'feature2': np.random.normal(0, 1, N_SAMPLES),
}

# Inject outliers by overwriting some entries of each feature with values
# drawn uniformly from [10, 20). The positions are sampled WITHOUT
# replacement so that exactly N_OUTLIERS_PER_FEATURE distinct entries per
# feature are replaced (sampling with replacement can repeat an index and
# silently produce fewer outliers than requested).
outlier_rows = set()
for feature_name in ('feature1', 'feature2'):
    outlier_idx = np.random.choice(
        N_SAMPLES, N_OUTLIERS_PER_FEATURE, replace=False
    )
    data[feature_name][outlier_idx] = np.random.uniform(
        10, 20, N_OUTLIERS_PER_FEATURE
    )
    outlier_rows.update(outlier_idx.tolist())

df = pd.DataFrame(data)

# Standardize each feature (StandardScaler removes the mean and scales to
# unit variance).
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Build the model. `contamination` is the expected share of outliers in the
# data; it is derived from the number of rows that actually received an
# injected value. Because the two features are corrupted independently,
# that share is close to 10%, not 5%, and a hard-coded 0.05 would make the
# model label roughly half of the injected outliers as normal.
# NOTE: IsolationForest requires 0 < contamination <= 0.5.
contamination = len(outlier_rows) / N_SAMPLES
model = IsolationForest(contamination=contamination, random_state=42)
model.fit(df_scaled)

# Detect outliers. `predict` returns 1 for inliers and -1 for outliers;
# remap so the 'anomaly' column holds 0 = normal, 1 = anomaly.
predictions = model.predict(df_scaled)
df['anomaly'] = pd.Series(predictions, index=df.index).map({1: 0, -1: 1})

# Visualize the result: points are colored by their anomaly flag.
plt.scatter(df['feature1'], df['feature2'], c=df['anomaly'], cmap='coolwarm')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Anomaly Detection')
plt.show()