# Python数据收集、整理、变换、分析——随机森林

# 数据集(wine quality dataset)下载地址: https://www.kaggle.com/datasets/shelvigarg/wine-quality-dataset
# 分析前请先删除第一列 type
# 代码在 Win11 专业版(21H2)+ Python 3.8.5 下运行通过
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
# Load the wine-quality CSV into the module-level DataFrame `wine`.
# Only a missing file is reported with the friendly message; the script then
# stops, because every later statement needs `wine` and would otherwise fail
# with a confusing NameError. Other errors (e.g. a malformed CSV) propagate
# with their real traceback instead of being mislabeled as "file not found".
try:
    wine = pd.read_csv('C:\\Users\\czliu\\Documents\\python\\winequalityN.csv',sep=',')
except FileNotFoundError:
    print("Cannot find the file!")
    raise SystemExit(1)

wine.info()  # print column names, non-null counts and dtypes

'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB
==================================
dataframe 缺失值的处理:
df.dropna(how='all')#删除所有内容均为缺失值的行
df.dropna(axis=1) #丢弃有缺失值的列
df.dropna(axis=1, how = 'all') #丢弃所有列中所有值均缺失的列
df.dropna(axis=0, subset=['name', 'age'])#丢弃name和age这两列中有缺失值的行
'''
# Report how many rows are exact duplicates of an earlier row, then drop them.
# A bare expression shows nothing when the file is run as a script, so the
# count has to be printed explicitly.
print(wine.duplicated().sum())
wine = wine.drop_duplicates()  # remove the duplicated rows

print('-'*25+'描述统计计算')
print(wine.describe())  # descriptive statistics (count, mean, std, quartiles)
'''
       fixed acidity  volatile acidity  ...      alcohol      quality
count    6487.000000       6489.000000  ...  6497.000000  6497.000000
mean        7.216579          0.339691  ...    10.491801     5.818378
std         1.296750          0.164649  ...     1.192712     0.873255
min         3.800000          0.080000  ...     8.000000     3.000000
25%         6.400000          0.230000  ...     9.500000     5.000000
50%         7.000000          0.290000  ...    10.300000     6.000000
75%         7.700000          0.400000  ...    11.300000     6.000000
max        15.900000          1.580000  ...    14.900000     9.000000

[8 rows x 12 columns]
'''
# Count the wines in each quality score. The result must be printed: as a bare
# expression it is silently discarded outside an interactive session.
print(wine.quality.value_counts())
print('-'*25)
'''
6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64
'''
# Pie chart of the share of each quality score, labelled with percentages.
# Series.plot draws on the current axes, so open a dedicated figure and show it
# before anything else is drawn on top of it.
plt.figure()
wine.quality.value_counts().plot(kind='pie',autopct ='%.2f')
plt.show()

# Correlation of every numeric column with quality. The text column `type`
# (if it has not been removed from the file) is excluded first, because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
print(wine.select_dtypes(include='number').corr().quality)

'''
相关系数计算
fixed acidity          -0.077031
volatile acidity       -0.265953
citric acid             0.085706
residual sugar         -0.036825
chlorides              -0.200886
free sulfur dioxide     0.055463
total sulfur dioxide   -0.041385
density                -0.305858
pH                      0.019366
sulphates               0.038729
alcohol                 0.444319
quality                 1.000000
Name:quality, dtype: float64
'''
# Bar plots of volatile acidity and alcohol against the quality score.
# seaborn draws on the current axes, so each plot gets its own figure;
# otherwise the two charts would be painted on top of each other (and on top
# of any earlier chart). plt.show() is needed for the figures to appear when
# the file is run as a script.
plt.figure()
sns.barplot(x ='quality',y='volatile acidity',data=wine)
plt.figure()
sns.barplot(x ='quality',y='alcohol',data=wine)
plt.show()

# Bin the quality score into three grades, encode them as the integer target
# `label`, then build standardized train/test feature matrices.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# The raw CSV carries a text `type` column and a few missing measurements;
# neither can be standardized, so remove them here rather than relying on the
# file having been edited by hand. errors='ignore' keeps this working when the
# `type` column was already deleted.
wine = wine.drop(columns=['type'], errors='ignore').dropna().copy()

bins = (2, 4, 6, 11)  # left-open, right-closed intervals: (2,4], (4,6], (6,11]
group_names = ['low', 'medium', 'high']
wine['quality_lb'] = pd.cut(wine['quality'], bins=bins, labels=group_names)
# LabelEncoder numbers classes in sorted order: high=0, low=1, medium=2.
lb_quality = LabelEncoder()
wine['label'] = lb_quality.fit_transform(wine['quality_lb'])
print(wine.label.value_counts())
wine_copy = wine.copy()  # snapshot that still contains quality and quality_lb
wine = wine.drop(columns=['quality', 'quality_lb'])
x = wine.drop(columns=['label'])  # every remaining column except the target
y = wine.label
print(x)
print('-'*25)
print(y)
print('='*25)

# Stratify so the rare 'low' grade is represented in both splits, and fix the
# seed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

# Standardize with statistics learned from the training set only and reuse
# them for the test set; scaling each split by its own mean/std would put the
# two sets on different scales.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a random forest and print its confusion matrix on the held-out set.
from sklearn.metrics import confusion_matrix

# Forest of 200 trees; fit() returns the estimator, so it can be chained.
rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# Rows of the matrix are the actual labels, columns are the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
# 参见张莉在 coursera.org 上的《用Python玩转数据》课程,第四周第二部分。
# 注意:运行前请去除文件中的缺失数据