# Data set download (wine quality dataset): https://www.kaggle.com/datasets/shelvigarg/wine-quality-dataset
# Remove the first column, `type`, before running the analysis.
# Tested on Windows 11 Pro (21H2) with Python 3.8.5.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
# Load the wine-quality CSV into the `wine` DataFrame.  Every later step
# needs `wine`, so stop the script when the file is missing instead of
# continuing into a NameError.
try:
    wine = pd.read_csv('C:\\Users\\czliu\\Documents\\python\\winequalityN.csv', sep=',')
except FileNotFoundError:
    # Only the "file is missing" case is reported here; other failures
    # (permissions, malformed CSV, ...) propagate with their own traceback.
    print("Cannot find the file!")
    raise SystemExit(1)
wine.info()  # print column names, non-null counts and dtypes of the raw data

# Apply the clean-up that the notes in this file ask for:
#  * drop the textual `type` column (the features are scaled and correlated
#    as numbers further down); `errors='ignore'` keeps this a no-op when the
#    column was already removed from the CSV by hand,
#  * drop rows that contain missing values, because the random forest fitted
#    later presumably cannot handle NaN (true for the scikit-learn releases
#    that run on Python 3.8 -- verify for newer versions).
wine = wine.drop(columns=['type'], errors='ignore').dropna()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 type 6497 non-null object
1 fixed acidity 6487 non-null float64
2 volatile acidity 6489 non-null float64
3 citric acid 6494 non-null float64
4 residual sugar 6495 non-null float64
5 chlorides 6495 non-null float64
6 free sulfur dioxide 6497 non-null float64
7 total sulfur dioxide 6497 non-null float64
8 density 6497 non-null float64
9 pH 6488 non-null float64
10 sulphates 6493 non-null float64
11 alcohol 6497 non-null float64
12 quality 6497 non-null int64
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB
==================================
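Handling missing values in a DataFrame:
df.dropna(how='all')  # drop rows in which every value is missing
df.dropna(axis=1)  # drop columns that contain any missing value
df.dropna(axis=1, how = 'all')  # drop columns in which every value is missing
df.dropna(axis=0, subset=['name', 'age'])  # drop rows with a missing value in the name or age column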
'''
# Report how many rows are exact duplicates, remove them, then print the
# descriptive statistics of the remaining rows.
# The count has to be printed explicitly: a bare expression shows nothing
# when the file is run as a script.
print('Duplicate rows:', wine.duplicated().sum())
wine = wine.drop_duplicates()  # keep the first occurrence of each row
print('-'*25+'描述统计计算')
print(wine.describe())  # count / mean / std / min / quartiles / max per numeric column
'''
fixed acidity volatile acidity ... alcohol quality
count 6487.000000 6489.000000 ... 6497.000000 6497.000000
mean 7.216579 0.339691 ... 10.491801 5.818378
std 1.296750 0.164649 ... 1.192712 0.873255
min 3.800000 0.080000 ... 8.000000 3.000000
25% 6.400000 0.230000 ... 9.500000 5.000000
50% 7.000000 0.290000 ... 10.300000 6.000000
75% 7.700000 0.400000 ... 11.300000 6.000000
max 15.900000 1.580000 ... 14.900000 9.000000
[8 rows x 12 columns]
'''
# Number of rows per quality score.  Printed explicitly, because a bare
# expression produces no output when the file is run as a script.
print(wine.quality.value_counts())
print('-'*25)
'''
6 2836
5 2138
7 1079
4 216
8 193
3 30
9 5
Name: quality, dtype: int64
'''
# Pie chart with the percentage of rows for each quality score.
# The chart gets its own figure and is shown explicitly; without
# plt.show() a script run never displays it.
plt.figure()
wine.quality.value_counts().plot(kind='pie', autopct='%.2f')
plt.show()

# Correlation of every numeric column with `quality`.
# Restricting to numeric columns keeps this working when a text column is
# still present (recent pandas versions refuse to correlate such columns).
print(wine.select_dtypes(include='number').corr()['quality'])
'''
相关系数计算
fixed acidity -0.077031
volatile acidity -0.265953
citric acid 0.085706
residual sugar -0.036825
chlorides -0.200886
free sulfur dioxide 0.055463
total sulfur dioxide -0.041385
density -0.305858
pH 0.019366
sulphates 0.038729
alcohol 0.444319
quality 1.000000
Name:quality, dtype: float64
'''
# Mean volatile acidity and mean alcohol per quality score.
# Each bar plot is given its own figure: seaborn draws onto the current
# Axes, so without a new figure the second plot would be painted over the
# first one (and over any earlier chart that is still open).
plt.figure()
sns.barplot(x='quality', y='volatile acidity', data=wine)
plt.figure()
sns.barplot(x='quality', y='alcohol', data=wine)
plt.show()  # display the figures; nothing is shown in a script run otherwise
# Bucket the numeric quality score into three named classes, encode the
# classes as the integer column `label`, and split the frame into the
# feature table `x` and the target `y`.
from sklearn.preprocessing import LabelEncoder

bins = (2, 4, 6, 11)  # left-open, right-closed intervals: (2, 4], (4, 6], (6, 11]
group_names = ['low', 'medium', 'high']
wine['quality_lb'] = pd.cut(wine.quality, bins, labels=group_names)

# Integer codes are assigned by LabelEncoder (presumably in sorted class
# order, i.e. high=0, low=1, medium=2 -- check lb_quality.classes_).
lb_quality = LabelEncoder()
wine['label'] = lb_quality.fit_transform(wine['quality_lb'])
print(wine['label'].value_counts())

# Keep a full snapshot (including quality and quality_lb) before the two
# source columns of the label are removed from the working frame.
wine_copy = wine.copy()
wine.drop(columns=['quality', 'quality_lb'], inplace=True)

# `label` is the last remaining column: everything else is a feature.
x = wine.drop(columns='label')
y = wine['label']
print(x, '-'*25, y, '='*25, sep='\n')
# Hold out 20 % of the rows for evaluation and standardise the features.
from sklearn.model_selection import train_test_split
# `scale` is no longer used in this step; the import is kept so that any
# later code relying on the name keeps working.
from sklearn.preprocessing import StandardScaler, scale

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Learn mean and standard deviation from the training rows only and apply
# the same transformation to the test rows.  Scaling the test set with its
# own statistics would put the two sets on different scales and let
# information about the held-out data shape its own preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Random-forest classification of the quality classes, evaluated with a
# confusion matrix on the held-out rows.
from sklearn.metrics import confusion_matrix

rfc = RandomForestClassifier(n_estimators=200)  # forest of 200 trees
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# Confusion matrix: each row is a true class, each column a predicted class,
# so the diagonal holds the correctly classified rows.
print(confusion_matrix(y_test, y_pred))
# Reference: Zhang Li's Coursera course "用python玩转数据"
# (data processing with Python), week 4, part 2.
# NOTE: rows with missing values must be removed from the data before the
# model is fitted.