# Quick interactive look at the APIs used by the exercises in this file.
import sklearn
from pandas import DataFrame
from sklearn import tree

# List every attribute name reachable on the DataFrame class.
print(dir(DataFrame))

# help() writes the documentation itself and returns None, so wrapping it in
# print() would only add a stray "None" line after the help text.
help(sklearn)

# List every name defined in the sklearn.tree module.
print(dir(tree))
# -------------------------------
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# Exercise case1_no1: overwrite the ten largest Feature1 values with the
# tenth-largest value, then average Feature1 over rows with Feature2 >= 80.
df = pd.read_csv("data/case1_no1.csv")
df.head()

# Sort by Feature1 (largest first) and renumber the rows so that labels
# 0..9 are exactly the ten largest values.
df = df.sort_values(by='Feature1', ascending=False).reset_index(drop=True)
df.head()
df.loc[9]

# .loc slices by label and includes both ends, so 0:9 covers ten rows.
df.loc[0:9, 'Feature1'] = df.loc[9, 'Feature1']
df.head()

# Mean of Feature1 restricted to rows whose Feature2 is at least 80.
df[df['Feature2'] >= 80]['Feature1']
answer = np.mean(df[df['Feature2'] >= 80]['Feature1'])
print(answer)
# ------------------------------------
# Exercise case1_no2: compare the standard deviation of Feature1 before and
# after filling its missing values with the median, on the top 80% of rows
# ordered by Feature2.
df = pd.read_csv('data/case1_no2.csv')
df.head()

# Number of rows that make up 80% of the data.
top_80 = round(len(df) * 0.8)
top_80

df = df.sort_values(by='Feature2', ascending=False).reset_index(drop=True)
df.head()

# NOTE(review): this slice was written with a literal 84; it is assumed to be
# the value of top_80 for this dataset -- confirm against the CSV.
old_df = df.iloc[:top_80, :]
old_std = np.std(old_df['Feature1'])

# Work on a copy so that filling the gaps does not alter old_df (or df).
new_df = old_df.copy()

# Median of the non-missing Feature1 values; np.median would return NaN if
# the missing entries were left in.
f1_med = np.median(new_df[new_df['Feature1'].notnull()]['Feature1'])
new_df['Feature1'] = new_df['Feature1'].fillna(f1_med)
new_df

new_std = np.std(new_df['Feature1'])

# Difference between the pre-fill and post-fill standard deviations.
solv = old_std - new_std
print(solv)
# --------------------------------------------
# Exercise case1_no3: mean of the Feature1 values that fall outside the
# 1.5 * IQR fences.
# Forward slash keeps the path portable and avoids the invalid "\c" escape.
df = pd.read_csv('data/case1_no3.csv')
df.head()

q1, q3 = np.percentile(df['Feature1'], [25, 75])
iqr = q3 - q1
l_limit = q1 - (iqr * 1.5)
u_limit = q3 + (iqr * 1.5)

# Each comparison needs its own parentheses: "|" binds tighter than "<"/">".
is_outlier = (df['Feature1'] < l_limit) | (df['Feature1'] > u_limit)

# np.where returns a tuple with one array per dimension; take the row positions.
outlier_index = np.where(is_outlier)[0]

# Positions come from np.where, so select with iloc rather than by label.
outlier_mean = np.mean(df['Feature1'].iloc[outlier_index])
print(outlier_mean)
# --------------------------------------------------
import numpy as np
import pandas as pd

# NOTE(review): the directory was spelled "dat/"; changed to "data/" to match
# the other 3rd_*.csv paths in this file -- confirm the real location.
df = pd.read_csv("data/3rd_housing.csv")

# Step 1: drop every row that contains a missing value.
new_df = df.dropna()

# Step 2: keep the first 70% of the remaining rows.
index_70 = round(len(new_df) * 0.7)
# Slice rows (first axis); all columns are kept.
new_df = new_df.iloc[:index_70, :]

# Step 3: compute the first quartile (Q1) of housing_median_age.
new_df['housing_median_age']
q1 = np.percentile(new_df['housing_median_age'], 25)
print(q1)
# ------------------------------------------
df = pd.read_csv("data/3rd_disease.csv")

# Step 1: take the first row whose year is 2000, skip the first column, and
# average the remaining values.
year_2000 = df[df['year'] == 2000].iloc[0, 1:]
year_2000.mean()

# Step 2: flag the columns whose year-2000 value is at least 48.
# NOTE(review): 48 is hard-coded; confirm whether it is meant to be the mean
# computed in step 1 (presumably each column is one nation).
much_nation = year_2000 >= 48

# Summing a boolean Series counts its True entries; len() would count every
# column whether or not it passed the threshold.
print(int(much_nation.sum()))
# ----------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# NOTE(review): train and test are read from the same file, so the accuracy
# printed below is measured on the rows the model was fitted on. Confirm
# whether a separate test file (or a train_test_split hold-out) was intended.
df_train = pd.read_csv("data/3rd_travel.csv")
df_test = pd.read_csv("data/3rd_travel.csv")
df_train.shape, df_test.shape
df_train.info()
df_test.info()
df_train['TravelInsurance'].value_counts()
df_train

# The last column is used as the target; everything before it is a feature.
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]

# One-hot encode the categorical features.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
# Encoding each frame separately can yield different columns when a category
# appears in only one of them; force the test frame onto the training layout.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Fixed seed so repeated runs give the same score.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest accuracy score : {accuracy:.3}')
# --------------------------------
import pandas as pd

# Print the whole-number gap between the 25th and 75th percentiles of "age".
df = pd.read_csv("..input dddd.csv")
df

age = df['age']
quartile_gap = age.quantile(.25) - age.quantile(.75)
# The gap is negative as written, so take its magnitude before truncating.
print(int(abs(quartile_gap)))
# ----------------------------------
import pandas as pd

df = pd.read_csv("ddfdfda.csv")

# Share of all reactions that are "loves" or "wows".
r = (df['loves'] + df['wows']) / df['reactions']

# Rows where that share exceeds 0.4.
cond_1 = r > 0.4
cond_1
# ---------------------------------
import pandas as pd

df = pd.read_csv("dff.csv")

# Share of all reactions that are "loves" or "wows".
r = (df['loves'] + df['wows']) / df['reactions']

# Keep rows whose share lies strictly between 0.4 and 0.5 ...
cond_1 = r > 0.4
cond_2 = r < 0.5
# ... and whose type is "video" ("==" compares; "=" would assign).
cond_3 = df['type'] == "video"

# All three masks must hold, so combine them with "&".
df[cond_1 & cond_2 & cond_3]
print(len(df[cond_1 & cond_2 & cond_3]))
# ------------------------------------
import pandas as pd

# First look: load one file, show its structure, and try parsing the dates.
df = pd.read_csv("dfdga.csv")
df
df.info()
pd.to_datetime(df['date_added'])

# Second pass: load the other file and store the parsed dates in the column.
df = pd.read_csv("dfdf.csv")
parsed_dates = pd.to_datetime(df['date_added'])
df['date_added'] = parsed_dates

# One boolean mask per condition: added in 2018, in January, from the UK.
added_on = df['date_added'].dt
cond_1 = added_on.year.eq(2018)
cond_2 = added_on.month.eq(1)
cond_3 = df['country'].eq("United Kingdom")

# Select the rows that satisfy all three conditions and report how many.
matching_rows = df[cond_1 & cond_2 & cond_3]
matching_rows
print(len(matching_rows))
# -------------------------------
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv("dddfff.csv")
test = pd.read_csv("dfdff.csv")

# Initial inspection of both frames.
train.shape
test.shape
test.head()
train['Segmentation'].value_counts()
train.isnull().sum()
test.isnull().sum()
train.info()

# Separate the label from the training features.
target = train.pop('Segmentation')

# Remove ID from both frames exactly once: it is kept for the submission on
# the test side and discarded on the training side. A second pop on the same
# frame would raise KeyError because the column is already gone.
test_ID = test.pop('ID')
train = train.drop("ID", axis=1)
train.head()
train.info()
train.describe(include="object")

# One-hot encode the categorical columns.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# Encoding each frame separately can yield different columns when a category
# appears in only one of them; force the test frame onto the training layout.
test = test.reindex(columns=train.columns, fill_value=0)
train.info()
train.head(1)
test_ID

# Fit on the training data and predict a segment for every test row.
rf = RandomForestClassifier(random_state=0)
rf.fit(train, target)
pred = rf.predict(test)
pred

# Write the predictions next to their IDs, without the index column.
submit = pd.DataFrame({
    'ID': test_ID,
    'Segmentation': pred,
})
submit.to_csv("dff.csv", index=False)