Studi Kasus: KNN pada RStudio

Download Data Training



Lakukan perintah dasar pada RStudio

> getwd()
[1] "C:/Users/ACER/Documents"

> MyData <- read.csv(file="dataTraining.csv", header=TRUE, sep=",")
> View(MyData)
> names(MyData)
 [1] "Win.Loss"       "Optimism"       "Pessimism"      "PastUsed"       "FutureUsed"     "PresentUsed" 
 [7] "OwnPartyCount"  "OppPartyCount"  "NumericContent" "Extra"          "Emoti"          "Agree"       
[13] "Consc"          "Openn" 
    
> attributes(MyData)
$names
 [1] "Win.Loss"       "Optimism"       "Pessimism"      "PastUsed"       "FutureUsed"     "PresentUsed" 
 [7] "OwnPartyCount"  "OppPartyCount"  "NumericContent" "Extra"          "Emoti"          "Agree"       
[13] "Consc"          "Openn"       

$class
[1] "data.frame"

$row.names
   [1]    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21
  [22]   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42
  [43]   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63
  [64]   64   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80   81   82   83   84
  [85]   85   86   87   88   89   90   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105
 [106]  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126
 [127]  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147

> dim(MyData)
[1] 1524   14

> typeof(MyData)
[1] "list"


> data1=MyData
> data1$Win.Loss = as.factor(data1$Win.Loss)
> data1$Win.Loss
   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  [54] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 [107] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 [160] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [213] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [266] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [319] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [372] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [425] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [478] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [531] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [584] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [637] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [690] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [743] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [796] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [849] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [902] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [955] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [ reached getOption("max.print") -- omitted 524 entries ]
Levels: 0 1


> index = createDataPartition(data1$Win.Loss, p = 0.7, list = F )
Error in createDataPartition(data1$Win.Loss, p = 0.7, list = F) :
  could not find function "createDataPartition"

#jika ada kesalahan createDataPartition, maka kita butuh import library caret
> library(caret) 

> index = createDataPartition(data1$Win.Loss, p = 0.7, list = F )
> dim(index)
[1] 1068    1
> typeof(index)
[1] "integer"

> index[1:30,1]
 [1]  1  3  5  7  9 10 11 12 13 14 15 16 18 19 20 22 24 25 29 30 32 33 34 35 36 37 38 39 40 41

#atau bagian akhirnya:
> index
[993,]      1419
 [994,]      1420
 [995,]      1421
 [996,]      1422
 [997,]      1423
 [998,]      1424
 [999,]      1425
[1000,]      1426
 [ reached getOption("max.print") -- omitted 68 rows ]

> train = data1[index,]
> train
     Win.Loss    Optimism   Pessimism   PastUsed FutureUsed PresentUsed OwnPartyCount OppPartyCount
1           1 0.104504505 0.050450450 0.43814433 0.49484536  0.06701031             2             2
3           1 0.112571898 0.049301561 0.41596639 0.51680672  0.06722689             1             1
5           1 0.105826397 0.051724138 0.33426184 0.58217270  0.08356546             3             4
7           1 0.098382749 0.064016173 0.32407407 0.60185185  0.07407407             6             4
9           1 0.106107341 0.046884639 0.36335404 0.53726708  0.09937888             2             5

 > dim(train)
[1] 1068   14


> validation = data1[-index,]
> validation
     Win.Loss   Optimism   Pessimism   PastUsed FutureUsed PresentUsed OwnPartyCount OppPartyCount
2           1 0.11457521 0.059236165 0.29126214 0.62135922  0.08737864             1             4
4           1 0.10723350 0.046319797 0.46349206 0.46666667  0.06984127             1             3
6           1 0.07586207 0.034482759 0.28000000 0.52000000  0.20000000             0             0
8           1 0.10377924 0.056388722 0.36927224 0.54986523  0.08086253             2             4

> dim(validation)
[1] 456  14


> head(data1)
  Win.Loss   Optimism  Pessimism  PastUsed FutureUsed PresentUsed OwnPartyCount OppPartyCount NumericContent
1        1 0.10450450 0.05045045 0.4381443  0.4948454  0.06701031             2             2    0.001877543
2        1 0.11457521 0.05923617 0.2912621  0.6213592  0.08737864             1             4    0.001418909
3        1 0.11257190 0.04930156 0.4159664  0.5168067  0.06722689             1             1    0.002131163
4        1 0.10723350 0.04631980 0.4634921  0.4666667  0.06984127             1             3    0.001871715
5        1 0.10582640 0.05172414 0.3342618  0.5821727  0.08356546             3             4    0.002229220
6        1 0.07586207 0.03448276 0.2800000  0.5200000  0.20000000             0             0    0.003290827

  Extra Emoti Agree Consc Openn
1 4.041 4.049 3.469 2.450 2.548
2 3.446 3.633 3.528 2.402 2.831
3 3.463 4.039 3.284 2.159 2.465
4 4.195 4.661 4.007 2.801 3.067
5 4.658 4.023 3.283 2.415 2.836
6 2.843 3.563 3.075 1.769 1.479



> levels(train$Win.Loss) <- make.names(levels(factor(train$Win.Loss)))
> levels(train$Win.Loss)
[1] "X0" "X1"

> head(validation)
   Win.Loss   Optimism  Pessimism  PastUsed FutureUsed PresentUsed OwnPartyCount OppPartyCount NumericContent
2         1 0.11457521 0.05923617 0.2912621  0.6213592  0.08737864             1             4    0.001418909
4         1 0.10723350 0.04631980 0.4634921  0.4666667  0.06984127             1             3    0.001871715
6         1 0.07586207 0.03448276 0.2800000  0.5200000  0.20000000             0             0    0.003290827
8         1 0.10377924 0.05638872 0.3692722  0.5498652  0.08086253             2             4    0.002215028
17        1 0.11289199 0.05505227 0.3891051  0.5214008  0.08949416             2             7    0.001165647
21        1 0.11466373 0.03858875 0.2736842  0.6210526  0.10526316             1             7    0.003105161
   Extra Emoti Agree Consc Openn
2  3.446 3.633 3.528 2.402 2.831
4  4.195 4.661 4.007 2.801 3.067
6  2.843 3.563 3.075 1.769 1.479
8  4.027 4.631 3.920 2.417 2.291
17 4.086 4.173 3.368 2.348 2.412
21 3.770 3.858 2.874 1.949 2.006


> levels(validation$Win.Loss) <- make.names(levels(factor(validation$Win.Loss)))
> levels(validation$Win.Loss)
[1] "X0" "X1"


#Membuat Method

> repeats = 3
> numbers = 10
> tunel = 10
> set.seed(1234)
> x = trainControl(method = "repeatedcv",number = numbers,repeats = repeats,classProbs = TRUE,summaryFunction = twoClassSummary)
> dim(x)
NULL
> x
$method
[1] "repeatedcv"

$number
[1] 10

$repeats
[1] 3

$search
[1] "grid"

$p
[1] 0.75

$initialWindow
NULL

$horizon
[1] 1

$fixedWindow
[1] TRUE

$skip
[1] 0

$verboseIter
[1] FALSE

$returnData
[1] TRUE

$returnResamp
[1] "final"

$savePredictions
[1] FALSE

$classProbs
[1] TRUE

$summaryFunction
function (data, lev = NULL, model = NULL) 
{
    lvls <- levels(data$obs)
    if (length(lvls) > 2) 
        stop(paste("Your outcome has", length(lvls), "levels. The twoClassSummary() function isn't appropriate."))
    requireNamespaceQuietStop("ModelMetrics")
    if (!all(levels(data[, "pred"]) == lvls)) 
        stop("levels of observed and predicted data do not match")
    rocAUC <- ModelMetrics::auc(ifelse(data$obs == lev[2], 0, 
        1), data[, lvls[1]])
    out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"], 
        lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
    names(out) <- c("ROC", "Sens", "Spec")
    out
}
<environment: namespace:caret>

$selectionFunction
[1] "best"

$preProcOptions
$preProcOptions$thresh
[1] 0.95

$preProcOptions$ICAcomp
[1] 3

$preProcOptions$k
[1] 5

$preProcOptions$freqCut
[1] 19

$preProcOptions$uniqueCut
[1] 10

$preProcOptions$cutoff
[1] 0.9


$sampling
NULL

$index
NULL

$indexOut
NULL

$indexFinal
NULL

$timingSamps
[1] 0

$predictionBounds
[1] FALSE FALSE

$seeds
[1] NA

$adaptive
$adaptive$min
[1] 5

$adaptive$alpha
[1] 0.05

$adaptive$method
[1] "gls"

$adaptive$complete
[1] TRUE


$trim
[1] FALSE

$allowParallel
[1] TRUE

> model1 <- train(Win.Loss~. , data = train, method = "knn",preProcess = c("center","scale"),trControl = x,metric = "ROC",tuneLength = tunel)
> model1
k-Nearest Neighbors 

1068 samples
  13 predictor
   2 classes: 'X0', 'X1' 

Pre-processing: centered (13), scaled (13) 
Resampling: Cross-Validated (10 fold, repeated 3 times) 
Summary of sample sizes: 962, 961, 961, 961, 961, 961, ... 
Resampling results across tuning parameters:

  k   ROC        Sens       Spec     
   5  0.8387421  0.6956446  0.8320591
   7  0.8465032  0.6686218  0.8499611
   9  0.8435422  0.6654665  0.8417793
  11  0.8448189  0.6620790  0.8453846
  13  0.8482900  0.6532133  0.8561150
  15  0.8486402  0.6397019  0.8586946
  17  0.8441997  0.6324623  0.8653380
  19  0.8407904  0.6196864  0.8658042
  21  0.8411525  0.6236934  0.8688967
  23  0.8411559  0.6165118  0.8714530

ROC was used to select the optimal model using the largest value.
The final value used for the model was k = 15.
> plot(model1)

Nilai ROC maksimum dicapai saat k = 15 (lihat output di atas: "The final value used for the model was k = 15"), artinya model KNN mencapai performa tertinggi pada k = 15.
> data1[11,1]
[1] 1

Kategori 1 (catatan: data1[11,1] hanya menampilkan nilai Win.Loss pada baris ke-11 dari data, dan tidak berkaitan dengan nilai parameter k hasil tuning)


Cat:
> data1[1:5,1:6]
  Win.Loss  Optimism  Pessimism  PastUsed FutureUsed PresentUsed
1        1 0.1045045 0.05045045 0.4381443  0.4948454  0.06701031
2        1 0.1145752 0.05923617 0.2912621  0.6213592  0.08737864
3        1 0.1125719 0.04930156 0.4159664  0.5168067  0.06722689
4        1 0.1072335 0.04631980 0.4634921  0.4666667  0.06984127
5        1 0.1058264 0.05172414 0.3342618  0.5821727  0.08356546
> train[1:5,1:6]
  Win.Loss   Optimism  Pessimism  PastUsed FutureUsed PresentUsed
1       X1 0.10450450 0.05045045 0.4381443  0.4948454  0.06701031
3       X1 0.11257190 0.04930156 0.4159664  0.5168067  0.06722689
5       X1 0.10582640 0.05172414 0.3342618  0.5821727  0.08356546
7       X1 0.09838275 0.06401617 0.3240741  0.6018519  0.07407407
9       X1 0.10610734 0.04688464 0.3633540  0.5372671  0.09937888
> validation[1:5,1:6]
   Win.Loss   Optimism  Pessimism  PastUsed FutureUsed PresentUsed
2        X1 0.11457521 0.05923617 0.2912621  0.6213592  0.08737864
4        X1 0.10723350 0.04631980 0.4634921  0.4666667  0.06984127
6        X1 0.07586207 0.03448276 0.2800000  0.5200000  0.20000000
8        X1 0.10377924 0.05638872 0.3692722  0.5498652  0.08086253
17       X1 0.11289199 0.05505227 0.3891051  0.5214008  0.08949416
> 


NB2:
library(ROCR)
Error in library(ROCR) : there is no package called ‘ROCR’
> install.packages("ROCR",dep=T)

+++++++++++++++++++++STUDI KASUS LAIN

download data latih: Nodal Involvement in Prostate Cancer





Komentar

Postingan populer dari blog ini

Filtering DataFrame Manual

Data Frame