From 89983825744030fc7eaad8dd9bd486d82ca9f9ba Mon Sep 17 00:00:00 2001 From: JannisGottwald <53615516+JannisGottwald@users.noreply.github.com> Date: Fri, 16 Sep 2022 12:37:17 +0300 Subject: [PATCH 1/5] Delete html directory --- .../Model_training_tuning_and_validation.html | 1479 ---------- html/bat_data_HGAM_tutorial.html | 2393 ----------------- ...-Tutorial-for-activity-classification.html | 2082 -------------- 3 files changed, 5954 deletions(-) delete mode 100644 html/Model_training_tuning_and_validation.html delete mode 100644 html/bat_data_HGAM_tutorial.html delete mode 100644 html/tRackIT-Tutorial-for-activity-classification.html diff --git a/html/Model_training_tuning_and_validation.html b/html/Model_training_tuning_and_validation.html deleted file mode 100644 index 432906c..0000000 --- a/html/Model_training_tuning_and_validation.html +++ /dev/null @@ -1,1479 +0,0 @@ - - - - -
- - - - - - - - -Small movements of tagged animals result in discernible variations in the strength of the received signal (Cochran et al. (1965); Kjos and Cochran (1970)) that reflect changes in the angle and distance between the transmitter and receiver. Kays et al. (2011) proposed a method for automatically classifying active and passive behaviour based on a threshold difference in the signal strength of successive VHF signals recorded by a customised automatic radio-tracking system. However, machine learning (ML) algorithms are optimised for the recognition of complex patterns in a dataset and are typically robust against factors that influence signal propagation, such as changes in temperature and humidity, physical contact with conspecifics and/or multipath signal propagation (Alade (2013)). Accordingly, a ML model trained with a dataset encompassing the possible diversity of signal patterns related to active and passive behaviour can be expected to perform at least as well as a threshold-based approach. In this work, we built on the methodology of Kays et al. (2011) by calibrating two random forest models (1 for data comming from only one receiver and one for data coming from at least two receivers), based on millions of data points representing the behaviours of multiple tagged individuals of two temperate bat species (Myotis bechsteinii, Nyctalus leisleri).
-The method was tested by applying it to independent data from bats, humans, and a bird species and then comparing the results with those obtained using the threshold-based approach of Kays et al. (2011).
-In order to make our work comprehensible, code and data are made available to all interested parties here (https://doi.org/10.17192/fdr/81). This resource contains the following steps:
-But before we get started:
-Although deep learning methods have been successfully applied to several ecological problems where large amounts of data are available (Christin, Hervet, and Lecomte (2019)), we use a random forest model due to the following reasons:
-For model training and tuning we use the caret
R-Package (Kuhn 2008). For the forward feature selection we use the CAST
R-Package developed by Meyer et al. 2018.
Additional packages needed are: randomForest
,ranger
, doParallel
, MLeval
, data.table
, dplyr
, plyr
Load packages
# Load packages.
# NOTE: plyr is attached BEFORE dplyr on purpose — attaching plyr after dplyr
# masks dplyr::mutate()/summarise() and silently breaks the grouped pipelines
# (group_by() %>% mutate(lag(...))) used further down in this script.
library(plyr)
library(dplyr)
library(caret)
library(randomForest)
library(ranger)
library(doParallel)
library(MLeval)
library(CAST)
library(data.table)
Only one antenna is necessary to classify VHF signals into active vs. passive states (Kays et al. 2011). However, agreement between receivers of the same station provides additional information and can improve the reliability of the classification. Our groundtruth dataset was balanced by randomly down-sampling the activity class with the most data to the amount of data contained by the class with the least data. These balanced datasets were then split into 50% training data and 50% test data for data originating from one receiver. The same procedure was used for data derived from the signals of two receivers, resulting in two training and two test datasets. From a total of 3,243,753 VHF signals, 124,898 signals were assigned to train the two-receiver model and 294,440 signals to train the one-receiver model (Table 1).
-Since not all variables are equally important to the model and some may even be misleading,we performed a forward feature selection on 50% of the training data. The forward feature selection algorithm implemented in the R package CAST (Meyer et al. (2018)) selects the best pair of all possible two variable combinations by evaluating the performance of a k-fold cross-validation (CV). The algorithm iteratively increases the number of predictors until no improvement of the performance is achieved by adding further variables.
######################### 1 Receiver ########################

# Balanced training data for the 1-receiver model; last column is the
# response ("Class"), all other columns are candidate predictors.
data_1 <- readRDS("model_tunig_and_validation/data/batsTrain_1_receiver.rds")

table(data_1$Class)
##
##  active passive
##  294173  294173

############## FFS ############################

# Candidate predictors: every column except the response (last column).
predictors <- names(data_1[, -ncol(data_1)])

# Parallel backend — caret/CAST distribute the CV folds across 10 workers.
cl <- makeCluster(10)
registerDoParallel(cl)

# 10-fold cross-validation
ctrl <- trainControl(
  method = "cv",
  number = 10
)

# Run forward feature selection with a random forest learner and 10-fold CV,
# selecting on Kappa (data is balanced, so Kappa and Accuracy agree closely).
set.seed(10)
ffsmodel <- ffs(
  predictors = data_1[, predictors],
  response   = data_1$Class,
  method     = "rf",
  metric     = "Kappa",
  tuneLength = 1,
  trControl  = ctrl,
  verbose    = TRUE
)

# Variables retained by the forward selection
ffsmodel$selectedvars

saveRDS(ffsmodel, "model_tunig_and_validation/models/m_r1.rds")

stopCluster(cl)
Red dots display two-variables combinations, dots with the colors from yellow to pink stand for models to each of which another variable has been added. Dots with a black border mark the optimal variable combination in the respective iteration.
# Reload the fitted 1-receiver FFS model and print its CV performance.
m1 <- readRDS("model_tunig_and_validation/models/m_r1.rds")
print(m1)
## Random Forest
-##
-## 588346 samples
-## 7 predictor
-## 2 classes: 'active', 'passive'
-##
-## No pre-processing
-## Resampling: Cross-Validated (10 fold)
-## Summary of sample sizes: 529512, 529512, 529511, 529512, 529511, 529511, ...
-## Resampling results:
-##
-## Accuracy Kappa
-## 0.9631628 0.9263257
-##
-## Tuning parameter 'mtry' was held constant at a value of 2
# Visualize the FFS iterations and the variable importance of the final model.
print(plot_ffs(m1))
plot(varImp(m1))
######################### 2 Receivers ########################

# Balanced training data for the 2-receiver model.
data_2 <- readRDS("model_tunig_and_validation/data/batsTrain_2_receivers.rds")

table(data_2$Class)
##
##  active passive
##  110274  110274

# Candidate predictors: every column except the response (last column).
predictors <- names(data_2[, -ncol(data_2)])

############## FFS #####

cl <- makeCluster(10)
registerDoParallel(cl)

# 10-fold cross-validation
ctrl <- trainControl(
  method = "cv",
  number = 10
)

# Run forward feature selection (random forest, Kappa as selection metric).
set.seed(10)
ffsmodel <- ffs(
  predictors = data_2[, predictors],
  response   = data_2$Class,
  method     = "rf",
  metric     = "Kappa",
  tuneLength = 1,
  trControl  = ctrl,
  verbose    = TRUE
)

# Variables retained by the forward selection
ffsmodel$selectedvars

saveRDS(ffsmodel, "model_tunig_and_validation/models/m_r2.rds")

stopCluster(cl)
Red dots display two-variables combinations, dots with the colors from yellow to pink stand for models to each of which another variable has been added. Dots with a black border mark the optimal variable combination in the respective iteration.
# Reload the fitted 2-receiver FFS model and print its CV performance.
m2 <- readRDS("model_tunig_and_validation/models/m_r2.rds")
print(m2)
## Random Forest
-##
-## 220548 samples
-## 8 predictor
-## 2 classes: 'active', 'passive'
-##
-## No pre-processing
-## Resampling: Cross-Validated (10 fold)
-## Summary of sample sizes: 198494, 198493, 198494, 198494, 198492, 198492, ...
-## Resampling results:
-##
-## Accuracy Kappa
-## 0.9740011 0.9480022
-##
-## Tuning parameter 'mtry' was held constant at a value of 2
# Visualize the FFS iterations and the variable importance of the final model.
print(plot_ffs(m2))
plot(varImp(m2))
Random forest is an algorithm which is far less tunable than other algorithms such as support vector machines (Probst, Wright, and Boulesteix (2019)) and is known to provide good results in the default settings of existing software packages (Fernández-Delgado et al., 2014). Even though the performance gain is still low, tuning the parameter mtry provides the biggest average improvement of the AUC (0.006) (Probst et al. 2018). Mtry is defined as the number of randomly drawn candidate variables out of which each split is selected when growing a tree. Here we reduce the existing predictor variables to those selected by the forward feature selection and iteratively increase the number of randomly drawn candidate variables from 1 to the total number of selected variables. Other parameters, such as the number of trees, are held constant according to default settings in the packages used.
# Reduce the training data to the FFS-selected variables plus the response.
predictors <- names(data_1[, c(m1$selectedvars, "Class")])
batsTune   <- data_1[, predictors]

# Tune mtry (number of candidate variables drawn per split); 1000 trees.
ctrl <- trainControl(
  method      = "cv",   # 10-fold CV
  number      = 10,
  verboseIter = TRUE
)

# NOTE(review): splitrule and min.node.size are tuning parameters of caret's
# "ranger" method, not of "rf" (randomForest) — and the printed model output
# below reports them — so the method must be "ranger" and the tree count is
# passed as num.trees.
tunegrid <- expand.grid(
  mtry          = 1:(length(predictors) - 1),  # mtry specified here
  splitrule     = "gini",
  min.node.size = 10
)

tuned_model <- train(
  Class ~ .,
  data      = batsTune,
  method    = "ranger",
  metric    = "Kappa",
  tuneGrid  = tunegrid,
  num.trees = 1000,
  trControl = ctrl
)

saveRDS(tuned_model, "model_tunig_and_validation/models/m_r1_tuned.rds")
# Reload the tuned 1-receiver model and print the CV results per mtry value.
m1_tuned <- readRDS("model_tunig_and_validation/models/m_r1_tuned.rds")
print(m1_tuned)
## Random Forest
-##
-## 588346 samples
-## 7 predictor
-## 2 classes: 'active', 'passive'
-##
-## No pre-processing
-## Resampling: Cross-Validated (10 fold)
-## Summary of sample sizes: 529510, 529512, 529511, 529511, 529511, 529512, ...
-## Resampling results across tuning parameters:
-##
-## mtry Accuracy Kappa
-## 1 0.9591601 0.9183202
-## 2 0.9616518 0.9233036
-## 3 0.9619646 0.9239291
-## 4 0.9618371 0.9236742
-## 5 0.9615039 0.9230079
-## 6 0.9610569 0.9221139
-## 7 0.9602819 0.9205637
-##
-## Tuning parameter 'splitrule' was held constant at a value of gini
-##
-## Tuning parameter 'min.node.size' was held constant at a value of 10
-## Kappa was used to select the optimal model using the largest value.
-## The final values used for the model were mtry = 3, splitrule = gini
-## and min.node.size = 10.
# Reduce the training data to the FFS-selected variables plus the response.
predictors <- names(data_2[, c(m2$selectedvars, "Class")])
batsTune   <- data_2[, predictors]

# Tune mtry (number of candidate variables drawn per split); 1000 trees.
ctrl <- trainControl(
  method      = "cv",   # 10-fold CV
  number      = 10,
  verboseIter = TRUE
)

# NOTE(review): splitrule/min.node.size belong to caret's "ranger" method
# (see printed output below), so method = "ranger" with num.trees, not "rf".
tunegrid <- expand.grid(
  mtry          = 1:(length(predictors) - 1),  # mtry specified here
  splitrule     = "gini",
  min.node.size = 10
)

tuned_model_2 <- train(
  Class ~ .,
  data      = batsTune,
  method    = "ranger",
  metric    = "Kappa",
  tuneGrid  = tunegrid,
  num.trees = 1000,
  trControl = ctrl
)

print(tuned_model_2)

saveRDS(tuned_model_2, "model_tunig_and_validation/models/m_r2_tuned.rds")
# Reload the tuned 2-receiver model and print the CV results per mtry value.
m2_tuned <- readRDS("model_tunig_and_validation/models/m_r2_tuned.rds")
print(m2_tuned)
## Random Forest
-##
-## 220548 samples
-## 8 predictor
-## 2 classes: 'active', 'passive'
-##
-## No pre-processing
-## Resampling: Cross-Validated (10 fold)
-## Summary of sample sizes: 198494, 198494, 198493, 198492, 198493, 198494, ...
-## Resampling results across tuning parameters:
-##
-## mtry Accuracy Kappa
-## 1 0.9719608 0.9439215
-## 2 0.9724187 0.9448374
-## 3 0.9717975 0.9435951
-## 4 0.9712988 0.9425976
-## 5 0.9710041 0.9420081
-## 6 0.9707139 0.9414277
-## 7 0.9703285 0.9406569
-## 8 0.9702605 0.9405209
-##
-## Tuning parameter 'splitrule' was held constant at a value of gini
-##
-## Tuning parameter 'min.node.size' was held constant at a value of 10
-## Kappa was used to select the optimal model using the largest value.
-## The final values used for the model were mtry = 2, splitrule = gini
-## and min.node.size = 10.
-Both models (based on data from 1 receiver and 2 receivers) had very high performance metrics (Kappa, Accuracy) with slightly better results for the 2 receivers model. Tuning the mtry parameter did not increase the performance, which indicates that for our use case default settings are a good choice.
-For the Validation of the model performance and applicability to species with different movement behaviour (speed etc. than bats) we generated three different data sets. + 1. We put 50% of our bat data aside + 2. We collected ground truth data of a tagged medium spotted woodpecker + 3. We simulated different movement intensities by humans carrying transmitters through the forest
-In this section we will test how well the models perform in terms of different performance metrics such as F-Score, Accuracy, ROC-AUC
-We first take a look at the 50% test data that has been put aside for evaluation. Here we actually perform the prediction using the two trained models. For the woodpecker and human walk data set we will use already predicted data that has been processed by script validation_woodpecker
and validation_human_activity
.
# Testdata 1 receiver — the held-out 50% used only for evaluation.
Test_1 <- readRDS("model_tunig_and_validation/data/batsTest_1_receiver.rds")
print(table(Test_1$Class))
##
##  active passive
##  294172  294172

# Default column names as expected by caret (obs / pred).
Test_1$obs <- factor(Test_1$Class)

# Binary class prediction
pred1 <- predict(m1, Test_1)
Test_1$pred <- factor(pred1)

# Class probabilities (needed for ROC-AUC)
prob <- predict(m1, Test_1, type = "prob")
Test_1 <- cbind(Test_1, prob)
# Calculate ROC-AUC from the predicted probabilities and observed classes.
roc1 <- MLeval::evalm(data.frame(prob, Test_1$obs))
saveRDS(roc1, "model_tunig_and_validation/results/roc_1receiver.rds")
# Confusion matrix: predictions vs. ground-truth classes.
cm_r1 <- confusionMatrix(factor(Test_1$pred), factor(Test_1$Class))
print(cm_r1)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 282864 10218
-## passive 11308 283954
-##
-## Accuracy : 0.9634
-## 95% CI : (0.9629, 0.9639)
-## No Information Rate : 0.5
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.9268
-##
-## Mcnemar's Test P-Value : 1.15e-13
-##
-## Sensitivity : 0.9616
-## Specificity : 0.9653
-## Pos Pred Value : 0.9651
-## Neg Pred Value : 0.9617
-## Prevalence : 0.5000
-## Detection Rate : 0.4808
-## Detection Prevalence : 0.4981
-## Balanced Accuracy : 0.9634
-##
-## 'Positive' Class : active
-##
-print(cm_r1$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9615599 0.9652652 0.9651360
-## Neg Pred Value Precision Recall
-## 0.9617018 0.9651360 0.9615599
-## F1 Prevalence Detection Rate
-## 0.9633447 0.5000000 0.4807800
-## Detection Prevalence Balanced Accuracy
-## 0.4981473 0.9634126
# ROC / sensitivity / specificity summary on the 1-receiver test data.
twoClassSummary(Test_1, lev = levels(Test_1$obs))
## ROC Sens Spec
## 0.9942587 0.9615599 0.9652652
roc1 <- readRDS("model_tunig_and_validation/results/roc_1receiver.rds")
print(roc1$roc)
# Two receivers — held-out test data.
Test_2 <- readRDS("model_tunig_and_validation/data/batsTest_2_receivers.rds")

table(Test_2$Class)
##
##  active passive
##  110273  110273

# Default column names as expected by caret (obs / pred).
Test_2$obs <- Test_2$Class

# Binary class prediction
pred2 <- predict(m2, Test_2)
Test_2$pred <- pred2

# Class probabilities (needed for ROC-AUC)
prob2 <- predict(m2, Test_2, type = "prob")
Test_2 <- cbind(Test_2, prob2)
# Calculate ROC-AUC and the confusion matrix for the 2-receiver model.
roc2 <- MLeval::evalm(data.frame(prob2, Test_2$obs))
saveRDS(roc2, "model_tunig_and_validation/results/roc_2receivers.rds")

cm_r2 <- confusionMatrix(factor(Test_2$pred), factor(Test_2$obs))
print(cm_r2)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 107571 2745
-## passive 2702 107528
-##
-## Accuracy : 0.9753
-## 95% CI : (0.9746, 0.9759)
-## No Information Rate : 0.5
-## P-Value [Acc > NIR] : <2e-16
-##
-## Kappa : 0.9506
-##
-## Mcnemar's Test P-Value : 0.5693
-##
-## Sensitivity : 0.9755
-## Specificity : 0.9751
-## Pos Pred Value : 0.9751
-## Neg Pred Value : 0.9755
-## Prevalence : 0.5000
-## Detection Rate : 0.4877
-## Detection Prevalence : 0.5002
-## Balanced Accuracy : 0.9753
-##
-## 'Positive' Class : active
-##
-print(cm_r2$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9754972 0.9751072 0.9751169
-## Neg Pred Value Precision Recall
-## 0.9754876 0.9751169 0.9754972
-## F1 Prevalence Detection Rate
-## 0.9753070 0.5000000 0.4877486
-## Detection Prevalence Balanced Accuracy
-## 0.5001950 0.9753022
# ROC / sensitivity / specificity summary on the 2-receiver TEST data.
# FIX(review): the original called twoClassSummary(data_2, ...) — the training
# set, which carries no obs/pred/probability columns; Test_2 is intended.
twoClassSummary(Test_2, lev = levels(Test_2$obs))
roc2 <- readRDS("model_tunig_and_validation/results/roc_2receivers.rds")
print(roc2$roc)
# Woodpecker ground truth — already predicted by script validation_woodpecker.
wp <- readRDS("model_tunig_and_validation/data/woodpecker_groundtruth.rds")

# caret-style obs/pred factors
wp$obs  <- as.factor(wp$observed)
wp$pred <- as.factor(wp$prediction)

# Confusion matrix: predictions vs. field observations.
cm_wp <- confusionMatrix(wp$pred, wp$obs)
print(cm_wp)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 8309 31
-## passive 432 7969
-##
-## Accuracy : 0.9723
-## 95% CI : (0.9697, 0.9748)
-## No Information Rate : 0.5221
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.9447
-##
-## Mcnemar's Test P-Value : < 2.2e-16
-##
-## Sensitivity : 0.9506
-## Specificity : 0.9961
-## Pos Pred Value : 0.9963
-## Neg Pred Value : 0.9486
-## Prevalence : 0.5221
-## Detection Rate : 0.4963
-## Detection Prevalence : 0.4982
-## Balanced Accuracy : 0.9734
-##
-## 'Positive' Class : active
-##
-print(cm_wp$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9505777 0.9961250 0.9962830
-## Neg Pred Value Precision Recall
-## 0.9485776 0.9962830 0.9505777
-## F1 Prevalence Detection Rate
-## 0.9728939 0.5221313 0.4963264
-## Detection Prevalence Balanced Accuracy
-## 0.4981781 0.9733514
# ROC summary and ROC curve for the woodpecker data.
print(twoClassSummary(wp, lev = levels(wp$obs)))
## ROC Sens Spec
## 0.9982197 0.9505777 0.9961250
roc_wp <- MLeval::evalm(data.frame(wp[, c("active", "passive")], wp$obs), plots = c("r"))
#print(roc_wp$roc)
# Human walk ground truth — already predicted by script validation_human_activity.
hm <- readRDS("model_tunig_and_validation/data/human_walk_groundtruth.rds")
hm$obs  <- factor(hm$observation)
hm$pred <- factor(hm$prediction)

# Confusion matrix: predictions vs. observations.
cm_hm <- confusionMatrix(factor(hm$pred), factor(hm$obs))
print(cm_hm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 25787 280
-## passive 717 5870
-##
-## Accuracy : 0.9695
-## 95% CI : (0.9675, 0.9713)
-## No Information Rate : 0.8117
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.9028
-##
-## Mcnemar's Test P-Value : < 2.2e-16
-##
-## Sensitivity : 0.9729
-## Specificity : 0.9545
-## Pos Pred Value : 0.9893
-## Neg Pred Value : 0.8911
-## Prevalence : 0.8117
-## Detection Rate : 0.7897
-## Detection Prevalence : 0.7983
-## Balanced Accuracy : 0.9637
-##
-## 'Positive' Class : active
-##
-print(cm_hm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9729475 0.9544715 0.9892584
-## Neg Pred Value Precision Recall
-## 0.8911492 0.9892584 0.9729475
-## F1 Prevalence Detection Rate
-## 0.9810352 0.8116617 0.7897042
-## Detection Prevalence Balanced Accuracy
-## 0.7982789 0.9637095
# ROC summary and ROC curve for the human walk data.
twoClassSummary(hm, lev = levels(hm$obs))
## ROC Sens Spec
## 0.9902507 0.9729475 0.9544715
roc_hm <- MLeval::evalm(data.frame(hm[, c("active", "passive")], hm$obs), plots = c("r"))
#print(roc_hm$roc)
Regardless of whether the models were tested on independent test data from bats or on data from other species (human, woodpecker), the performance metrics were always close to their maxima.
-The results of the ML-based approach were compared with those of a threshold-based approach (Kays et al. 2011)by calculating the difference in the signal strength between successive signals for all three test datasets (bats, bird, humans). We applied a threshold of 4 dB which was deemed appropriate to optimally separate active and passive behaviours in previous studies (Holland et al. (2011)). In addition, the optimize-function of the R-package stats (R Core Team, 2021) was used to identify the value of the signal strength difference that separated the training dataset into active and passive with the highest accuracy. This value was also applied to all three test datasets.
-To find the threshold value that optimizes the accuracy (data is balanced) when separating the data into active and passive, we first calculated the signal strength difference of consecutive signals in the complete bats data set, then separated 50 % balanced test and train data and finally used the optimize function from the R base package to determine the best threshold.
# Get all bat data (complete, unbalanced ground-truth set).
trn <- fread("model_tunig_and_validation/data/train_2020_2021.csv")

# Signal strength difference of consecutive signals, per station and per
# individual. dplyr calls are namespace-qualified so a later attach of plyr
# cannot mask mutate()/lag() and silently drop the per-ID grouping.
dtrn <- plyr::ldply(unique(trn$station), function(x) {
  tmp <- trn[trn$station == x, ]
  tmp <- tmp[order(tmp$timestamp), ]
  tmp <- tmp %>%
    dplyr::group_by(ID) %>%
    dplyr::mutate(Diff = abs(max_signal - dplyr::lag(max_signal)))
  return(tmp)
})

# Data clean up: drop the first signal of each series (NA diff) and
# zero differences that are labelled active.
dtrn <- dtrn[!is.na(dtrn$Diff), ]
dtrn <- dtrn[!(dtrn$behaviour == "active" & dtrn$Diff == 0), ]

# Factorize the response
dtrn$behaviour <- as.factor(dtrn$behaviour)
table(dtrn$behaviour)
##
-## active passive
-## 513831 2654868
# Balance the data by down-sampling the majority class.
set.seed(10)
tdown <- downSample(x = dtrn,
                    y = dtrn$behaviour)

# Create a 50% train / 50% test split.
trainIndex <- createDataPartition(tdown$Class, p = .5,
                                  list = FALSE,
                                  times = 1)

dtrn <- tdown[trainIndex, ]
dtst <- tdown[-trainIndex, ]

# Optimize the separation threshold by accuracy (remember: data is balanced,
# so accuracy is an unbiased criterion here).
value <- dtrn$Diff
group <- dtrn$behaviour

# Accuracy of classifying "active" when Diff exceeds threshold th.
accuracy <- Vectorize(function(th) mean(c("passive", "active")[(value > th) + 1] == group))
ac <- optimize(accuracy, c(min(value, na.rm = TRUE), max(value, na.rm = TRUE)), maximum = TRUE)

ac$maximum
## [1] 1.088167
# Classify the test data using the optimized threshold.
dtst$pred <- NA
dtst$pred[dtst$Diff > ac$maximum]  <- "active"
dtst$pred[dtst$Diff <= ac$maximum] <- "passive"

# Confusion matrix of threshold classification vs. ground truth.
dtst$pred <- factor(dtst$pred)
cm <- confusionMatrix(factor(dtst$Class), factor(dtst$pred))

print(cm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 198976 57939
-## passive 81121 175794
-##
-## Accuracy : 0.7294
-## 95% CI : (0.7281, 0.7306)
-## No Information Rate : 0.5451
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.4587
-##
-## Mcnemar's Test P-Value : < 2.2e-16
-##
-## Sensitivity : 0.7104
-## Specificity : 0.7521
-## Pos Pred Value : 0.7745
-## Neg Pred Value : 0.6842
-## Prevalence : 0.5451
-## Detection Rate : 0.3872
-## Detection Prevalence : 0.5000
-## Balanced Accuracy : 0.7312
-##
-## 'Positive' Class : active
-##
-print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.7103825 0.7521146 0.7744818
-## Neg Pred Value Precision Recall
-## 0.6842497 0.7744818 0.7103825
-## F1 Prevalence Detection Rate
-## 0.7410486 0.5451161 0.3872409
-## Detection Prevalence Balanced Accuracy
-## 0.5000000 0.7312485
# 4 dB threshold value from the literature (Kays et al. 2011; Holland et al. 2011).
dtst$pred <- NA
dtst$pred[dtst$Diff > 4]  <- "active"
dtst$pred[dtst$Diff <= 4] <- "passive"

dtst$pred <- factor(dtst$pred)
cm <- confusionMatrix(dtst$Class, dtst$pred)
print(cm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 102253 154662
-## passive 35342 221573
-##
-## Accuracy : 0.6302
-## 95% CI : (0.6289, 0.6315)
-## No Information Rate : 0.7322
-## P-Value [Acc > NIR] : 1
-##
-## Kappa : 0.2604
-##
-## Mcnemar's Test P-Value : <2e-16
-##
-## Sensitivity : 0.7431
-## Specificity : 0.5889
-## Pos Pred Value : 0.3980
-## Neg Pred Value : 0.8624
-## Prevalence : 0.2678
-## Detection Rate : 0.1990
-## Detection Prevalence : 0.5000
-## Balanced Accuracy : 0.6660
-##
-## 'Positive' Class : active
-##
-print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.7431447 0.5889218 0.3980032
-## Neg Pred Value Precision Recall
-## 0.8624370 0.3980032 0.7431447
-## F1 Prevalence Detection Rate
-## 0.5183798 0.2677831 0.1990016
-## Detection Prevalence Balanced Accuracy
-## 0.5000000 0.6660333
Since activity observations are not continuous but signal recording on the tRackIT-Stations is, we first have to calculate the signal strength difference on the raw data and then match it to the ground truth observations
# List raw signal files of the woodpecker.
wp <- list.files("model_tunig_and_validation/data/woodpecker_raw/variables/", full.names = TRUE)

# Signal strength difference of consecutive raw signals, per file.
# dplyr calls are namespace-qualified to be robust against plyr masking.
wp_tst <- plyr::ldply(wp, function(x) {
  tmp <- fread(x)
  tmp <- tmp[order(tmp$timestamp), ]
  tmp <- tmp %>% dplyr::mutate(Diff = abs(max_signal - dplyr::lag(max_signal)))
  return(tmp)
})

wp_tst$timestamp <- lubridate::with_tz(wp_tst$timestamp, "CET")

# Get ground-truth observations and merge by timestamp.
wp_gtruth <- readRDS("model_tunig_and_validation/data/woodpecker_groundtruth.rds")
wp_tst <- merge(wp_gtruth, wp_tst, all.x = TRUE)

# Classify with the optimized threshold (ac$maximum from the bat training data).
wp_tst$pred <- NA
wp_tst$pred[wp_tst$Diff > ac$maximum]  <- "active"
wp_tst$pred[wp_tst$Diff <= ac$maximum] <- "passive"

wp_tst$pred     <- factor(wp_tst$pred)
wp_tst$observed <- factor(wp_tst$observed)

cm <- confusionMatrix(factor(wp_tst$observed), factor(wp_tst$pred))

print(cm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 8191 3822
-## passive 590 7691
-##
-## Accuracy : 0.7826
-## 95% CI : (0.7769, 0.7883)
-## No Information Rate : 0.5673
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.5757
-##
-## Mcnemar's Test P-Value : < 2.2e-16
-##
-## Sensitivity : 0.9328
-## Specificity : 0.6680
-## Pos Pred Value : 0.6818
-## Neg Pred Value : 0.9288
-## Prevalence : 0.4327
-## Detection Rate : 0.4036
-## Detection Prevalence : 0.5919
-## Balanced Accuracy : 0.8004
-##
-## 'Positive' Class : active
-##
-print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9328095 0.6680274 0.6818447
-## Neg Pred Value Precision Recall
-## 0.9287526 0.6818447 0.9328095
-## F1 Prevalence Detection Rate
-## 0.7878234 0.4326895 0.4036168
-## Detection Prevalence Balanced Accuracy
-## 0.5919484 0.8004185
# Evaluate with the 4 dB value from the literature.
wp_tst$pred <- NA
wp_tst$pred[wp_tst$Diff > 4]  <- "active"
wp_tst$pred[wp_tst$Diff <= 4] <- "passive"

wp_tst$pred <- factor(wp_tst$pred)
cm <- confusionMatrix(wp_tst$observed, wp_tst$pred)
print(cm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 3669 8344
-## passive 191 8090
-##
-## Accuracy : 0.5794
-## 95% CI : (0.5726, 0.5862)
-## No Information Rate : 0.8098
-## P-Value [Acc > NIR] : 1
-##
-## Kappa : 0.2449
-##
-## Mcnemar's Test P-Value : <2e-16
-##
-## Sensitivity : 0.9505
-## Specificity : 0.4923
-## Pos Pred Value : 0.3054
-## Neg Pred Value : 0.9769
-## Prevalence : 0.1902
-## Detection Rate : 0.1808
-## Detection Prevalence : 0.5919
-## Balanced Accuracy : 0.7214
-##
-## 'Positive' Class : active
-##
-print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9505181 0.4922721 0.3054191
-## Neg Pred Value Precision Recall
-## 0.9769352 0.3054191 0.9505181
-## F1 Prevalence Detection Rate
-## 0.4622945 0.1902040 0.1807924
-## Detection Prevalence Balanced Accuracy
-## 0.5919484 0.7213951
-Human activity observations are also not continuous, so we have to calculate the signal strength difference for each individual on the raw data
# Raw human-walk signals: one "variables" directory per individual.
hm_dirs <- list.dirs("model_tunig_and_validation/data/human_raw/", full.names = TRUE)
hm_dirs <- hm_dirs[grep("variables", hm_dirs)]

# Signal strength difference per individual and file.
# dplyr calls are namespace-qualified to be robust against plyr masking.
hm_tst <- plyr::ldply(hm_dirs, function(d) {
  fls <- list.files(d, full.names = TRUE)

  tmp_dat <- plyr::ldply(fls, function(x) {
    tmp <- fread(x)
    tmp <- tmp[order(tmp$timestamp), ]
    tmp <- tmp %>% dplyr::mutate(Diff = abs(max_signal - dplyr::lag(max_signal)))
    return(tmp)
  })
  return(tmp_dat)
})

# Get ground-truth observations and merge; drop duplicated timestamps.
hm_gtruth <- readRDS("model_tunig_and_validation/data/human_walk_groundtruth.rds")
hm_tst <- merge(hm_gtruth, hm_tst, all.x = TRUE)
hm_tst <- hm_tst[!duplicated(hm_tst$timestamp), ]

# Evaluate based on the optimized threshold.
hm_tst$pred <- NA
hm_tst$pred[hm_tst$Diff > ac$maximum]  <- "active"
hm_tst$pred[hm_tst$Diff <= ac$maximum] <- "passive"

hm_tst$pred     <- factor(hm_tst$pred)
hm_tst$observed <- factor(hm_tst$observation)

cm <- confusionMatrix(hm_tst$observed, hm_tst$pred)

print(cm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 9613 2292
-## passive 143 2030
-##
-## Accuracy : 0.827
-## 95% CI : (0.8207, 0.8333)
-## No Information Rate : 0.693
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.5282
-##
-## Mcnemar's Test P-Value : < 2.2e-16
-##
-## Sensitivity : 0.9853
-## Specificity : 0.4697
-## Pos Pred Value : 0.8075
-## Neg Pred Value : 0.9342
-## Prevalence : 0.6930
-## Detection Rate : 0.6828
-## Detection Prevalence : 0.8456
-## Balanced Accuracy : 0.7275
-##
-## 'Positive' Class : active
-##
-print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9853424 0.4696900 0.8074759
-## Neg Pred Value Precision Recall
-## 0.9341924 0.8074759 0.9853424
-## F1 Prevalence Detection Rate
-## 0.8875860 0.6929962 0.6828385
-## Detection Prevalence Balanced Accuracy
-## 0.8456457 0.7275162
-#print(cm$table)
# Evaluate based on the 4 dB value from the literature.
hm_tst$pred <- NA
hm_tst$pred[hm_tst$Diff > 4]  <- "active"
hm_tst$pred[hm_tst$Diff <= 4] <- "passive"

hm_tst$pred <- factor(hm_tst$pred)
cm <- confusionMatrix(hm_tst$observed, hm_tst$pred)

print(cm)
## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction active passive
-## active 4851 7054
-## passive 8 2165
-##
-## Accuracy : 0.4984
-## 95% CI : (0.4901, 0.5067)
-## No Information Rate : 0.6549
-## P-Value [Acc > NIR] : 1
-##
-## Kappa : 0.1736
-##
-## Mcnemar's Test P-Value : <2e-16
-##
-## Sensitivity : 0.9984
-## Specificity : 0.2348
-## Pos Pred Value : 0.4075
-## Neg Pred Value : 0.9963
-## Prevalence : 0.3451
-## Detection Rate : 0.3446
-## Detection Prevalence : 0.8456
-## Balanced Accuracy : 0.6166
-##
-## 'Positive' Class : active
-##
-print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
-## 0.9983536 0.2348411 0.4074759
-## Neg Pred Value Precision Recall
-## 0.9963185 0.4074759 0.9983536
-## F1 Prevalence Detection Rate
-## 0.5787402 0.3451485 0.3445802
-## Detection Prevalence Balanced Accuracy
-## 0.8456457 0.6165973
-print(cm$table)
## Reference
-## Prediction active passive
-## active 4851 7054
-## passive 8 2165
-When calibrating the threshold based on an adequate train data set, the approach is generally able to separate active and passive behavior but performance metrics (F1=0.79, 0.78, 0.88; bats, woodpecker, human) are between 10 and 20 points worse and more variable than our random forest model (F1= 0.97, 0.97, 0.98; bats,woodpecker,human). With F-scores between 0.46 and 0.58 the threshold value proposed in the literature performed significantly worse.
-Since only the test data set of the bats is balanced but the woodpecker data is slightly imbalanced and the human activity data set is highly imbalanced, let's also take a look at a metric that takes the data distribution into account:
-Cohen’s kappa is defined as:
-K=(p_0-p_e)/(1-p_e)
where p_0 is the overall accuracy of the model and p_e is the measure of the agreement between the model predictions and the actual class values as if happening by chance.
-Cohen’s kappa is always less than or equal to 1. Values of 0 or less indicate that the classifier is not better than chance. Landis and Koch (1977) provide a way to characterize values. According to their scheme a value < 0 is indicating no agreement, 0–0.20 slight agreement, 0.21–0.40 fair agreement, 0.41–0.60 moderate agreement, 0.61–0.80 substantial agreement, and 0.81–1 as almost perfect agreement.
-Kappa values based on the 4 dB separation value from the literature ranged between 0.17 (humans) and 0.26 (bats), i.e. a slight to fair agreement. For the optimized threshold Kappa values were significantly better in all cases (0.46, 0.58, 0.53; bats, woodpecker, humans); i.e. moderate agreement. However, even the best Kappa value for the threshold based approach only showed a moderate agreement while all Kappa values based on the random-forest model showed an almost perfect agreement ( 0.94, 0.94, 0.90 ; bats, woodpecker, humans ).
-This tutorial shows how probability of activity curves can be extracted from the VHF signal data after classification of 1 min intervals into active or passive states. I’m using Hierarchical Generalized Additive Models following the Pedersen et al. 2021 article.
-Before you start, make sure that you have the following packages installed: tidyverse
, data.table
,lubridate
, hms
, mgcv
, gratia
, itsadug
, mgcViz
, ggthemes
, viridis
Additionally we will use functionalities fro the tRackIT
R package that is not hosted on cran yet. The tRackIT R-package also uses dependencies that are not published on CRAN. So before we install the package, we install these dependencies
# Install non-CRAN dependencies of the tRackIT package from GitHub.
library(remotes)
library(devtools)
Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS" = "true")
# fast rolling windows
install_github("andrewuhl/RollingWindow")
# fortran based triangulation algorithms
install_github("barryrowlingson/telemetr")
Load the packages
- -The tRackIT-Tutorial-for-activity-classification
shows the complete workflow to get from raw signals up to activity classifications with a 1 Minute resolution. We will skip that part here and start with the classified data of bats that have been tracked from 2018 to 2021 (`ecological_case_study/bats_for_activity`). There is one file per individual now, that contains nothing but the timestamp, the activity class per timestamp and the ID. We will need some more info such as timing in relation to sunset and sunrise as well as species, sex etc. To do so we will use two functionalities of the tRackIT-package. The time_of_day() function calculates the time difference to sunset and sunrise etc., the add.ID.info() function adds meta-info for the individual to the data. This section of the tutorial can only be executed if the “ecological_case_study_bat_activity” directory has been downloaded. You can skip that part and continue with the next chunk of code.
#set coordinates for sunset/sunrise calculation
-
-Lat<-50.844868
-Lon<-8.663649
-
-#Loop through years
-for(y in c(2018, 2019, 2020, 2021)){
-#print(y)
-#get projectfile per year
-pr<-getProject(projroot=paste0("D:/data_ms_activity_classification/ecological_case_study_bat_activity/data/bats_for_activity/",y, "/"))
-
-#remove ids with no data at all
-pr$tags<-pr$tags[pr$tags$ID!="Mbec180518_2",]
-pr$tags<-pr$tags[pr$tags$ID!="mbec150155_m",]
-
-#loop through individuals
-for(id in pr$tags$ID){
-
- #print(id)
-
- anml<-getAnimal(projList = pr, animalID = id)
- #get activity classification aggregated to 1 Minute
- fls<-list.files(anml$path$classification,pattern="agg", full.names = TRUE)
-
- if(length(fls)>0){
-
- data<-data.table::fread(fls[1])
-
- #calculate daytime infos
- data<-time_of_day(data = data, Lat=Lat, Lon = Lon, tcol = "timestamp", tz = "CET", activity_period = "nocturnal")
-
- #add id info
-data<-add.ID.Info( data=data, animal=anml)
-
-#number of tracking days
-data$n_days<-as.numeric(abs(difftime(as.Date(data$start_datetime), as.Date(data$stop_datetime))))+1
-
-#binary activity classification
-data$activity<-ifelse(data$prediction=="active", 1,0)
-
-#correction of typo
-data$species[data$species=="mbec"]<-"Mbec"
-
- data.table::fwrite(data, paste0("D:/data_ms_activity_classification/ecological_case_study_bat_activity/data/all_bats_aggregated/",anml$meta$animalID, "_",as.character(y), "_aggregated_1_Minute.csv"))
-
- }
-}}
-
-
-#merge all data
- fls<-list.files("bat_data_HGAM_tutorial/all_bats_aggregated/", full.names = TRUE)
-data<-plyr::ldply(fls, function(x){data.table::fread(x)})
-
-data<-data[data$species=="Mbec" | data$species=="Nlei",]
-
-
-
-#get info which ids should be excluded
-check_ids<-data.table::fread("bat_data_HGAM_tutorial/bats_inspect_id.csv")
-excld<-check_ids[check_ids$exclude_individual=="Y",]
-
-#exclude ids from data
-data<-data[!(data$ID %in% excld$ID),]
-
-#account for rep state transition
-
-df_rep<-read.csv("bat_data_HGAM_tutorial/df_rep_state_transition.csv")
-for (id in unique(df_rep$ID)){
-
- if (df_rep$rep2[df_rep$ID==id]=="pregnant"){
-
- data$rep.state[data$ID==id & data$year==df_rep$year[df_rep$ID==id] & data$timestamp>=df_rep$start_rep1[df_rep$ID==id]]<-"lactating"}
-
- if (df_rep$rep2[df_rep$ID==id]=="post-lactating"){
-
- data$rep.state[data$ID==id & data$year==df_rep$year[df_rep$ID==id] & data$timestamp>=df_rep$start_rep2[df_rep$ID==id]]<-"post-lactating"}
-
-}
-
-
-
-#save data
-data.table::fwrite(data,"bat_data_HGAM_tutorial/all_bats_aggregated.csv")
df_1min <- fread("bat_data_HGAM_tutorial/all_bats_aggregated.csv", stringsAsFactors = T) # import data
-
-
-glimpse(df_1min)
## Rows: 818,398
-## Columns: 24
-## $ timestamp <dttm> 2020-05-15 20:14:00, 2020-05-15 20:15:00, 2020-05-15 2~
-## $ prediction <fct> active, active, active, active, active, active, active,~
-## $ ID <fct> h146474, h146474, h146474, h146474, h146474, h146474, h~
-## $ date <date> 2020-05-15, 2020-05-15, 2020-05-15, 2020-05-15, 2020-0~
-## $ sunset_date <date> 2020-05-15, 2020-05-15, 2020-05-15, 2020-05-15, 2020-0~
-## $ hour <int> 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,~
-## $ sunrise_date <date> 2020-05-16, 2020-05-16, 2020-05-16, 2020-05-16, 2020-0~
-## $ sunset <dttm> 2020-05-15 19:08:35, 2020-05-15 19:08:35, 2020-05-15 1~
-## $ sunrise <dttm> 2020-05-16 03:34:14, 2020-05-16 03:34:14, 2020-05-16 0~
-## $ time_to_rise <dbl> -7.337226, -7.320559, -7.303893, -7.287226, -7.270559, ~
-## $ time_to_set <dbl> 1.090028, 1.106695, 1.123362, 1.140028, 1.156695, 1.173~
-## $ start_datetime <dttm> 2020-05-15 20:14:00, 2020-05-15 20:14:00, 2020-05-15 2~
-## $ stop_datetime <dttm> 2020-05-29 21:58:00, 2020-05-29 21:58:00, 2020-05-29 2~
-## $ year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2~
-## $ ydate <int> 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, ~
-## $ month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5~
-## $ t <dbl> 0, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600, 660~
-## $ species <fct> Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, M~
-## $ sex <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w~
-## $ age <fct> ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad,~
-## $ weight <dbl> 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, ~
-## $ rep.state <fct> pregnant, pregnant, pregnant, pregnant, pregnant, pregn~
-## $ n_days <int> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,~
-## $ activity <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1~
-df_1min$species<-as.character(df_1min$species)
-df_1min$species[df_1min$species=="Mbec"]<-"M.bechsteinii"
-df_1min$species[df_1min$species=="Nlei"]<-"N.leisleri"
-df_1min$species<-as.factor(df_1min$species)
-
-df_1min$month_f <- factor(month(df_1min$date))
-df_1min$year_f <- factor(df_1min$year)
-df_1min$date <- date(df_1min$date)
-df_1min$hour <- hour(df_1min$timestamp)
-df_1min$week <- week(df_1min$timestamp)
-
-min_set <- min(df_1min$time_to_set)
-max_set <- max(df_1min$time_to_set)
-df_1min$start.event <- ifelse(df_1min$time_to_set==min_set,T,F) # Identify start of the time series
-df_1min$ydate_f <- as.factor(df_1min$ydate)
-df_1min$date_f <- as.factor(df_1min$date)
-
-K.time_of_day <- length(unique(df_1min$time_to_rise))
-
-df_1min <- df_1min %>% data.frame()
Exclude retagged individuals and extract sample sizes
-df_1min<- df_1min[!(df_1min$ID =="Nlei20211" & df_1min$ydate >= 200),]
-df_1min<- df_1min[!(df_1min$ID =="h146487" & df_1min$ydate >= 150),]
-
-# Sample sizes
-df_1min %>%
- group_by(species) %>%
- summarise(nID = n_distinct(ID),
- nObs = n(),
- meanDays = mean(n_days))
## # A tibble: 2 x 4
-## species nID nObs meanDays
-## <fct> <int> <int> <dbl>
-## 1 M.bechsteinii 52 577977 18.4
-## 2 N.leisleri 20 204443 21.3
-
-## nID nObs meanDays
-## 1 72 782420 19.15002
-NOTE Some individuals were tagged twice within the same year. We want to avoid these situations and reduce the sampling period to the first tagging event. The full details of this analysis can be found under filename
, section 0.2.
We can visually inspect the data by presenting the probability of activity over the time of the day. Here, it is easier to calculate this probability by time intervals of 15 minutes for easier visualization
-df_15min <- df_1min %>%
- #filter(species == "M.bechsteinii" | species == "N.leisleri") %>%
- mutate(interval = as_hms(floor_date(timestamp, unit = "15minutes"))) %>%
- group_by(ID, species, year, ydate, hour, interval) %>%
- summarise(n_intervals = length(activity),
- n_active = length(activity[activity == 1]),
- n_passive = length(activity[activity == 0]),
- time_to_set = mean(time_to_set)) # calculate average time to sunset for that 15 minute interval
-
-df_15min %>%
- ggplot(aes(x = time_to_set, y = n_active/n_intervals, color = species)) +
- geom_point(alpha = 0.1) +
- geom_smooth() + scale_color_wsj() + theme_bw(14) +
- facet_wrap(~species) + geom_hline(yintercept = 0.5,linetype = "dashed") +
- ylim(0, 1) + ylab("Activity probability (15 min interval)") +
- xlab("Time to sunset (h)")
We can also inspect each tagged individual in the same way
-df_15min %>%
- filter(species == "N.leisleri") %>%
- ggplot(aes(x = time_to_set, y = n_active/n_intervals)) +
- geom_point(alpha = 0.2) +
- geom_smooth() + scale_color_wsj() + theme_bw(14) +
- facet_wrap(~ID) + geom_hline(yintercept = 0.5,linetype = "dashed") +
- ylim(0, 1) + ylab("Activity probability (15 min interval)") +
- xlab("Time to sunset (h)")
df_15min %>%
- filter(species == "M.bechsteinii") %>%
- ggplot(aes(x = time_to_set, y = n_active/n_intervals)) +
- geom_point(alpha = 0.2) +
- geom_smooth() + scale_color_wsj() + theme_bw(14) +
- facet_wrap(~ID) + geom_hline(yintercept = 0.5,linetype = "dashed") +
- ylim(0, 1) + ylab("Activity probability (15 min interval)") +
- xlab("Time to sunset (h)")
We fit a Hierarchical Generalized Additive Model to compare whether Bechstein’s and Leisler’s bats differ significantly in their daily activity patterns. We assume that the probability of activity is a non-linear function of time of the day, here centered around time of sunset (t = 0). We use a binomial error term since our activity column is a string of 0 and 1 (i.e the bat is either marked as passive for that minute or active).
-We use circular spline functions to constrain the activity probability to be equal at 00:00 and 23:59 (argument bs = "cc"
. The simplest way to do that is to include individuals and date as random intercepts with the argument bs = "re"
. Note that there are many flavors for specifying random effects with HGAMs which apply more or less penalty to the random effects and allow them to deviate from population level trends. Here, we are mainly interested in species differences and assume that there is a general time of activity function that is species specific. Note that more complex random effect structures gave functionally similar results and did not affect conclusions. Pedersen et al. 2021 provides a great and exhaustive tutorial on the different ways to approach random effects within the GAM framework. There are two other arguments that require our attention. First, we need to account for the fact that the observations are likely to be highly autocorrelated because they are taken at 1 min intervals. This value has to be set manually and we show our procedure for investigating autocorrelation in our analysis R script (see filename
, section 1.1). Next, we need to select the degree of complexity of our smoothing terms: k
. After inspection with the k.check
function, setting k to 120 was the highest complexity we could set without overfitting the data (see filename
, section 1.2 for details).
To test for species differences in activity patterns, we fit two models. Model M0 assumes that both species have the same global activity patterns while model M1 allows both species to follow their own trend (argument by = species
within the spline function)
Set autocorrelation (r1
) and basis dimension (k
) of the smooth term
Fit model M0
-fit.gam.M0 <- bam(activity ~
- s(time_to_set, bs = c("cc"), k = k) +
- s(ID, bs = "re") + s(date_f, bs = "re"),
- rho = r1, AR.start = start.event,
- data = df_1min,
- method = "fREML", discrete=T, family = "binomial",
- knots=list(time_to_rise=c(min_set, max_set)))
-summary(fit.gam.M0)
##
-## Family: binomial
-## Link function: logit
-##
-## Formula:
-## activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") +
-## s(date_f, bs = "re")
-##
-## Parametric coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) -1.7383 0.1339 -12.98 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Approximate significance of smooth terms:
-## edf Ref.df Chi.sq p-value
-## s(time_to_set) 51.84 118 4061286 <2e-16 ***
-## s(ID) 67.07 71 1976322 <2e-16 ***
-## s(date_f) 280.26 306 3084458 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## R-sq.(adj) = 0.45 Deviance explained = 38.8%
-## fREML = 8.5139e+05 Scale est. = 1 n = 782420
-
-Fit model M1
-fit.gam.M1 <- bam(activity ~
- s(time_to_set, bs = c("cc"), k = k, by = species) +
- s(ID, bs = "re") + s(date_f, bs = "re"),
- rho = r1, AR.start = start.event,
- data = df_1min,
- method = "fREML", discrete=T, family = "binomial",
- knots=list(time_to_rise=c(min_set, max_set)))
-summary(fit.gam.M1)
##
-## Family: binomial
-## Link function: logit
-##
-## Formula:
-## activity ~ s(time_to_set, bs = c("cc"), k = k, by = species) +
-## s(ID, bs = "re") + s(date_f, bs = "re")
-##
-## Parametric coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) -1.7399 0.1309 -13.29 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Approximate significance of smooth terms:
-## edf Ref.df Chi.sq p-value
-## s(time_to_set):speciesM.bechsteinii 47.23 118 3571851 <2e-16 ***
-## s(time_to_set):speciesN.leisleri 45.82 118 188033 <2e-16 ***
-## s(ID) 66.32 71 1824189 <2e-16 ***
-## s(date_f) 282.01 306 2421802 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## R-sq.(adj) = 0.468 Deviance explained = 40.4%
-## fREML = 8.4389e+05 Scale est. = 1 n = 782420
-
-Compare M0 and M1
- -## Analysis of Deviance Table
-##
-## Model 1: activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") +
-## s(date_f, bs = "re")
-## Model 2: activity ~ s(time_to_set, bs = c("cc"), k = k, by = species) +
-## s(ID, bs = "re") + s(date_f, bs = "re")
-## Resid. Df Resid. Dev Df Deviance
-## 1 781994 578790
-## 2 781942 563613 52.41 15177
-
-## df AIC delta.AIC
-## fit.gam.M1 443.2209 248309.1 0.00
-## fit.gam.M0 400.6479 263401.1 15092.01
-# Calculate hourly interval data
-df_1h <- df_1min %>%
- mutate(interval = as_hms(floor_date(timestamp, unit = "60minutes"))) %>%
- group_by(ID, species, year, ydate, hour, interval) %>%
- summarise(n_intervals = length(activity),
- n_active = length(activity[activity == 1]),
- n_passive = length(activity[activity == 0]),
- time_to_set = mean(time_to_set)) # calculate average time to sunset for that 15 minute interval
-df_1h$p_act <- df_1h$n_active/df_1h$n_intervals
-
-fit.values <- evaluate_smooth(fit.gam.M1, "s(time_to_set)", n = 244,
- overall_uncertainty = T,
- unconditional = T)
-draw(fit.values)
b0 <- coef(fit.gam.M1)[1]
-Fig5 <- ggplot(data = fit.values,
- aes(x = time_to_set, y = plogis(est+b0),
- color = species, group = species))
-Fig5a <- Fig5 +
- geom_point(data = df_1h, alpha = .1,
- aes(x = time_to_set,
- y = p_act)) +
- geom_ribbon(aes(ymin = plogis(est+b0 - 2 * se) ,
- ymax = plogis(est+b0 + 2 * se)),
- fill = "grey", color = "grey") +
- geom_line(size = .5) +
- geom_hline(yintercept = 0.5, linetype = "dashed") +
- scale_color_wsj() + theme_bw(14) +
- xlab("") +
- ylab("Activity probability \n") +
- ylim(0, 1) +
- theme(legend.position = c(.1,.745))
-Fig5a
fit.delta <- difference_smooths(fit.gam.M1, "s(time_to_set)", n = 244)
-
-Fig5b <- ggplot(data = fit.delta,
- aes(x = time_to_set, y = diff)) +
- geom_ribbon(aes(ymin = lower,
- ymax = upper), color = "grey",
- alpha = 0.3) +
- geom_line() + geom_hline(yintercept = 0, linetype = "dashed") +
- theme_bw(14) + theme(legend.position = "none") +
- xlab("Time since sunset (h)") +
- ylab("Activity difference \n (M.bechsteinii - N.leisleri)")
-
-Fig5 <- Fig5a / Fig5b
-Fig5
The following section shows how different aspects of the daily activity patterns can be extracted from the HGAM output. We focus here on determining the timing for onset and end of the daily activity period, the timing of peak activity and the intensity of activity during the night.
-# Onset of activity up time: When does p(activity) > 0.5 first
-# End of activity time: When does p(activity) > 0.5 last
-fit.values %>% group_by(species) %>%
- filter(plogis(est+b0) > .5) %>%
- summarise(a.onset = as_hms(min(time_to_set)*60),
- a.end = as_hms(max(time_to_set)*60))
## # A tibble: 2 x 3
-## species a.onset a.end
-## <fct> <time> <time>
-## 1 M.bechsteinii 00'30.879867" 07'10.972640"
-## 2 N.leisleri 00'12.125519" 07'23.475539"
-# Peak activity: what are the two highest values for p(activity)?
-fit.values %>% group_by(species) %>%
- filter(plogis(est+b0) == max(plogis(est+b0))) %>%
- summarise(peak.a = max(plogis(est+b0)),
- peak.a.low = plogis(est+b0-2*se),
- peak.a.up = plogis(est+b0+2*se))
## # A tibble: 2 x 4
-## species peak.a peak.a.low peak.a.up
-## <fct> <dbl> <dbl> <dbl>
-## 1 M.bechsteinii 0.770 0.751 0.788
-## 2 N.leisleri 0.822 0.798 0.845
-# Peak activity timing: time of day whith maximum p(activity)
-fit.values %>% group_by(species) %>%
- filter(plogis(est+b0) == max(plogis(est+b0))) %>%
- group_by(species) %>%
- summarise(t.peak = as_hms(time_to_set*60))
## # A tibble: 2 x 2
-## species t.peak
-## <fct> <time>
-## 1 M.bechsteinii 01'33.394363"
-## 2 N.leisleri 00'37.131317"
-# Activity density: area under the curve when p(activity) > 0.5
-fit.values %>% group_by(species) %>%
- filter(plogis(est+b0) > .5) %>%
- summarise(auc = bayestestR::auc(time_to_set, plogis(est+b0),
- method = "spline"),
- auc.low = bayestestR::auc(time_to_set, plogis(est+b0-2*se),
- method = "spline"),
- auc.up = bayestestR::auc(time_to_set, plogis(est+b0+2*se),
- method = "spline"))
## # A tibble: 2 x 4
-## species auc auc.low auc.up
-## <fct> <dbl> <dbl> <dbl>
-## 1 M.bechsteinii 4.70 4.56 4.83
-## 2 N.leisleri 3.42 3.23 3.62
-We can also use HGAM to compare the timing of daily activity depending on the reproductive status of individuals within each species. The models used here are very similar to those used in section 2. with the main difference that we are now comparing a model that assumes a common activity pattern for all statuses (model 0) and one that allows reproductive statuses to have differing smoothing functions.
-Exclude individuals of unknown reproductive status
-df_1min %>% filter(rep.state != "unknown") %>%
- group_by(species) %>%
- summarise(nID = n_distinct(ID),
- nObs = n())
## # A tibble: 2 x 3
-## species nID nObs
-## <fct> <int> <int>
-## 1 M.bechsteinii 35 384536
-## 2 N.leisleri 19 203261
-df_1min %>% filter(rep.state != "unknown") %>%
- group_by(species, rep.state) %>%
- summarise(nID = n_distinct(ID),
- nObs = n())
## # A tibble: 8 x 4
-## # Groups: species [2]
-## species rep.state nID nObs
-## <fct> <fct> <int> <int>
-## 1 M.bechsteinii lactating 13 102076
-## 2 M.bechsteinii non.repro 4 68685
-## 3 M.bechsteinii post-lactating 12 135582
-## 4 M.bechsteinii pregnant 13 78193
-## 5 N.leisleri lactating 11 71560
-## 6 N.leisleri non.repro 2 23296
-## 7 N.leisleri post-lactating 7 58889
-## 8 N.leisleri pregnant 6 49516
-# Remove individuals of unknown status
-df_1min.Mb <- df_1min %>% filter(species == "M.bechsteinii" & rep.state != "unknown") %>% droplevels()
-df_1min.Nl <- df_1min %>% filter(species == "N.leisleri" & rep.state != "unknown") %>% droplevels()
fit.gam.Mb.0 <- bam(activity ~
- s(time_to_set, bs = c("cc"), k = k) +
- s(ID, bs = "re") + #, k = 25
- s(date_f, bs = "re"), #k = 173
- rho = r1, AR.start = start.event,
- data = df_1min.Mb,
- method = "fREML", discrete=T, family = "binomial",
- knots=list(time_to_rise=c(min_set, max_set)))
-summary(fit.gam.Mb.0)
##
-## Family: binomial
-## Link function: logit
-##
-## Formula:
-## activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") +
-## s(date_f, bs = "re")
-##
-## Parametric coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) -1.6686 0.1146 -14.57 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Approximate significance of smooth terms:
-## edf Ref.df Chi.sq p-value
-## s(time_to_set) 45.50 118 573547 < 2e-16 ***
-## s(ID) 28.39 35 41968 0.00532 **
-## s(date_f) 226.32 252 63224 6.82e-05 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## R-sq.(adj) = 0.519 Deviance explained = 44.5%
-## fREML = 4.0871e+05 Scale est. = 1 n = 384536
-### M1: difference in reproductive status on average & in smooth ----
-fit.gam.Mb.1 <- bam(activity ~ rep.state +
- s(time_to_set, bs = c("cc"), k = k,
- by = rep.state) +
- s(ID, bs = "re") + s(date_f, bs = "re"),
- rho = r1, AR.start = start.event,
- data = df_1min.Mb,
- method = "fREML", discrete=T, family = "binomial",
- knots=list(time_to_rise=c(min_set, max_set)))
-summary(fit.gam.Mb.1)
##
-## Family: binomial
-## Link function: logit
-##
-## Formula:
-## activity ~ rep.state + s(time_to_set, bs = c("cc"), k = k, by = rep.state) +
-## s(ID, bs = "re") + s(date_f, bs = "re")
-##
-## Parametric coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) -1.2606 0.1477 -8.532 < 2e-16 ***
-## rep.statenon.repro -0.7385 0.2799 -2.639 0.00832 **
-## rep.statepost-lactating -0.8867 0.1325 -6.692 2.2e-11 ***
-## rep.statepregnant -0.1065 0.1996 -0.534 0.59357
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Approximate significance of smooth terms:
-## edf Ref.df Chi.sq p-value
-## s(time_to_set):rep.statelactating 33.82 118 77911 < 2e-16 ***
-## s(time_to_set):rep.statenon.repro 30.96 118 159447 < 2e-16 ***
-## s(time_to_set):rep.statepost-lactating 38.59 118 159446 < 2e-16 ***
-## s(time_to_set):rep.statepregnant 31.39 118 58687 < 2e-16 ***
-## s(ID) 26.75 34 17596 0.0648 .
-## s(date_f) 224.82 252 44575 7.5e-06 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## R-sq.(adj) = 0.526 Deviance explained = 45.3%
-## fREML = 4.07e+05 Scale est. = 1 n = 384536
-
-## df AIC delta.AIC
-## fit.gam.Mb.1 392.2189 106210.7 0.000
-## fit.gam.Mb.0 301.9440 109867.9 3657.224
-fit.gam.Nl.0 <- bam(activity ~
- s(time_to_set, bs = c("cc"), k = k) +
- s(ID, bs = "re") + s(date_f, bs = "re"),
- rho = r1, AR.start = start.event,
- data = df_1min.Nl,
- method = "fREML", discrete=T, family = "binomial",
- knots=list(time_to_rise=c(min_set, max_set)))
-summary(fit.gam.Nl.0)
##
-## Family: binomial
-## Link function: logit
-##
-## Formula:
-## activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") +
-## s(date_f, bs = "re")
-##
-## Parametric coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) -1.7416 0.3772 -4.617 3.89e-06 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Approximate significance of smooth terms:
-## edf Ref.df Chi.sq p-value
-## s(time_to_set) 45.71 118 401373 <2e-16 ***
-## s(ID) 16.94 18 2697157 <2e-16 ***
-## s(date_f) 114.41 126 4172289 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## R-sq.(adj) = 0.35 Deviance explained = 32.8%
-## fREML = 2.1573e+05 Scale est. = 1 n = 203261
-fit.gam.Nl.1 <- bam(activity ~ rep.state +
- s(time_to_set, bs = c("cc"), k = k,
- by = rep.state) +
- s(ID, bs = "re") + s(date_f, bs = "re"),
- rho = r1, AR.start = start.event,
- data = df_1min.Nl,
- method = "fREML", discrete=T, family = "binomial",
- knots=list(time_to_rise=c(min_set, max_set)))
-summary(fit.gam.Nl.1)
##
-## Family: binomial
-## Link function: logit
-##
-## Formula:
-## activity ~ rep.state + s(time_to_set, bs = c("cc"), k = k, by = rep.state) +
-## s(ID, bs = "re") + s(date_f, bs = "re")
-##
-## Parametric coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) -1.3364 0.5211 -2.564 0.010336 *
-## rep.statenon.repro 1.0943 1.3571 0.806 0.420023
-## rep.statepost-lactating 0.7711 0.2588 2.980 0.002887 **
-## rep.statepregnant -2.6379 0.6794 -3.883 0.000103 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Approximate significance of smooth terms:
-## edf Ref.df Chi.sq p-value
-## s(time_to_set):rep.statelactating 36.84 118 146915 <2e-16 ***
-## s(time_to_set):rep.statenon.repro 28.29 118 17017 <2e-16 ***
-## s(time_to_set):rep.statepost-lactating 36.58 118 46074 <2e-16 ***
-## s(time_to_set):rep.statepregnant 36.46 118 58619 <2e-16 ***
-## s(ID) 15.94 17 904539 0.022 *
-## s(date_f) 112.07 126 765022 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## R-sq.(adj) = 0.367 Deviance explained = 34.3%
-## fREML = 2.1435e+05 Scale est. = 1 n = 203261
-
-## df AIC delta.AIC
-## fit.gam.Nl.1 272.7888 54313.97 0.000
-## fit.gam.Nl.0 178.7082 57287.66 2973.691
-fit.values.Mb <- evaluate_smooth(fit.gam.Mb.1, "s(time_to_set)", n = 244,
- overall_uncertainty = T,
- unconditional = T)
-
-fit.values.Nl <- evaluate_smooth(fit.gam.Nl.1, "s(time_to_set)", n = 244,
- overall_uncertainty = T,
- unconditional = T)
-
-b0 <- coef(fit.gam.Mb.1)[1]
-Fig6a <- ggplot(data = fit.values.Mb,
- aes(x = time_to_set, y = plogis(est+b0),
- fill = rep.state, group = rep.state)) +
- geom_ribbon(aes(ymin = plogis(est+b0 - 2 * se) ,
- ymax = plogis(est+b0 + 2 * se)),
- alpha = .5) +
- geom_line(size = .5) +
- geom_hline(yintercept = 0.5, linetype = "dashed") +
- scale_fill_wsj() + theme_bw(14) +
- #facet_wrap(~rep.state) +
- xlab("time since sunset (h)") +
- ylab("Activity probability \n") +
- ylim(0, 1) +
- #theme(legend.position = "none") +
- theme(legend.position = c(.15,.745),
- legend.title=element_blank()) +
- ggtitle("M.bechsteinii")
-
-b0 <- coef(fit.gam.Nl.1)[1]
-Fig6b <- ggplot(data = fit.values.Nl,
- aes(x = time_to_set, y = plogis(est+b0),
- fill = rep.state, group = rep.state)) +
- geom_ribbon(aes(ymin = plogis(est+b0 - 2 * se) ,
- ymax = plogis(est+b0 + 2 * se)),
- alpha = .5) +
- geom_line(size = .5) +
- geom_hline(yintercept = 0.5, linetype = "dashed") +
- scale_fill_wsj() + theme_bw(14) +
- #facet_wrap(~rep.state) +
- xlab("time since sunset (h)") +
- ylab("Activity probability \n") +
- ylim(0, 1) +
- #theme(legend.position = "none") +
- theme(legend.position = "none") +
- ggtitle("N.leisleri")
-
-Fig6 <- Fig6a + Fig6b
-Fig6
The tRackIT R package provides a range of functionalities for processing data recorded with automatic telemetry systems. It is specifically tailored to data recorded with one of the sensors from the tRackIT ecosystem (Höchst & Gottwald et al 2021, Gottwald and Lampe et al. 2020, Gottwald 2019), but can also be used for other systems. This tutorial will first explain the basic functionalities in terms of project and tagged individual management. Subsequently, the processing steps for classification of VHF signals into fundamental behaviours with the help of specially trained machine-learning models (Gottwald et al. 2022) will be presented. The dataset used in this tutorial was created to test the transferability of our model trained on bat movements to observed movements of a middle spotted woodpecker. See chapter validation and the corresponding publication for details. This work is conducted within the Natur 4.0 | Sensing Biodiversity project (https://www.uni-marburg.de/de/fb19/natur40). Please download the tutorial data from (doifjgkug).
-The tRackIT R-package uses dependencies that are not published on CRAN. So before we install the package, we install these dependencies
-library(remotes)
-library(devtools)
-Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS" = "true")
-#fast rolling windows
-install_github("andrewuhl/RollingWindow")
-#fortran based triangulation algorithms
-install_github("barryrowlingson/telemetr")
Now we can install the actual package from the Nature40 gitHub page.
- -Additional packages needed are: caret
, randomForest
, data.tree
,plyr
, ggplot2
,lubridate
,data.table
, tidyverse
Load packages
-library(caret); library(randomForest);library(data.tree)
-library(plyr);library(ggplot2); library(lubridate);library(data.table); library(tidyverse);library(tRackIT)
After the package has been installed please store the two model files from the downloaded data folder modelling/models (m_r1.rds, m_r2.rds) into the extdata folder of the tRackIT-package.
-For reasons of interchangeability of tRackIT projects the workflow requires the initiation of a project. A project folder (projroot) with various subfolders for products and reference data as well as a projectfile that contains a list in which all paths are stored, a data.frame with coordinates of tRackIT-stations and a data.frame with information on tagged individuals is created by the initProject() function. In addition, the path to absolute raw data, as created by the software running on the stations, which may be stored on an external hard disk and do not need to be integrated into the project, can be specified in the initProject() function. The epsg code of the coordinate system of the stations needs to be provided. Coordinates will be transformed to Lat/Lon.
-Since data on tagged individuals and stations may be stored in tables of different style, the names of the columns containing the most important information must be passed to the initProject() function. Columns that are not available need to be set to NULL. (e.g. weight_col=NULL, o_col=NULL)
-#get table with station information -replace "D:/" with your root-direcory
-stations<-data.table::fread("D:/data_ms_activity_classification/validation/wood_pecker/data/reference_data/stations_mof_2021.csv" )
-#get (create) data frame with information of tagged individuals
-tags<-data.frame(ID=c("woodpecker"), species="Dendrocopos major", sex="male", age="ad", weight="20g", rep.state="breeding", start="2021-06-10", end="2021-06-14", frequency=150050, duration_min=0.012, duration_max=0.025)
-
-#initialize tRackIT project
-?initProject
-proj<-initProject(projroot = "D:/data_ms_activity_classification/validation/wood_pecker/",logger_data_raw = "D:/data_ms_activity_classification/validation/wood_pecker/woodpecker_raw/", stations= stations, s_col = "station", x_col = "X", y_col = "Y", o_col = "orientation", r_col="receiver", tags=tags, id_col = "ID", spec_col = "species", sex_col = "sex", age_col = "age", weight_col = "weight", rep_col = "rep.state", start_col = "start", end_col = "end",dmax_col = "duration_max", dmin_col = "duration_min", freq_col = "frequency", epsg=4326, tz ="CET" )
The created folder structure looks like this
- -## levelName
-## 1 projroot
-## 2 ¦--results
-## 3 ¦--R
-## 4 ¦ ¦--scripts
-## 5 ¦ °--fun
-## 6 °--data
-## 7 ¦--batch_awk
-## 8 ¦--calibration_curves
-## 9 ¦--catalogues
-## 10 ¦--correction_values
-## 11 ¦--individuals
-## 12 ¦--logger_data_csv
-## 13 ¦--models
-## 14 ¦--param_lst
-## 15 °--reference_data
-The initAnimal() function creates a subdirectory in projroot/data/individuals/ with different subdirectories for data processing products as well as an animal file containing a list of meta data information such as species, sex, age or reproductive state. Again, if any of the meta data information is not available, set it =NULL.
-#?initAnimal
-
-# initialize ids
-
-?initAnimal
-anml<-initAnimal(proj, saveAnml = TRUE, animalID = proj$tags$ID[1], species = proj$tags$species[1], age = proj$tags$age[1], sex = proj$tags$sex[1], weight = proj$tags$weight[1], rep.state = proj$tags$rep.state[1], freq = proj$tags$frequency[1], start = proj$tags$start[1], end = proj$tags$end[1], duration_min = proj$tags$duration_min[1], duration_max = proj$tags$duration_max[1] )
Folder structure looks like this:
- -## levelName
-## 1 projroot
-## 2 °--data
-## 3 °--individuals
-## 4 °--woodpecker
-## 5 ¦--bearings
-## 6 ¦--bearings_filtered
-## 7 ¦--calibrated
-## 8 ¦--classification
-## 9 ¦--filtered
-## 10 ¦--filtered_awk
-## 11 ¦--gps_timematch
-## 12 ¦--imputed
-## 13 ¦--logger_timematch
-## 14 ¦--station_timematch
-## 15 ¦--triangulations
-## 16 ¦--variables
-## 17 °--woodpecker_idFile.rds
-The work steps described above only have to be carried out once. Afterwards, the project file can simply be read into the working environment with the function getProject () and each individual with the function getAnimal().
-proj<-getProject(projroot ="D:/data_ms_activity_classification/validation/wood_pecker/",plot = TRUE)
## $animalID
-## [1] "woodpecker"
-##
-## $species
-## [1] "Dendrocopos major"
-##
-## $sex
-## [1] "male"
-##
-## $age
-## [1] "ad"
-##
-## $weight
-## [1] "20g"
-##
-## $rep.state
-## [1] "breeding"
-##
-## $freq
-## [1] "150050"
-##
-## $start
-## [1] "2021-06-10"
-##
-## $end
-## [1] "2021-06-14"
-##
-## $duration_min
-## [1] "0.012"
-##
-## $duration_max
-## [1] "0.025"
-In this step, the raw data recorded on the stations is combined into one file per station. NOTE: This part of the tutorial is only valid for data recorded with tRackIT stations. Stations of other designs may have a different data structure. On tRackIT stations, new csv files are created for the current run every time the station reboots. Over the course of one recording period, a large number of individual files can be created, which are to be combined in the next step and saved within the project structure. The read.logger.data.tRackIT() function does this for us. In order for it to do this, the path to the folder containing the raw data must have been specified in the initProject() function (argument logger_data_raw=) and the data must be present in the folder in the following structure: logger_data_raw/name-of-station/radiotracking/files.csv.
- -This step also only needs to be carried out once per project or per data collection per project.
-Processing of individuals involves
-Individuals are filtered from the raw data compiled in the previous step by frequency, signal length, and start and end of the tagging period using the filter.tRackIT() function.
- -Here we use the activity.vars.tRackIT() function for ML-variable calculation. It initially divides the dataset of each station into 5-minute intervals. For each interval, it selects the receiver with the most data entries out of 4 possible receivers. If available, the receiver with the second most entries is also selected. Then it calculates various predictor variables that mirror the variation of the signal strengths over time by applying rolling windows to the classified VHF-data recorded by the tRackIT-stations. In order to smooth out potentially distracting fluctuations in the signal, a Hampel filter as well as a mean and a max filter is applied to the raw data of the main receiver in a rolling window of +/- 10 data entries. For the raw data as well as the smoothed data the variance, standard deviation, kurtosis, skewness and sum of squares, also in a rolling window of +/- 10 data points, are calculated.
- -Here we use the activity.vars.predict.tRackIT() function to apply the trained models to the data created in the step before. The two models (m_r1.rds, m_r2.rds, both>1 kb) from the directory tRackIT_activity/models need to be stored in the extdata directory of the tRackIT package.
- -The classified files are stored in wood_pecker/data/individuals/woodpecker/classification/. Let's take a look:
-data<-data.table::fread(list.files(anml$path$classification, pattern="class",full.names = TRUE)[1])
-
-#cut data to BirdRack observation period
-data<-data[data$timestamp>="2021-06-10 06:00:00" &
- data$timestamp<="2021-06-14 04:00:00",]
-#convert to local timezone
-data$timestamp<-with_tz(data$timestamp, tzone="CET")
-
-#prepare night and day shading in plot
-data$date<-as.Date(data$timestamp)
-data$hour<-hour(data$timestamp)
-data$start<-as.POSIXct(paste0(data$date, " 05:15:00"), tz="CET")
-data$end<-as.POSIXct(paste0(data$date, " 21:30:00"), tz="CET")
-data$hour_shade =ifelse(data$hour >= 21 | data$hour <= 5, "day", "night")
-data$hour_shade[data$date=="2021-06-14"]<-"night"
-
-##plot
-ggplot()+geom_rect(data=data, (aes(xmin=start, xmax=end,ymin=min(max), ymax=max(max), fill=factor(hour_shade))))+scale_fill_manual(values=c( "grey90", "white"), guide="none")+ geom_point(data, mapping=aes(x=timestamp, y=max, color=prediction), alpha=0.6, size=2)+theme_bw(14)+ggthemes::scale_color_wsj()+theme(axis.text=element_text(size=20),axis.title=element_text(size=24,face="bold"),legend.key.size = unit(3, 'point'), legend.text = element_text(size=25), legend.title=element_text(size=20))+ggtitle("Activity pattern of the tagged woodpecker over 4 days")+ ylab("signal strength in dBW")+theme(plot.title = element_text(size=26))+ guides(colour = guide_legend(override.aes = list(size=10)))
Finally, the data can be aggregated by choosing the most frequent class value in a given time interval using the function activity.aggregate.tRackIT(). Here we chose an interval of 1 minute. The aggregated data is in projroot/data/individuals/woodpecker/classification/.
- -For the validation of applicability to birds we attached a transmitter on the back of a middle spotted woodpecker. Next, we set up a daylight variant of our custom-made video recorder units - “BatRack” units - in front of its nesting tree to automatically record videos of its behavior (Gottwald & Lampe et al. 2021; https://nature40.github.io/BatRack/ (vid 2)). BatRacks consisted of a VHF antenna and a video unit connected to a Raspberry Pi mini computer. The camera unit was automatically triggered by the VHF-signal of the woodpecker's transmitter and started recording if the VHF-signal strength exceeded a threshold of -60 dBW, i.e. when the bird flew close to its nesting tree and the BatRack-system. A typical recorded sequence consisted of flying, hopping up the stem and a very short feeding sequence where the bird sat still at the entrance of the nest. Since the feeding sequence was in most cases shorter than 3 consecutive VHF-signals we classified all signals that were simultaneously recorded by one or more of the tRackIT-stations as active. In order to generate sufficient inactive sequences, we sampled random data points from signals recorded by one or more tRackIT-stations each night between 0 and 2 a.m. over four consecutive nights during the BirdRack observation period. Let's see how well our model performs:
-##connection to project
-proj<-getProject("D:/data_ms_activity_classification/validation/wood_pecker/")
-##connect to individual
-anml<-getAnimal(projList=proj, animalID = "woodpecker")
-
-#get all classification data
-dat<-data.table::fread(list.files(paste0(anml$path$root, "/classification/"), full.names = TRUE)[2])
-
-#convert timestamp format to local timezone
-dat$timestamp<-lubridate::with_tz(dat$timestamp, "CET")
-#get data frame with observations and corresponding videos
-tag<-fread(list.files(proj$path$ref, pattern="tagging",full.names = TRUE)[1])
-tag$start<-as.POSIXct(tag$start, tz="CET")
-tag$end<-as.POSIXct(tag$end, tz="CET")
-#select data with observations
-feed<-data.frame()
-for(i in 1:nrow(tag)){
- tmp<-dat[dat$timestamp>=tag$start[i] & dat$timestamp<=tag$end[i],]
-
- feed<-rbind(feed, tmp)
-
-}
-
-#activity observation: short flying sequence to tree--> very short feeding sequence (~3-5 sec)-->hopping sequence upwards tree-->short flying sequence from tree ---> all tagged as active
-feed<-feed[!duplicated(feed$timestamp),]
-table(feed$prediction)
##
-## active passive
-## 8310 431
-feed$observed<-"active"
-
-#passive sequences: random sample of 2200 data points between 0 and 2 am for 4 dates of observation period; 8800 data points
-sleep<-data.frame()
-for(d in c("2021-06-11", "2021-06-12","2021-06-13", "2021-06-14")){
-
- tmp<-dat[dat$timestamp>=paste0(d, " 00:00:00") & dat$timestamp<=paste0(d, " 02:00:00"),]
- tmp<-tmp[sample(nrow(tmp), 2200), ]
- sleep<-rbind(sleep, tmp)
-
-}
-table(sleep$prediction)
##
-## active passive
-## 30 8770
-sleep$observed<-"passive"
-
-#combine to groundtruth
-woodpecker_ground_truth<-rbind(feed, sleep)
-
-#calculate confusion matrix
-cm<-confusionMatrix(as.factor(woodpecker_ground_truth$prediction),as.factor(woodpecker_ground_truth$observed))
-
-cm$byClass
## Sensitivity Specificity Pos Pred Value
-## 0.9506921 0.9965909 0.9964029
-## Neg Pred Value Precision Recall
-## 0.9531573 0.9964029 0.9506921
-## F1 Prevalence Detection Rate
-## 0.9730109 0.4983182 0.4737472
-## Detection Prevalence Balanced Accuracy
-## 0.4754575 0.9736415
-##confusion matrix
-df<-cm$table %>%
- data.frame() %>%
- mutate(Prediction = factor(Prediction, levels = c("active", "passive"))) %>%
- group_by(Reference) %>%
- mutate(
- total = sum(Freq),
- TPR = Freq / total
-
- )
-
-df$Freq<-format(round(as.numeric(df$Freq), 1), nsmall=1, big.mark=",")
-
-
-
-ggplot(df,aes(Prediction, Reference, fill = TPR)) +
- geom_tile(color = "black") +
- geom_text(aes(label = str_c(Freq, ", ", round(TPR * 100), "%")), size = 8) +
- scale_fill_gradient2(low = "red", mid = "white", high = "#badb33") +
- scale_x_discrete(position = "top")+
- theme(text = element_text(size=20), plot.title = element_text(hjust = 0.5))+ggtitle("Woodpecker")