# The data.table way: this feels faster than dplyr, but it isn't very
# functional (FP) when using the := operator, which modifies the table in place.
# Alternatively, use the .() (aka list()) feature in j to create a new table. Still faster than dplyr or plyr.
# https://mran.microsoft.com/web/packages/data.table/vignettes/datatable-intro.html
library(data.table) # for fread and other data.table functions
library(tidyverse)  # for as_tibble to feed into ggplot
library(lubridate)  # for round_date
library(fasttime)   # for fastPOSIXct
# Read the CSV into a data.table.
dt01 <- fread("C:/kewoo/eai/d20171024.0930-1055.allEAI.csv")

# Quick look at what was loaded: structure/column types, row count, column names.
str(dt01)
nrow(dt01)
names(dt01)

# Add start/end timestamps rounded to 10 seconds to dt01, by reference (:=).
# The same two columns are computed three ways below for comparison; each
# statement overwrites the columns produced by the previous one.

# 1. One column per statement.
dt01[, startPct := round_date(as.POSIXct(start), unit = "10 seconds")]
dt01[, endtPct := round_date(as.POSIXct(endt), unit = "10 seconds")]

# 2. Both columns in the same statement. A bit hard to read though, because
#    the column names are separated from their definitions by the := token.
dt01[, c("startPct", "endtPct") := list(
  round_date(as.POSIXct(start), unit = "10 seconds"),
  round_date(as.POSIXct(endt), unit = "10 seconds")
)]

# 3. Gain speed using fasttime: replace as.POSIXct() with fastPOSIXct().
#    What happens? Takes ~ 3 sec instead of ~ 11 sec.
#    https://stackoverflow.com/questions/29140416/r-data-table-fread-read-column-as-date
#    https://stackoverflow.com/questions/12786335/why-is-as-date-slow-on-a-character-vector
#    https://cran.r-project.org/web/packages/fasttime/fasttime.pdf
#    The functional form of := keeps each column name next to its definition.
dt01[, `:=`(
  startPct = round_date(fastPOSIXct(start), unit = "10 seconds"),
  endtPct = round_date(fastPOSIXct(endt), unit = "10 seconds")
)]

# Same derived columns without using := -- j returns a brand-new table
# (dt01b) holding only the listed columns.
dt01b <- dt01[, .(
  transactionid,
  startPct = round_date(fastPOSIXct(start), unit = "10 seconds"),
  endtPct = round_date(fastPOSIXct(endt), unit = "10 seconds")
)]

# Some magic happens here: per transactionid, expand the rounded start/end pair
# into every 10-second tick between them, giving one row per (transactionid, tick).
# Uses the startPct/endtPct columns that were added to dt01 by reference.
dt02 <- dt01[, .(ints = seq(startPct, endtPct, by = 10)), by = transactionid]
# Grouping by correlationid instead fails with "'from' must be of length 1",
# because correlationid isn't unique for start/endt pairs:
# dt02 = dt01[, list(ints = seq(startPct, endtPct, by=10)), by = correlationid]
# Variant: filter on componentname, list ints and componentname, group by transactionid:
# dt02 = dt01[componentname %like% 'AcurityConnector', list(ints = seq(startPct, endtPct, by=10), componentname), by = transactionid]

# Count the rows of dt02 that fall on each tick.
# Wrapping "freq = .N" in .() (alias for list()) ensures a data.table object is
# returned (https://mran.microsoft.com/web/packages/data.table/vignettes/datatable-intro.html)
dt03 <- dt02[, .(freq = .N), by = ints]

# ggplot is fed a tibble version of the counts.
tb01 <- as_tibble(dt03)
ggplot(tb01, aes(x = ints, y = freq)) +
  geom_line(color = "blue")

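# PS: tidyverse+plyr returns different results from data.table,
# possibly because different date conversions are being used at the time of data load
# (e.g. fastPOSIXct() always parses its input as GMT/UTC, whereas as.POSIXct()
# uses the local time zone by default).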

# 20171109: chain data.tables, split over multiple lines
#           show transaction flight behaviours over time

# Count, for every tick of `step_secs` seconds, how many rows of `dt` span it.
#
# Steps (one chained data.table expression):
#   1. round each row's start/endt to the nearest `step_secs` seconds;
#   2. per transactionid, expand the rounded pair into every tick between them;
#   3. count rows per tick.
#
# Args:
#   dt        data.table with `transactionid`, `start` and `endt` columns.
#             Each transactionid must map to a single start/endt pair,
#             otherwise seq() fails with "'from' must be of length 1".
#   step_secs tick width in seconds; drives both the rounding unit and the
#             seq() step so the two cannot drift apart. Defaults to 10.
#
# Returns a tibble with columns `intervals` (the tick) and `txCount`.
count_tx_per_interval <- function(dt, step_secs = 10) {
  round_unit <- paste(step_secs, "seconds")
  dt[, list(transactionid,
            startPct = round_date(fastPOSIXct(start), round_unit),
            endtPct = round_date(fastPOSIXct(endt), round_unit))
     ][, list(intervals = seq(startPct, endtPct, by = step_secs)), by = transactionid
       ][, list(txCount = .N), by = intervals] %>%
    as_tibble()
}

# All rows of dt01.
tb01.allEAI <- count_tx_per_interval(dt01)

# Only rows whose componentname matches 'AcurityConnector'.
tb01.AC <- count_tx_per_interval(dt01[componentname %like% 'AcurityConnector'])

# Overlay the two per-interval counts: AcurityConnector subset in blue,
# all rows in red. Both layers share the same x/y mapping.
ggplot(mapping = aes(x = intervals, y = txCount)) +
  geom_line(data = tb01.AC, color = "blue") +
  geom_line(data = tb01.allEAI, color = "red")