Çakışan kayıtları tespit etmek için data.table
kullanılan bir çözüm foverlaps
(@GenesRus tarafından daha önce belirtildiği gibi). Çakışan kayıtlar, kaydı maks. grup önceliği. Bu yordamın üç veya daha fazla çakışan kayıt için de çalıştığını göstermek üzere örnek verilerinize iki kayıt daha ekledim:
Düzenleme: Ben data.table
daha hızlı kod verir @ pgcudahy çözüm değiştirildi ve tercüme :
library(data.table)
library(lubridate)
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
resultDT <- setDT(times_df, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
# old approach ------------------------------------------------------------
# times_dt <- as.data.table(times_df)
# setkey(times_dt, start, stop)[, index := .I]
# overlaps_dt <- foverlaps(times_dt, times_dt, type = "any", which = TRUE)[xid != yid][, group := fifelse(xid > yid, yes = paste0(yid, "_", xid), no = paste0(xid, "_", yid))]
# overlaps_merged <- merge(times_dt, overlaps_dt, by.x = "index", by.y = "xid")[, .(delete_index = index[priority == max(priority)]), by = "group"]
# result_dt <- times_dt[!unique(overlaps_merged$delete_index)][, index := NULL]
Diğer detaylar için bakınız lütfen İçin ?foverlaps
- gibi bir çakışma olarak kabul edilen kontrol etmek uygulanan biraz daha yararlı özellik vardır maxgap
, minoverlap
ya type
(herhangi bir, içinde uç ve eşit, start).
Güncelleme - yeni kıyaslama
Unit: microseconds
expr min lq mean median uq max neval
Paul 25572.550 26105.2710 30183.930 26514.342 29614.272 153810.600 100
MKa 5100.447 5276.8350 6508.333 5401.275 5832.270 23137.879 100
pgcudahy 3330.243 3474.4345 4284.640 3556.802 3748.203 21241.260 100
ismirsehregal 711.084 913.3475 1144.829 1013.096 1433.427 2316.159 100
Karşılaştırma kodu:
#### library ----
library(dplyr)
library(lubridate)
library(igraph)
library(data.table)
library(microbenchmark)
#### data ----
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
times_tib <- as_tibble(times_df)
times_dt <- as.data.table(times_df)
#### group_interval function ----
# buffer to take a form similar to: days(1), weeks(2), etc.
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
# apply buffer period to intervals
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
# Find groups via graph theory See igraph package
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
# create a 2 column df with row (index) and group number, arrange on row number and return distinct values
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
# returns
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
#### benchmark ----
library(igraph)
library(data.table)
library(dplyr)
library(lubridate)
library(microbenchmark)
df_Paul <- df_MKa <- df_pgcudahy <- df_ismirsehregal <- times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-07 06:00:00",
"2019-10-07 06:10:00",
"2019-10-07 06:20:00",
"2019-10-08 06:00:00",
"2019-10-08 06:10:00",
"2019-10-08 06:20:00",
"2019-10-09 03:00:00",
"2019-10-09 03:10:00",
"2019-10-10 03:00:00",
"2019-10-10 03:10:00",
"2019-10-11 05:00:00",
"2019-10-11 05:00:00")
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-07 06:18:00",
"2019-10-07 06:28:00",
"2019-10-07 06:38:00",
"2019-10-08 06:18:00",
"2019-10-08 06:28:00",
"2019-10-08 06:38:00",
"2019-10-09 03:30:00",
"2019-10-09 03:20:00",
"2019-10-10 03:30:00",
"2019-10-10 03:20:00",
"2019-10-11 05:40:00",
"2019-10-11 05:40:00")
),
priority = c(5, 3, 4, 3, 4, 5, 4, 3, 4, 3, 4, 3, 1, 2, 2, 1, 3, 4)
)
benchmarks <- microbenchmark(Paul = {
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
times_tib <- as_tibble(df_Paul)
mutate(times_tib, group = group_interval(start, stop)) %>%
group_by(group) %>%
top_n(1, desc(priority)) %>%
ungroup() %>%
select(-group)
},
MKa = {
df_MKa$id <- 1:nrow(df_MKa)
# Create consolidated df which we will use to check if stop date is in between start and stop
my_df <- bind_rows(replicate(n = nrow(df_MKa), expr = df_MKa, simplify = FALSE))
my_df$stop_chk <- rep(df_MKa$stop, each = nrow(df_MKa))
# Flag if stop date sits in between start and stop
my_df$chk <- my_df$stop_chk >= my_df$start & my_df$stop_chk <= my_df$stop
my_df$chk_id <- df_MKa[match(my_df$stop_chk, df_MKa$stop), "id"]
# Using igrpah to cluster ids to create unique groups
# this will identify any overlapping groups
library(igraph)
g <- graph.data.frame(my_df[my_df$chk == TRUE, c("id", "chk_id")])
df_g <- data.frame(clusters(g)$membership)
df_g$chk_id <- row.names(df_g)
# copy the unique groups to the df
my_df$new_id <- df_g[match(my_df$chk_id, df_g$chk_id), "clusters.g..membership"]
my_df %>%
filter(chk == TRUE) %>%
arrange(priority) %>%
filter(!duplicated(new_id)) %>%
select(start, stop, priority) %>%
arrange(start)
}, pgcudahy = {
df_pgcudahy %>%
arrange(start) %>%
mutate(remove1 = ifelse((stop >= lead(start, default=FALSE)) &
(priority > lead(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
mutate(remove2 = ifelse((start <= lag(stop, default=FALSE)) &
(priority > lag(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
filter(remove1 == FALSE & remove2 == FALSE) %>%
select(1:3)
}, ismirsehregal = {
setDT(df_ismirsehregal, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
})
benchmarks
combn
, ancak çok fazla satırınız varsa pahalı olabilir.times_df %>% mutate(interval = interval(start, stop)) %>% {combn(nrow(.), 2, function(x) if (int_overlaps(.$interval[x[1]], .$interval[x[2]])) x[which.min(.$priority[x])], simplify = FALSE)} %>% unlist() %>% {slice(times_df, -.)}