knitr::opts_chunk$set(
  warning=FALSE,
  message=FALSE,
  results='hide')

In 2015, Indego, Philadelphia’s bike share program, was launched, bringing another convenient mode of transportation to the city. Indego subscribers can ride the public bicycles between any of the 130+ stations in the city, locking and unlocking them at either end of the trip. Each station has a limited amount of docks, so sometimes Indego runs into the issue where a station is full and no more bikes can be locked, or the station is empty and no bikes can be unlocked. While in a one-off situation the customer can find the nearest station that has a bike or open dock, if Indego is to serve consistent travel patterns, customers need to be able to rely on the system to have bikes at their start and empty docks at their end.

In order to rebalance the amount of bikes in each station, Indego deploys two strategies. First, they use a fleet of sprinter vans and employees to physically move bikes from where there are many to where there are few. This works well to alleviate pressure but can be cumbersome and costly. The second rebalancing tactic is the IndeHero program. Customers receive points for starting a ride at a full station and ending a ride at an empty station. This creates an open dock or leaves a bike for someone who otherwise may not have had one and would have to search for a nearby station. Once enough points are accrued, they are redeemed for extra subscription time, thus saving the customer money.

These tactics could deployed reactively, that is, once a dock is too full or empty, sending out a rebalancing van and/or offering the IndeHero points. However, with the large amount of historical ridership data, as well as data on weather and a few other factors, we can predict the amount of bikes at a station at a given time and proactively deploy these resources. This analysis attempts to predict the amount of bikes at a station one hour ahead, in order to give reccomendations for rebalancing deployment.

library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(gganimate)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(cowplot)
library(geojsonR)

options(tigris_class = "sf")
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")

palette2 <- c("#6baed6","#08519c")

palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
palette4 <- c("#D2FBD4","#92BCAB","#527D82","#123F5A")

rides <- read.csv("C:/Users/Charlie/Documents/GIS/5thSq/IndegoMap/RideData/indego-trips-2016-q1.csv")

cd <- "C:/Users/Charlie/Documents/GIS/5thSq/IndegoMap/RideData/indego-trips"
years <- c("2022")
qs <- c("q2")

indegofull <- rides[0,]

for (year in years){
  for (q in qs){
    df <- read.csv(paste(paste(cd,paste(year,q, sep = "-"),sep = "-"),".csv", sep = ""))
    if("bike_type" %in% colnames(df)== F){
      df$bike_type <- rep(NA, nrow(df))
    }
    
    df$start_time2 <- df$start_time
    df <- separate(df, start_time2, into = c('date','time'), sep = " ")
    
    if(substr(df$date[1], 1, 2) == '20'){
      df$date <- df$date %>% as.Date(format = "%Y-%m-%d")
    }else{
      df$date <- df$date %>% as.Date(format = "%m/%d/%Y")
    }
    
    
    indegofull <- rbind(indegofull, df)
    print(paste(year,q,sep="-"))
  }
}




#bring in stations

statlatlong <- data.frame(df$start_station, df$start_lat, df$start_lon)

statlatlong <- statlatlong[!duplicated(statlatlong),]
statlatlong <- statlatlong %>% rename('Station_ID' = 'df.start_station',
                                      'lat'='df.start_lat',
                                      'lon'='df.start_lon')
stations <- read.csv("C:/Users/Charlie/Documents/GIS/5thSq/IndegoMap/indego-stations-2021-07-01.csv")


#statlatlong$Station_ID <- statlatlong$Station_ID %>% as.numeric()
stats <- left_join(x = stations, y = statlatlong)

stats <- stats %>% rename('dateopen' = "Day.of.Go_live_date") 


stats <- stats %>% filter(!is.na(lat)) %>% st_as_sf(coords = c('lon','lat'),crs = (4326))


###DOCK STATION SHIT

stats_geojson<- FROM_GeoJson(url_file_string = 'https://kiosks.bicycletransit.workers.dev/phl')

statdocks <- data.frame(totaldocks = rep(x=NA, 184), station_name = rep(x = NA,184))

stats_geojson$features[[3]]$properties$name
stats_geojson$features[[3]]$properties$totalDocks

for(i in 1:184){
  statdocks[i,] <- c(stats_geojson$features[[i]]$properties$totalDocks,
                     stats_geojson$features[[i]]$properties$name)
}


stats <- left_join(stats, statdocks, by = c('Station_Name' = 'station_name'))


#get time frame
indegofull <- indegofull %>% filter(start_station != 3000 & end_station != 3000)

#lubridate

indegofull$year <- indegofull$date %>% year()
indegofull$week <- indegofull$date %>% week()

indegofull$start_time_60 <- indegofull$start_time %>% mdy_hm() %>% floor_date(unit = '1 hour')
indegofull$end_time_60 <- indegofull$end_time %>% mdy_hm() %>% floor_date(unit = '1 hour') 

indegofull$endweek <- indegofull$end_time_60 %>% week()

#station names

indegofull <- left_join(indegofull, stats, by = c('start_station' = 'Station_ID'))
indegofull <- rename(indegofull, start_station_name = 'Station_Name')

indegofull <- left_join(indegofull, stats %>%
                          st_drop_geometry() %>% 
                          select('Station_ID', 'Station_Name'),
                        by = c('end_station' = 'Station_ID'))

indegofull <- rename(indegofull, end_station_name = 'Station_Name')



indego5week <- indegofull %>% filter(year == 2022 & week %in% seq(15,19) & endweek  %in% seq(15,19))

study.panel <- 
  expand.grid(hour_unit = unique(indego5week$start_time_60 ), 
              station_name = unique(indego5week$end_station_name))

nrow(study.panel)      

#fill panel
bike.panel.starts <- 
  indego5week %>%
  mutate(Start_Counter = 1) %>%
  right_join(study.panel, by = c('start_time_60' = 'hour_unit', 'start_station_name' = 'station_name')) %>% 
  group_by(start_time_60, start_station_name) %>%
  summarize(start_count = sum(Start_Counter, na.rm=T)) %>% 
  rename('hour_unit' = start_time_60,
         'station_name' = start_station_name)

bike.panel.ends <- 
  indego5week %>%
  mutate(End_Counter = 1) %>%
  right_join(study.panel, by = c('end_time_60' = 'hour_unit', 'end_station_name' = 'station_name')) %>% 
  group_by(end_time_60, end_station_name) %>%
  summarize(end_count = sum(End_Counter, na.rm=T))  %>% 
  rename('hour_unit' = end_time_60,
         'station_name' = end_station_name)


bike.panel <- left_join(bike.panel.starts,bike.panel.ends) %>% filter(!station_name %>% is.na())

bike.panel$net <- bike.panel$start_count - bike.panel$end_count
bike.panel$total <- bike.panel$start_count + bike.panel$end_count

bike.panel <- 
  bike.panel %>% 
  arrange(station_name, hour_unit)

bike.panel$net_over_time <- seq(0,1,nrow(bike.panel))
for(i in 2:nrow(bike.panel)){
  if(bike.panel$station_name[i] != bike.panel$station_name[i-1]){
    bike.panel$net_over_time[i] <- bike.panel$net[i]
  } else {
    bike.panel$net_over_time[i] <- bike.panel$net_over_time[i-1] + bike.panel$net[i]
  }
}

bike.panel$total_over_time <- seq(0,1,nrow(bike.panel))
for(i in 2:nrow(bike.panel)){
  if(bike.panel$station_name[i] != bike.panel$station_name[i-1]){
    bike.panel$total_over_time[i] <- bike.panel$total[i]
  } else {
    bike.panel$total_over_time[i] <- bike.panel$total_over_time[i-1] + bike.panel$total[i]
  }
}

weather.Data <- 
  riem_measures(station = "PHL", date_start = "2022-01-01", date_end = "2022-12-12")

weather.Panel <-  
  weather.Data %>%
  mutate_if(is.character, list(~replace(as.character(.), is.na(.), "0"))) %>% 
  replace(is.na(.), 0) %>%
  mutate(interval60 = ymd_h(substr(valid, 1, 13))) %>%
  mutate(week = week(interval60),
         dotw = wday(interval60, label=TRUE)) %>%
  group_by(interval60) %>%
  summarize(Temperature = max(tmpf),
            Percipitation = sum(p01i),
            Wind_Speed = max(sknt)) %>%
  mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))



#time features
bike.panel <- 
  bike.panel %>% 
  arrange(station_name, hour_unit) %>% 
  group_by(station_name) %>% 
  mutate(sclagHour = dplyr::lag(start_count,1),
         sclag2Hours = dplyr::lag(start_count,2),
         sclag3Hours = dplyr::lag(start_count,3),
         sclag12Hours = dplyr::lag(start_count,12),
         sclag1day = dplyr::lag(start_count,24),
         lagHour = dplyr::lag(net,1),
         lag2Hours = dplyr::lag(net,2),
         lag3Hours = dplyr::lag(net,3),
         lag12Hours = dplyr::lag(net,12),
         lag1day = dplyr::lag(net,24),
         ) %>% 
  ungroup()




bike.panel <- 
  bike.panel %>% 
  left_join(weather.Panel, by = c('hour_unit' = 'interval60')) %>%
  left_join(stats %>% select(geometry, Station_Name), by = c('station_name' = 'Station_Name')) %>%
  mutate(week = week(hour_unit),
         dotw = wday(hour_unit, label = TRUE)) %>%
  st_sf()

mondays <- 
  mutate(bike.panel,
         monday = ifelse(dotw == "Mon" & hour(hour_unit) == 1,
                         hour_unit, 0)) %>%
  filter(monday != 0) 

bike.panel$hour <- bike.panel$hour_unit %>% hour()
bike.panel$day <- bike.panel$hour_unit %>% day()

bike.Train <- filter(bike.panel, week < 18)
bike.Test <- filter(bike.panel, week >= 18)

Feature Engineering

Target Variables

Ride Starts

The first target variable for our analysis is the amount of trip starts from an Indego station per hour. This plot looks at the temporal layout of ride starts for one location. We see a weekly pattern, as well as a 3 day period of high ridership and and a 3 day period of low ridership.

st_drop_geometry(rbind(
  mutate(bike.Train, Legend = "Training"), 
  mutate(bike.Test, Legend = "Testing"))) %>%
  group_by(hour_unit, station_name) %>% 
  summarize(total = sum(total),net_over_time = net_over_time, start_count = start_count, Legend = Legend, statname = station_name) %>% 
  ungroup() %>% 
  filter(station_name == 'Philadelphia Museum of Art') %>% 
  
ggplot(aes(hour_unit, start_count, color = Legend)) + geom_line() +
  scale_colour_manual(values = palette2)+ 
  geom_vline(data = mondays, aes(xintercept = monday)) +
  labs(title="Rides originating from Philadelphia Museum of Art",
    subtitle="Between April 9th 2021 and May 13th 2021", 
    x="Trip starts per Hour", y="") +
  plotTheme() + theme()

h <- bike.panel %>% 
    group_by(station_name) %>% 
  summarize(starts = sum(start_count),
            total = sum(total),
            net = sum(net)) %>% .[order(-.$total),] %>%
  mutate(stats5 = ifelse(total %in% quantile(.$total), 'y','n')) %>% 
  left_join(statdocks)

h2 <- ggplot(h)+ 
  geom_bar(aes(y=total,x=reorder(station_name, total)),fill = '#0085ca', stat = 'identity')+
  coord_flip()+
  theme_minimal()+
  theme(legend.position = 'none',  plot.title = element_text(hjust = .85, vjust=0))+
  
  labs(title='Total ride starts over 5 week range', x = '', y = '')

ggdraw() +
  draw_plot(h2)

Net Rides

The second target variable we use is the net amount of trips (trips started - trips ended) at a station per hour. Stations that receive more bikes than they disperse will require a method for bikes to be removed, and stations that disperse more bikes than they receive will need a method for bikes to be entered.

st_drop_geometry(rbind(
  mutate(bike.Train, Legend = "Training"), 
  mutate(bike.Test, Legend = "Testing"))) %>%
  group_by(hour_unit, station_name) %>% 
  summarize(total = sum(total),net_over_time = net_over_time, start_count = start_count, Legend = Legend, statname = station_name) %>% 
  ungroup() %>% 

ggplot(aes(hour_unit, net_over_time, group = station_name, color = net_over_time)) + geom_line() +
  scale_color_gradient2(mid = 'grey')+
  geom_vline(data = mondays, aes(xintercept = monday)) +
  labs(title="Spread of net trips at each station",
    subtitle="Between April 9th 2021 and May 13th 2021", 
    x="Total Net Trips", y='') +
  plotTheme() + theme(legend.position = 'none')    

Predictor Variables

We take into account the weather during each hour of our study period. People are less likely to travel by bicycle with inclement weather.

grid.arrange(top = "Weather Data in Philadelphia over study period",
             ggplot(weather.Panel, aes(interval60,Percipitation)) + geom_line() + 
               labs(title="Percipitation", x="Hour", y="Percipitation") + plotTheme(),
             ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() + 
               labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme(),
             ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() + 
               labs(title="Temperature", x="Hour", y="Temperature") + plotTheme())

We created temporal lag variables to add to our models. Knowing the amount of trips an hour before is a good indicator of the amount of trips in the current hour.

Other potential variables could include event start times and lags.

Map Animation

This animation runs for the duration of the study period, visualizing the total amount of trip and net amount of trip for each station. We see clustering of high-use net negative stations in Center City, while the high-use net positive stations are in the neighborhoods adjacent to Center City. This animation is zoomed in on Center City; some stations are beyond the boundaries of the map.

river <- area_water('PA', county = 'Philadelphia') %>%
  st_as_sf()%>%
  st_transform(crs=4326) %>% filter(AWATER > 10000) %>% 
  st_crop(y = st_bbox(stats))
bounds <- c(ymax = 39.98, ymin = 39.924, xmin =-75.211, xmax = -75.13)


rideshare_animation_b <-
  ggplot() +
  geom_sf(data=river%>%  st_crop(y = bounds),color= NA, fill = 'blue')+
  geom_sf(data = bike.panel %>%  st_crop(y = bounds), aes(color = net_over_time, size = total_over_time)) +
  scale_color_gradient2(mid = 'grey')+
  labs(title = "Station Net Bike Upkeep",
       subtitle = "60 minute intervals: {current_frame}") +
  transition_manual(hour_unit) +
  mapTheme()

animate(rideshare_animation_b, duration=20, renderer = gifski_renderer())

Modeling

OLS Regressions

Ride Starts Models

As ride starts is a count variable, we use a Poisson regression to predict the amount of ride starts for each station per hour during the test weeks. We layer different groups of independent variables to see their relative effects. The Mean Average Error decreases as features are added to the model. Overall it predicts better onto week 19 than 18. On average, the most accurate model mis-predicts trip starts by station per hour by slightly higher than 0.6, which is fairly accurate.

model_pred <- function(dat, fit){
  pred <- predict(fit, newdata = dat)}

## start count regressions 
reg1 <- lm(start_count ~  hour + dotw + Temperature, data=bike.Train, )
reg2 <- lm(start_count ~  station_name + dotw + Temperature, data=bike.Train)
reg3 <- lm(start_count ~  station_name + hour + dotw + Temperature, data=bike.Train)
reg4 <- lm(start_count ~  station_name + hour + dotw + Temperature + 
             sclagHour + sclag2Hours + sclag3Hours + sclag12Hours + sclag1day, data=bike.Train)

bike.Test.weekNest <- 
  as.data.frame(bike.Test) %>%
  nest(-week) 


week_predictions <- 
  bike.Test.weekNest %>% 
  mutate(A_Time_FE = map(.x = data, fit = reg1, .f = model_pred),
         B_Space_FE = map(.x = data, fit = reg2, .f = model_pred),
         C_Space_Time_FE = map(.x = data, fit = reg3, .f = model_pred),
         D_Space_Time_Lags = map(.x = data, fit = reg4, .f = model_pred))



week_predictions <- week_predictions %>%  
  gather(Regression, Prediction, -data, -week) %>% 
  mutate(Observed = map(data, pull, start_count),
         Absolute_Error = map2(Observed, Prediction,  ~ abs(.x - .y)),
         MAE = map_dbl(Absolute_Error, mean),
         sd_AE = map_dbl(Absolute_Error, sd)) 


week_predictions %>% 
  dplyr::select(week, Regression, MAE) %>%
  gather(Variable, MAE, -Regression, -week) %>%
  ggplot(aes(week, MAE)) + 
  geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
  scale_fill_manual(values = palette5) +
  scale_x_continuous(breaks = c(18,19))+
  labs(title = "Mean Absolute Errors by model specification and training week") +
  plotTheme()

week_predictions %>% 
  mutate(hour_unit = map(data, pull, hour_unit),
         station_name = map(data, pull, station_name)) %>%
  dplyr::select(hour_unit, station_name, Observed, Prediction, Regression) %>%
  unnest(cols = c(hour_unit, station_name, Observed, Prediction)) %>%
  gather(Variable, Value, -Regression, -hour_unit, -station_name) %>%
  group_by(Regression, Variable, hour_unit) %>%
  summarize(Value = mean(Value)) %>%
  ggplot(aes(hour_unit, Value, colour=Variable)) + geom_line(size = 1.1) + 
  facet_wrap(~Regression, ncol=1) +
  scale_colour_manual(values = palette2) +
  labs(title = "Mean Predicted/Observed start count by hourly interval", 
       x = "Hour", y= "Trip Starts") +
  plotTheme()

Net Rides Models

The net trips model struggles compared to the start count model. As net trips can have a negative value, our models had to be linear regressions and not Poisson regressions. The MAE actually rose as features were added, most notably with the lag variables.

reg1 <- lm(net ~  hour + dotw + Temperature, data=bike.Train)
reg2 <- lm(net ~  station_name + dotw + Temperature, data=bike.Train)
reg3 <- lm(net ~  station_name + hour + dotw + Temperature, data=bike.Train)
reg4 <- lm(net ~  station_name + hour + dotw + Temperature + 
             lagHour + lag2Hours + lag3Hours + lag12Hours + lag1day, data=bike.Train)

bike.Test.weekNest <- 
  as.data.frame(bike.Test) %>%
  nest(-week) 



week_predictions <- 
  bike.Test.weekNest %>% 
  mutate(A_Time_FE = map(.x = data, fit = reg1, .f = model_pred),
         B_Space_FE = map(.x = data, fit = reg2, .f = model_pred),
         C_Space_Time_FE = map(.x = data, fit = reg3, .f = model_pred),
         D_Space_Time_Lags = map(.x = data, fit = reg4, .f = model_pred))



week_predictions <- week_predictions %>%  
  gather(Regression, Prediction, -data, -week) %>% 
  mutate(Observed = map(data, pull, net),
         Absolute_Error = map2(Observed, Prediction,  ~ abs(.x - .y)),
         MAE = map_dbl(Absolute_Error, mean),
         sd_AE = map_dbl(Absolute_Error, sd)) 


week_predictions %>%
  dplyr::select(week, Regression, MAE) %>%
  gather(Variable, MAE, -Regression, -week) %>%
  ggplot(aes(week, MAE)) + 
  geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
  scale_fill_manual(values = palette5) +
  labs(title = "Mean Absolute Errors by model specification and week") +
  plotTheme()

week_predictions %>% 
  mutate(hour_unit = map(data, pull, hour_unit),
         station_name = map(data, pull, station_name)) %>%
  dplyr::select(hour_unit, station_name, Observed, Prediction, Regression) %>%
  unnest() %>%
  gather(Variable, Value, -Regression, -hour_unit, -station_name) %>%
  group_by(Regression, Variable, hour_unit) %>%
  summarize(Value = mean(Value)) %>%
  ggplot(aes(hour_unit, Value, colour=Variable)) + geom_line(size = 1.1) + 
  facet_wrap(~Regression, ncol=1) +
  scale_colour_manual(values = palette2) +
  labs(title = "Mean Predicted/Observed ride share by hourly interval", 
       x = "Hour", y= "Rideshare Trips") +
  plotTheme()

Cross Validation

As the net trips model is less accurate, we only cross validate the start count model. We do both spatial and temporal LOGO cross validations. For the spatial LOGO, we run use the station name as a holdout, while for the temporal LOGO each week is held out.

The range of errors for the spatial CV is substantially, with a big outlier above 1. This means the model is not very generalizable across the stations. To run this cross validation, the station fixed effect had to be removed, which has a large influence on the model.

The temporal CV is much more compact, meaning the model predicts well across weeks. As the same travel patterns tend to repeat themselves week to week, this makes sense.

spatial.cv.vars <- bike.panel %>% st_drop_geometry() %>% select(hour, dotw, Temperature,
                                                                sclagHour, sclag2Hours, sclag3Hours, sclag12Hours, sclag1day) %>% colnames()


spatial.cv <- crossValidate(
  dataset = bike.panel,
  id = "station_name",
  dependentVariable = "start_count",
  indVariables = spatial.cv.vars) %>%
  dplyr::select(cvID = station_name, start_count, Prediction, geometry)

temporal.cv.vars <- bike.panel %>% st_drop_geometry() %>% select(station_name, hour, dotw, Temperature,
                                                                 sclagHour, sclag2Hours, sclag3Hours, sclag12Hours, sclag1day) %>% colnames()


temporal.cv <- crossValidate(
  dataset = bike.panel,
  id = "week",
  dependentVariable = "start_count",
  indVariables = temporal.cv.vars) %>%
  dplyr::select(cvID = week, start_count, Prediction, geometry)




reg.summary <- 
  rbind(
    mutate(spatial.cv,           Error = Prediction - start_count,
           Regression = "Spatial CV"),
    
    mutate(temporal.cv,        Error = Prediction - start_count,
           Regression = "Temporal CV")) %>%
  st_sf() 

error_by_reg_and_fold <- 
  reg.summary %>%
  group_by(Regression, cvID) %>% 
  summarize(Mean_Error = mean(Prediction - start_count, na.rm = T),
            MAE = mean(abs(Mean_Error), na.rm = T),
            SD_MAE = mean(abs(Mean_Error), na.rm = T)) %>%
  ungroup()

error_by_reg_and_fold %>%
  ggplot(aes(MAE)) + 
  geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
  facet_wrap(~Regression) +  
  geom_vline(xintercept = 0) + scale_x_continuous(breaks = seq(0, 8, by = 1)) + 
  labs(title="Distribution of MAE",
       x="Mean Absolute Error", y="Count") +
  plotTheme()

Conclusion

Overall, our model does decent job to inform us when to expect influxes of bikes at most stations. The time horizon in our predictions is quite far, so there is ample time to coordinate re-balancing resource deployment. Ideally, if we had current counts at the stations, we could use this model to predict when stations will overflow or be empty and can deploy resources then. Perhaps extra features, such as event starts and lags could be helpful to pickup odd times when lots of trips are made to and from an area on an irregular basis.