overview

Revenue and client quantity distributions are rarely the same. This scoring mechanism seeks to normalize customer behavior by assigning specific actions to newly labeled cohorts. While its origins date back to the 1930’s, there are virtually limitless variations on this scoring mechanism and it remains a powerful tool that can be utilized in a wide variety of environments with limited data.

# REQUIRED PACKAGES
require(dplyr)
require(stats)
require(data.table)
require(RODBC) 
require(DBI)
require(RODBCext)

#PULL CONNECTION 
my_connection <- odbcDriverConnect('driver={SQL Server};server=SERVER;database=DATABASE;trusted_connection=true')

#sales order table needed for instead

sales <- sqlQuery(my_connection, '
select 
  Qty
, Amt
, CustomerDimKey
, DateDimKey
from 
  TABLE WITH DATA
group by 
  Qty
, Amt
, CustomerDimKey
, DateDimKey
                 ') 

sales_customer <- sqlQuery(my_connection, '
select 
  SUM(Qty) as Sales
, CustomerDimKey
                           
from 
  TABLE WITH DATA
group 
  by CustomerDimKey
                           
                 ') 

top_filter       <- 20
min_purchase_qty <- 100

#FILTER OUT TOP CUSTOMERS TO GET A LIST OF TARGET CUSTOMERS 
customers_top    <- sales_customer[order(-sales_customer$Sales), ]
customers_target <- customers_top[(top_filter + 1):(nrow(customers_top)),]

#FILTER OUT CUSOTMERS WHO HAVE NOT PURCHASED MORE THAN 5 ITEMS
customers_target <- customers_target[customers_target$Sales > min_purchase_qty,]

#FILTER TARGET CUSTOMERS IN TRANSACTION LIST 
sales_transactions <- sales %>% 
                        filter(CustomerDimKey %in% customers_target$CustomerDimKey)

#convert to date 
sales_transactions$DateDimKey <- as.character(sales_transactions$DateDimKey)
sales_transactions$DateDimKey <- as.Date(sales_transactions$DateDimKey, "%Y %m %d")

#filter out 0 value sales qty and group by date 
sales_transactions <- aggregate(.~ DateDimKey + CustomerDimKey, sales_transactions, sum)
sales_transactions <- sales_transactions[sales_transactions$Qty != 0,]

#locate first and last sold date by customer and merge to main transaction df
customer_first_sale <- aggregate(sales_transactions$DateDimKey, by=list(sales_transactions$CustomerDimKey), min)
  colnames(customer_first_sale) <- c("CustomerDimKey", "FirstSoldDate")
  
customer_last_sale  <- aggregate(sales_transactions$DateDimKey, by=list(sales_transactions$CustomerDimKey), max)
  colnames(customer_last_sale) <- c("CustomerDimKey", "LastSoldDate")

sales_transactions <- merge(sales_transactions, customer_first_sale, by = c("CustomerDimKey"))
sales_transactions <- merge(sales_transactions, customer_last_sale, by = c("CustomerDimKey"))

#calculate cumsum qty ordered
sales_transactions <- arrange(sales_transactions, 
             CustomerDimKey, 
             DateDimKey)

sales_transactions$PurchasedQtyCumSum <- as.vector(ave(sales_transactions$Qty, 
                                                    sales_transactions$CustomerDimKey, 
                                                    FUN = cumsum))

sales_transactions$PurchasedAmtCumSum <- as.vector(ave(sales_transactions$Amt, 
                                                    sales_transactions$CustomerDimKey, 
                                                    FUN = cumsum))

#transaction count
sales_transactions <- data.table(sales_transactions)
sales_transactions <- sales_transactions[, TransactionCount := sequence(.N), by = CustomerDimKey]

#age 
today <- Sys.Date()
sales_transactions$AgeAtTrans <- as.numeric(sales_transactions$DateDimKey - sales_transactions$FirstSoldDate)
sales_transactions$CustomerAge <- as.numeric(today - sales_transactions$FirstSoldDate)

#calculate metrics for segmentation 
#recency 
sales_transactions$DaysSinceLastTransaction <- today - sales_transactions$LastSoldDate

#frequency of order 
sales_transactions$OrdersPerDay <- sales_transactions$TransactionCount / sales_transactions$AgeAtTrans
sales_transactions$TransFreq    <- sales_transactions$AgeAtTrans / sales_transactions$TransactionCount

#monetary avg order size
sales_transactions$AvgOrderSize <- sales_transactions$PurchasedQtyCumSum / sales_transactions$TransactionCount
sales_transactions$AvgOrderAmt  <- sales_transactions$PurchasedAmtCumSum / sales_transactions$TransactionCount

#LTV metrics 
#can parameterize using quantile percentages
sales_transactions$AvgAmtPerDay <- sales_transactions$AvgOrderAmt * sales_transactions$OrdersPerDay
sales_transactions$LowerLTV     <- as.numeric(quantile(sales_transactions$CustomerAge, 0.25)) * 
                                                       sales_transactions$AvgAmtPerDay

sales_transactions$MidQLTV     <- as.numeric(quantile(sales_transactions$CustomerAge, 0.50)) * 
                                                      sales_transactions$AvgAmtPerDay

sales_transactions$UpperQLTV     <- as.numeric(quantile(sales_transactions$CustomerAge, 0.75)) * 
                                                       sales_transactions$AvgAmtPerDay

#scoring mechanism
Rscore <- function(sales_transactions) {
#browser()
  score <- vector()
  
  DaysSinceLastTransaction <- as.numeric(sales_transactions$DaysSinceLastTransaction)
  highest   <- as.numeric(quantile(DaysSinceLastTransaction, 0.50))
  high      <- as.numeric(quantile(DaysSinceLastTransaction, 0.60))
  benchmark <- as.numeric(quantile(DaysSinceLastTransaction, 0.70))
  low       <- as.numeric(quantile(DaysSinceLastTransaction, 0.80))
  lowest    <- as.numeric(quantile(DaysSinceLastTransaction, 0.90))
  
  for (i in 1:length(DaysSinceLastTransaction)) {
    if(DaysSinceLastTransaction[i] < 0 ||
       DaysSinceLastTransaction[i] >= 0 & 
       DaysSinceLastTransaction[i] <= highest) {
      score[i] <- 5
    } else if(DaysSinceLastTransaction[i] > highest & 
       DaysSinceLastTransaction[i] <= high) {
      score[i] <- 4
    } else if(DaysSinceLastTransaction[i] > high & 
       DaysSinceLastTransaction[i] <= benchmark) {
      score[i] <- 3
    } else if(DaysSinceLastTransaction[i] > benchmark & 
       DaysSinceLastTransaction[i] <= low) {
      score[i] <- 2
    } else if(DaysSinceLastTransaction[i] > low & 
       DaysSinceLastTransaction[i] <= lowest) {
      score[i] <- 1
    } else {
      score[i] <- 0
    }
  }
  sales_transactions$RScore <- score
  
  return(sales_transactions)
}
  sales_transactions <- Rscore(sales_transactions)

#scoring mechanism
Fscore <- function(sales_transactions) {
  #browser()
  score <- vector()
  
  TransFreq <- as.numeric(sales_transactions$TransFreq)
  highest   <- as.numeric(quantile(TransFreq, 0.20))
  high      <- as.numeric(quantile(TransFreq, 0.40))
  benchmark <- as.numeric(quantile(TransFreq, 0.60))
  low       <- as.numeric(quantile(TransFreq, 0.80))
  lowest    <- as.numeric(quantile(TransFreq, 0.95))
  
  for (i in 1:length(TransFreq)) {
    if(TransFreq[i] < highest) {
      score[i] <- 5
    } else if(TransFreq[i] < high & 
              TransFreq[i] >= highest) {
      score[i] <- 4
    } else if(TransFreq[i] < benchmark & 
              TransFreq[i] >= high) {
      score[i] <- 3
    } else if(TransFreq[i] < low & 
              TransFreq[i] >= benchmark) {
      score[i] <- 2
    } else if(TransFreq[i] < lowest & 
              TransFreq[i] >= low) {
      score[i] <- 1
    } else {
      score[i] <- 0
    }
  }
  sales_transactions$FScore <- score
  
  return(sales_transactions)
}
  sales_transactions <- Fscore(sales_transactions)

Mscore <- function(sales_transactions) {
  #browser()
  score <- vector()
  
  AvgOrderSize <- as.numeric(sales_transactions$AvgOrderSize)
  highest   <- as.numeric(quantile(AvgOrderSize, 0.90))
  high      <- as.numeric(quantile(AvgOrderSize, 0.80))
  benchmark <- as.numeric(quantile(AvgOrderSize, 0.70))
  low       <- as.numeric(quantile(AvgOrderSize, 0.60))
  lowest    <- as.numeric(quantile(AvgOrderSize, 0.50))
  
  for (i in 1:length(AvgOrderSize)) {
    if(AvgOrderSize[i] > highest) {
      score[i] <- 5
    } else if(AvgOrderSize[i] > high & 
              AvgOrderSize[i] <= highest) {
      score[i] <- 4
    } else if(AvgOrderSize[i] > benchmark & 
              AvgOrderSize[i] <= high) {
      score[i] <- 3
    } else if(AvgOrderSize[i] > low & 
              AvgOrderSize[i] <= benchmark) {
      score[i] <- 2
    } else if(AvgOrderSize[i] > lowest & 
              AvgOrderSize[i] <= low) {
      score[i] <- 1
    } else {
      score[i] <- 0
    }
  }
  sales_transactions$MScore <- score
  
  return(sales_transactions)
}
  sales_transactions <- Mscore(sales_transactions)
  
AssignCohort <- function(sales_transactions) {
  #browser()
  
  flag <- vector()
  
  Rscore <- sales_transactions$RScore
  Fscore <- sales_transactions$FScore
  Mscore <- sales_transactions$MScore
  
  DaysSinceLastTransaction <- sales_transactions$DaysSinceLastTransaction
  TransFreq                <- sales_transactions$TransFreq   
  
  for (i in 1:length(Rscore)) { 
    
    if (Rscore[i] <= 2 &
        Fscore[i] >= 3 &
        Mscore[i] >= 3 ) {
      
         flag[i] <- 'At Risk'
         
    } else if 
       (Rscore[i] >= 4 &
        Fscore[i] >= 4 &
        Mscore[i] >= 4) {
      
         flag[i] <- 'VIPs'
         
    } else if 
      (Rscore[i] <= 5 &
       Fscore[i] <= 5 &
       Fscore[i] >  2 &
       Mscore[i] <= 2 ) {
        
         flag[i] <- 'UpSell'
        
    } else if 
      (Rscore[i] <= 2 &
       Fscore[i] <= 2 &
       Mscore[i] <= 2 ) {
        
         flag[i] <- 'Lost'
         
    } else if 
      (Rscore[i] >= 3 &
       Fscore[i] <= 2 &
       Mscore[i] >= 3 ) {
        
        flag[i] <- 'Seasonal'
    } else if 
      ((Rscore[i] + Fscore[i] + Mscore[i]) >= 12) {
      
      flag[i] <- 'Healthy'
      
    } else if 
      ((Rscore[i] + Fscore[i] + Mscore[i]) < 9) {
      
      flag[i] <- 'Un-Healthy'
      
    } else if 
      ((Rscore[i] + Fscore[i] + Mscore[i]) >= 9 ||
       (Rscore[i] + Fscore[i] + Mscore[i]) < 12) {
      
      flag[i] <- 'Un-Assigned'
      
    }
  }
  return(flag)
}
  sales_transactions$Cohort <- AssignCohort(sales_transactions)
 
last_recorded_sales <- sales_transactions %>% 
  group_by(CustomerDimKey) %>% 
  filter(DateDimKey == max(DateDimKey))

AtRisk      <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "At Risk", ])) 
Healthy     <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Healthy", ])) 
UnHealthy   <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Un-Healthy", ])) 
Unassigned  <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Un-Assigned", ]))
Lost        <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Lost", ]))
Seasonal    <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Seasonal", ]))
UpSell      <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "UpSell", ]))
VIPs        <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "VIPs", ]))

CohortCnts      <- c(AtRisk, Healthy, UnHealthy, Unassigned, Lost, Seasonal, UpSell, VIPs)
CohortCntLabels <- data.frame(Cohorts=c('AtRisk','Healthy','UnHealthy','Unassigned', 'Lost', 'Seasonal', 'UpSell', 'VIPs')) 
CohortCntLabels <- cbind(CohortCntLabels, CohortCnts)