Search

Showing posts with label Calculate Percentile. Show all posts
Showing posts with label Calculate Percentile. Show all posts

Friday, May 27, 2011

Calculate Percentile


Calculating Percentiles with SQL Server CTE
Here percentiles calculations are based on this fictitious table of employee salaries:

CREATE TABLE Employee
(EmployeeID INT IDENTITY(1,1) PRIMARY KEY,
 GroupName VARCHAR(30),
 Emp_Name VARCHAR(80),
 Salary FLOAT)

INSERT INTO Employee(GroupName, Emp_Name, Salary) values('PRODUCTION', 'P1', 12000)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('PRODUCTION', 'P2', 13500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('PRODUCTION', 'P3', 13500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('PRODUCTION', 'P4', 11500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('SALES', 'S1', 12500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('SALES', 'S2', 15500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('SALES', 'S3', 14500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('SALES', 'S4', 12500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('SALES', 'S5', 11000)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('SALES', 'S6', 14000)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('MARKETING', 'M1', 11500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('MARKETING', 'M2', 11000)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('MARKETING', 'M3', 16500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('MARKETING', 'M4', 15500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('MARKETING', 'M5', 12250)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('MARKETING', 'M6', 11500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('ACCOUNTING', 'A1', 10500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('ACCOUNTING', 'A2', 13500)
INSERT INTO Employee(GroupName, Emp_Name, Salary) values('ACCOUNTING', 'A3', 15000)
GO

Here is the code to calculate the 75th percentile of employee salaries using the above table data:

DECLARE @Percentile FLOAT
SELECT @Percentile = .95;

WITH Emp_Sal(Salary, Prev_Rank, Curr_Rank, Next_Rank) AS
(
   SELECT Salary, 
         (ROW_NUMBER() OVER ( ORDER BY GroupName, Salary ) - 2.0) / ((SELECT COUNT(*) FROM Employee) - 1)  [Prev_Rank],
         (ROW_NUMBER() OVER ( ORDER BY GroupName, Salary ) - 1.0) / ((SELECT COUNT(*) FROM Employee) - 1)  [Curr_Rank],
         (ROW_NUMBER() OVER ( ORDER BY GroupName, Salary ) + 0.0) / ((SELECT COUNT(*) FROM Employee) - 1)  [Next_Rank]
   FROM Employee
)
SELECT 
   CASE 
   WHEN T1.Salary = T2.Salary THEN T1.Salary
   ELSE T1.Salary + (T2.Salary - T1.Salary) * ((@Percentile - T1.Curr_Rank) /  (T2.Curr_Rank - T1.Curr_Rank)) 
   END
FROM Emp_Sal T1, Emp_Sal T2
WHERE (T1.Curr_Rank = @Percentile OR (T1.Curr_Rank < @Percentile AND T1.Next_Rank > @Percentile))
  AND (T2.Curr_Rank = @Percentile OR (T2.Curr_Rank > @Percentile AND T2.Prev_Rank < @Percentile))

This query uses SQL Server CTE along with the new ROW_NUMBER() function. The Emp_Sal CTE at the top produces a table that ranks each row. The Prev_Rank and Next_Rank columns help locate a row for interpolation when the desired percentile does not fall onto one of the rows. 


Calculating Percentiles with Grouping


The query in the previous section evaluates only the overall percentile. If you want to calculate the 95th percentile for each Group than use the below query: It contains two CTEs:

DECLARE @Percentile FLOAT
SELECT @Percentile = .95;

WITH Group_Row_Count(GroupName, Row_Count) AS
(
   SELECT GroupName, COUNT(*)
   FROM Employee
   GROUP BY GroupName
   HAVING COUNT(*) > 1
),
Emp_Sal(GroupName, Salary, Prev_Rank, Curr_Rank, Next_Rank) AS
(
   SELECT E.GroupName, E.Salary, 
         (ROW_NUMBER() OVER ( PARTITION BY E.GroupName ORDER BY E.Salary) - 2.0) / (C.Row_Count - 1), 
         (ROW_NUMBER() OVER ( PARTITION BY E.GroupName ORDER BY E.Salary) - 1.0) / (C.Row_Count - 1), 
         (ROW_NUMBER() OVER ( PARTITION BY E.GroupName ORDER BY E.Salary) + 0.0) / (C.Row_Count - 1)
   FROM Employee E 
   JOIN Group_Row_Count C ON C.GroupName = E.GroupName
)
SELECT T1.GroupName, 
   CASE 
   WHEN T1.Salary = T2.Salary THEN T1.Salary
   ELSE T1.Salary + (T2.Salary - T1.Salary) * ((@Percentile - T1.Curr_Rank) / (T2.Curr_Rank - T1.Curr_Rank)) 
   END
FROM Emp_Sal T1 
JOIN Emp_Sal T2 ON T1.GroupName = T2.GroupName
WHERE (T1.Curr_Rank = @Percentile OR (T1.Curr_Rank < @Percentile AND T1.Next_Rank > @Percentile))
  AND (T2.Curr_Rank = @Percentile OR (T2.Curr_Rank > @Percentile AND T2.Prev_Rank < @Percentile))
Go

Result: