Queer European MD passionate about IT

Mission374Solutions.Rmd 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. ---
  2. title: "Answering Business Questions using SQL (Intermediate SQL in R): Guided Project Solutions"
  3. output: html_document
  4. ---
  5. # Creating Helper Functions
  6. ```{r}
  7. library(RSQLite)
  8. library(DBI)
  9. db <- 'chinook.db'
  10. run_query <- function(q) {
  11. conn <- dbConnect(SQLite(), db)
  12. result <- dbGetQuery(conn, q)
  13. dbDisconnect(conn)
  14. return(result)
  15. }
  16. show_tables <- function() {
  17. q = "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'view')"
  18. return(run_query(q))
  19. }
  20. show_tables()
  21. ```
  22. # Selecting New Albums to Purchase
  23. ```{r}
  24. albums_to_purchase = '
  25. WITH usa_tracks_sold AS
  26. (
  27. SELECT il.* FROM invoice_line il
  28. INNER JOIN invoice i on il.invoice_id = i.invoice_id
  29. INNER JOIN customer c on i.customer_id = c.customer_id
  30. WHERE c.country = "USA"
  31. )
  32. SELECT
  33. g.name genre,
  34. count(uts.invoice_line_id) tracks_sold,
  35. cast(count(uts.invoice_line_id) AS FLOAT) / (
  36. SELECT COUNT(*) from usa_tracks_sold
  37. ) percentage_sold
  38. FROM usa_tracks_sold uts
  39. INNER JOIN track t on t.track_id = uts.track_id
  40. INNER JOIN genre g on g.genre_id = t.genre_id
  41. GROUP BY 1
  42. ORDER BY 2 DESC
  43. LIMIT 10;
  44. '
  45. run_query(albums_to_purchase)
  46. ```
  47. ```{r}
  48. library(ggplot2)
  49. genre_sales = run_query(albums_to_purchase)
  50. ggplot(data = genre_sales, aes(x = reorder(genre, -percentage_sold),
  51. y = percentage_sold)) +
  52. geom_bar(stat = "identity")
  53. ```
  54. Among the genres represented in our list of 4 albums, punk, blues and pop are the highest rated. Therefore, we should recommend:
  55. - Red Tone (Punk)
  56. - Slim Jim Bites (Blues)
  57. - Meteor and the Girls (Pop)
  58. By far though, rock makes up the majority of the sales. To better capture sales in the USA, we might want to ask the record label if they have any up-and-coming rock bands.
  59. # Analyzing Employee Sales Performance
  60. ```{r}
  61. employee_sales_performance = '
  62. WITH customer_support_rep_sales AS
  63. (
  64. SELECT
  65. i.customer_id,
  66. c.support_rep_id,
  67. SUM(i.total) total
  68. FROM invoice i
  69. INNER JOIN customer c ON i.customer_id = c.customer_id
  70. GROUP BY 1,2
  71. )
  72. SELECT
  73. e.first_name || " " || e.last_name employee,
  74. e.hire_date,
  75. SUM(csrs.total) total_sales
  76. FROM customer_support_rep_sales csrs
  77. INNER JOIN employee e ON e.employee_id = csrs.support_rep_id
  78. GROUP BY 1;
  79. '
  80. run_query(employee_sales_performance)
  81. ```
  82. ```{r}
  83. employee_sales = run_query(employee_sales_performance)
  84. ggplot(data = employee_sales, aes(x = reorder(employee, -total_sales),
  85. y = total_sales)) +
  86. geom_bar(stat = "identity")
  87. ```
  88. Jane Peacock has the highest amount of sales, but she also has been at the company the longest. If we really want to hone in on employee efficiency, we might want to standardize sales by the number of days or hours worked.
  89. # Visualizing Sales by Country
  90. ```{r}
  91. sales_by_country = '
  92. WITH country_or_other AS
  93. (
  94. SELECT
  95. CASE
  96. WHEN (
  97. SELECT count(*)
  98. FROM customer
  99. where country = c.country
  100. ) = 1 THEN "Other"
  101. ELSE c.country
  102. END AS country,
  103. c.customer_id,
  104. il.*
  105. FROM invoice_line il
  106. INNER JOIN invoice i ON i.invoice_id = il.invoice_id
  107. INNER JOIN customer c ON c.customer_id = i.customer_id
  108. )
  109. SELECT
  110. country,
  111. customers,
  112. total_sales,
  113. average_order,
  114. customer_lifetime_value
  115. FROM
  116. (
  117. SELECT
  118. country,
  119. count(distinct customer_id) customers,
  120. SUM(unit_price) total_sales,
  121. SUM(unit_price) / count(distinct customer_id) customer_lifetime_value,
  122. SUM(unit_price) / count(distinct invoice_id) average_order,
  123. CASE
  124. WHEN country = "Other" THEN 1
  125. ELSE 0
  126. END AS sort
  127. FROM country_or_other
  128. GROUP BY country
  129. ORDER BY sort ASC, total_sales DESC
  130. );
  131. '
  132. run_query(sales_by_country)
  133. ```
  134. # Visualizing Sales by Country
  135. ```{r}
  136. country_metrics = run_query(sales_by_country)
  137. ggplot(data = country_metrics, aes(x = reorder(country, -total_sales),
  138. y = total_sales,
  139. fill = country)) +
  140. geom_bar(stat = "identity") +
  141. labs(
  142. title = "Total sales by country",
  143. x = "Country",
  144. y = "Total Sales"
  145. ) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
  146. ggplot(data = country_metrics, aes(x = reorder(country, -customers),
  147. y = customers,
  148. fill = country)) +
  149. geom_bar(stat = "identity") +
  150. coord_polar("y") +
  151. labs(
  152. title = "Number of customers by country",
  153. x = "Country",
  154. y = "Customers"
  155. )
  156. ggplot(data = country_metrics, aes(x = reorder(country, -customer_lifetime_value),
  157. y = customer_lifetime_value,
  158. color = country)) +
  159. geom_point(stat = "identity") +
  160. labs(
  161. title = "Customer lifetime value by country",
  162. x = "Country",
  163. y = "Customer Lifetime Value"
  164. ) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
  165. ```
  166. # Albums vs Individual Tracks
  167. ```{r}
  168. albums_vs_tracks = '
  169. WITH invoice_first_track AS
  170. (
  171. SELECT
  172. il.invoice_id invoice_id,
  173. MIN(il.track_id) first_track_id
  174. FROM invoice_line il
  175. GROUP BY 1
  176. )
  177. SELECT
  178. album_purchase,
  179. COUNT(invoice_id) number_of_invoices,
  180. CAST(count(invoice_id) AS FLOAT) / (
  181. SELECT COUNT(*) FROM invoice
  182. ) percent
  183. FROM
  184. (
  185. SELECT
  186. ifs.*,
  187. CASE
  188. WHEN
  189. (
  190. SELECT t.track_id FROM track t
  191. WHERE t.album_id = (
  192. SELECT t2.album_id FROM track t2
  193. WHERE t2.track_id = ifs.first_track_id
  194. )
  195. EXCEPT
  196. SELECT il2.track_id FROM invoice_line il2
  197. WHERE il2.invoice_id = ifs.invoice_id
  198. ) IS NULL
  199. AND
  200. (
  201. SELECT il2.track_id FROM invoice_line il2
  202. WHERE il2.invoice_id = ifs.invoice_id
  203. EXCEPT
  204. SELECT t.track_id FROM track t
  205. WHERE t.album_id = (
  206. SELECT t2.album_id FROM track t2
  207. WHERE t2.track_id = ifs.first_track_id
  208. )
  209. ) IS NULL
  210. THEN "yes"
  211. ELSE "no"
  212. END AS "album_purchase"
  213. FROM invoice_first_track ifs
  214. )
  215. GROUP BY album_purchase;
  216. '
  217. run_query(albums_vs_tracks)
  218. ```
  219. Album purchases account for almost a quarter of the total sales, so it is inadvisable to change strategy to just purchase the most popular tracks.