predictive-fleet-maintenance/step2_build_predictive_models.Rmd at main · sourabhgithubcode/predictive-fleet-maintenance · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Capstone"
output:
  word_document: default
  html_document:
    knit_root_dir: null
  pdf_document: default
editor_options:
  markdown:
    wrap: 72
---

# Import data and summary statistics

```{r}
# Load the Excel reader used for the three source worksheets
library(readxl)

# Location of the source workbook. The original hard-coded path remains the
# default; set the FLEET_WORKBOOK_PATH environment variable to point at the
# workbook on another machine without editing this file.
file_path <- Sys.getenv(
  "FLEET_WORKBOOK_PATH",
  unset = "/Users/HP/Downloads/Capstone/DePaul_Summer_Quarter_meter.xlsx"
)

# Fail early with an actionable message if the workbook is not there
if (!file.exists(file_path)) {
  stop(
    "Workbook not found at '", file_path,
    "'. Set FLEET_WORKBOOK_PATH or update file_path.",
    call. = FALSE
  )
}

# Read each worksheet into its own data frame
asset_details <- read_excel(file_path, sheet = "Asset Detail")
wo_costs      <- read_excel(file_path, sheet = "2024 WO Costs")
usage_data    <- read_excel(file_path, sheet = "Miles (Driven Last Year)")

# Optional: print the structure (column names and types) of each table
str(asset_details)
str(wo_costs)
str(usage_data)

```

```{r}
# Load required libraries
library(dplyr)   # for data manipulation (summaries, etc.)
library(skimr)   # for comprehensive summary statistics
library(ggplot2) # for data visualization

# Print a compact overview (column names, types, leading values) of the
# Asset Details, WO Costs and Usage tables, in that order, so numeric and
# categorical columns can be told apart.
invisible(lapply(list(asset_details, wo_costs, usage_data), glimpse))
```

```{r}

# (Optional) Turn every character column of the three tables into a factor.
# NOTE(review): this applies to *all* character columns, including any IDs,
# dates or numeric-looking text that were read as character - confirm that
# is intended before relying on those columns downstream.
library(dplyr)

# Return `tbl` with each character column replaced by a factor
chr_to_factor <- function(tbl) {
  mutate(tbl, across(where(is.character), as.factor))
}

asset_details <- chr_to_factor(asset_details)
wo_costs      <- chr_to_factor(wo_costs)
usage_data    <- chr_to_factor(usage_data)

```


# Summarize Numeric Fields

```{r}
# Mean, median, standard deviation, minimum and maximum of every numeric
# column of `tbl`, ignoring missing values. The result is a one-row table with
# columns named <column>_<statistic>.
summarise_numeric <- function(tbl) {
  stat_fns <- list(
    mean   = function(v) mean(v, na.rm = TRUE),
    median = function(v) median(v, na.rm = TRUE),
    sd     = function(v) sd(v, na.rm = TRUE),
    min    = function(v) min(v, na.rm = TRUE),
    max    = function(v) max(v, na.rm = TRUE)
  )
  summarise(tbl, across(where(is.numeric), stat_fns))
}

# Asset Details (e.g., YEAR, METER)
summarise_numeric(asset_details)

# WO Costs (e.g., LINE TOTAL, EXTERNAL LABOR, PART AMOUNT)
summarise_numeric(wo_costs)

# Usage data (e.g., MILES DRIVEN, ANNUALIZED MILEAGE, UTILIZED DAYS PER MONTH)
summarise_numeric(usage_data)
```


# Summarize Categorical Fields

```{r}
library(dplyr)
library(purrr)

# Print a frequency table for every factor column of a data frame.
#
# df      : data frame whose factor columns are tabulated.
# df_name : label shown in the banner printed before the tables.
#
# For each factor column the column name is printed, followed by the counts
# of its values sorted from most to least frequent. The function is called
# for its printed output; it invisibly returns the factor column names.
summarize_factors <- function(df, df_name) {
  is_fct <- map_lgl(df, is.factor)
  fct_names <- names(df)[is_fct]

  cat("\n---", df_name, "categorical summaries ---\n")
  for (col in fct_names) {
    cat("\n", col, ":\n", sep = "")
    print(count(df, !!sym(col), sort = TRUE))
  }
  invisible(fct_names)
}

# Character columns are expected to have been converted to factors already
summarize_factors(asset_details, "Asset Details")
summarize_factors(wo_costs,       "WO Costs")
summarize_factors(usage_data,     "Usage Data")


```

# Missing Values per Column

```{r}
# Number of missing (NA) values in each column of `tbl`, returned as a one-row
# table with the same column names; 0 means the column has no missing data.
count_missing <- function(tbl) {
  summarise(tbl, across(everything(), function(col) sum(is.na(col))))
}

# Asset Details
count_missing(asset_details)

# WO Costs
count_missing(wo_costs)

# Usage data
count_missing(usage_data)

```

```{r}
# Use skimr to get an all-in-one descriptive summary of each dataset.
# Each call stays at the top level of the chunk so that its result is
# printed automatically when the chunk runs.
skim(asset_details)
skim(wo_costs)
skim(usage_data)

```


```{r}

library(dplyr)
library(janitor)

# Columns the analysis expects in asset_details once names are cleaned
asset_required_cols <- c(
  "unit_no", "unit_status", "category", "category_class_desc", "year",
  "make", "model", "asset_in_service_date", "asset_disposal_date",
  "meter", "company"
)

# Clean column names (lowercase, underscores), then apply the three real
# renames. any_of() skips a source column that is already renamed, so the
# chunk can be re-run without error.
asset_details <- asset_details %>%
  clean_names() %>%
  rename(any_of(c(
    unit_no               = "unit_number",
    asset_in_service_date = "in_service_date",
    asset_disposal_date   = "disposal_date"
  )))

# Explicit schema check (replaces the former `x = x` rename pairs, whose only
# effect was to error when a column was absent)
asset_missing_cols <- setdiff(asset_required_cols, names(asset_details))
if (length(asset_missing_cols) > 0) {
  stop(
    "asset_details is missing expected column(s): ",
    paste(asset_missing_cols, collapse = ", "),
    call. = FALSE
  )
}


```

```{r}
library(dplyr)
library(janitor)

# Columns the analysis expects in wo_costs once names are cleaned
wo_required_cols <- c(
  "fiscal_period", "unit_no", "category_class", "cat_class_desc", "wo_nbr",
  "wo_reason_desc", "open_date", "wo_completed_date", "wo_status", "job",
  "job_description", "job_reason_description", "external_labor_cost",
  "labor_amount", "labor_tax_rate", "labor_tax", "part_amount",
  "parts_tax_rate", "parts_tax", "line_total", "maint_loc", "damage",
  "company"
)

# Convert names to snake_case. No renaming is needed afterwards: every pair in
# the former rename() call mapped a column to itself.
wo_costs <- wo_costs %>%
  clean_names()

# Explicit schema check (the former self-renames only served to error when a
# column was absent)
wo_missing_cols <- setdiff(wo_required_cols, names(wo_costs))
if (length(wo_missing_cols) > 0) {
  stop(
    "wo_costs is missing expected column(s): ",
    paste(wo_missing_cols, collapse = ", "),
    call. = FALSE
  )
}


```

```{r}
library(dplyr)
library(janitor)

# Columns the analysis expects in usage_data once names are cleaned
usage_required_cols <- c(
  "unit_no", "company", "asset_type", "model_year", "miles_driven",
  "months_with_mileage", "annualized_mileage", "utilized_days_per_month"
)

# Make all column names lowercase with underscores, then apply the single
# real rename. any_of() skips it when the column was already renamed, so the
# chunk can be re-run without error.
usage_data <- usage_data %>%
  clean_names() %>%
  rename(any_of(c(asset_type = "asset_type_asset_type")))

# Explicit schema check (replaces the former `x = x` rename pairs, whose only
# effect was to error when a column was absent)
usage_missing_cols <- setdiff(usage_required_cols, names(usage_data))
if (length(usage_missing_cols) > 0) {
  stop(
    "usage_data is missing expected column(s): ",
    paste(usage_missing_cols, collapse = ", "),
    call. = FALSE
  )
}
```


# Visualize Key Distributions

```{r}
# Exploratory plots: histograms for two numeric fields and bar charts for two
# categorical fields.

# 1. Miles driven (usage data). Bins are 1000 wide; adjust binwidth if the
#    picture is too coarse or too fine.
usage_data %>%
  ggplot(aes(x = miles_driven)) +
  geom_histogram(binwidth = 1000, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Miles Driven", x = "Miles Driven", y = "Count")

# 2. Meter readings (asset details). No binwidth is given, so ggplot2 falls
#    back to its default binning.
asset_details %>%
  ggplot(aes(x = meter)) +
  geom_histogram(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Asset Meter Readings", x = "Meter Reading", y = "Count")

# 3. Damage flag (WO costs): one bar per value of the `damage` column, bar
#    height = number of rows with that value.
wo_costs %>%
  ggplot(aes(x = damage)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Frequency of Damage Flag Status", x = "Damage", y = "Count")

# 4. The five companies with the most rows in WO costs.
top5_companies <- head(count(wo_costs, company, sort = TRUE), 5)

# reorder(company, -n) orders the companies by descending count; coord_flip()
# swaps the axes so the company names sit on the vertical axis and stay
# readable.
top5_companies %>%
  ggplot(aes(x = reorder(company, -n), y = n)) +
  geom_col(fill = "orange") +
  labs(title = "Top 5 Companies by Work Order Count", x = "Company", y = "Number of Work Orders") +
  coord_flip()

```

# Other useful visualizations

# Distribution of Total Costs

```{r}
library(dplyr)
library(ggplot2)

# ---- Inspect the raw column ----
summary(wo_costs$line_total)
str(wo_costs$line_total)

# Parse line_total through as.character(): if the column is a factor (as it
# is when it was read as text and converted earlier), as.numeric() alone would
# return the integer level codes rather than the values.
# Warnings are suppressed only for this one parse; unparseable values are
# reported explicitly just below.
line_total_num <- suppressWarnings(
  as.numeric(as.character(wo_costs$line_total))
)

# Distinct non-missing raw values that cannot be read as a number
# (empty when the column is clean)
unique(wo_costs$line_total[!is.na(wo_costs$line_total) & is.na(line_total_num)])

# ---- Convert (conversion only, no filtering) ----
wo_costs <- wo_costs %>%
  mutate(line_total = line_total_num)

# ---- Analysis subset ----
# Keep rows with a non-missing, strictly positive cost; outliers are kept.
# Zero and negative totals are dropped as well, which keeps the log10-scaled
# boxplot below defined for every row.
wo_costs_clean <- wo_costs %>%
  filter(!is.na(line_total), line_total > 0)

# Full distribution including outliers
ggplot(wo_costs_clean, aes(x = line_total)) +
  geom_histogram(binwidth = 500, fill = "steelblue", color = "blue") +
  labs(
    title = "Distribution of Work Order Total Costs (Including Outliers)",
    x = "Line Total ($)", y = "Frequency"
  )

# Same distribution on a log(1 + x) scale
ggplot(wo_costs_clean, aes(x = log1p(line_total))) +
  geom_histogram(binwidth = 0.1, fill = "steelblue", color = "white") +
  labs(title = "Log-Scaled Distribution of Work Order Costs", x = "Log(Line Total $)", y = "Frequency")

# Boxplot on a log10 axis
ggplot(wo_costs_clean, aes(y = line_total)) +
  geom_boxplot(fill = "skyblue") +
  scale_y_log10() +
  labs(title = "Boxplot of Work Order Total Costs (Log Scale)", y = "Log(Line Total $)")

# ---- Highlight the most expensive 1% ----
# 99th-percentile cost among the positive, non-missing totals
cost_threshold <- quantile(wo_costs_clean$line_total, 0.99, na.rm = TRUE)

# Tag each row as 'Top 1%' (at or above the threshold) or 'Normal'
wo_costs_clean <- wo_costs_clean %>%
  mutate(cost_category = if_else(line_total >= cost_threshold, "Top 1%", "Normal"))

ggplot(wo_costs_clean, aes(x = line_total, fill = cost_category)) +
  geom_histogram(binwidth = 500, color = "orange", alpha = 0.8, position = "identity") +
  scale_fill_manual(values = c("Normal" = "steelblue", "Top 1%" = "red")) +
  labs(title = "Distribution of WO Costs (Top 1% Highlighted)",
       x = "Line Total ($)", y = "Count", fill = "WO Cost Category") +
  theme_minimal()

# ---- Impute missing totals in the full table ----
# Option A: median (robust to outliers).
# NOTE: this overwrites wo_costs$line_total, so every later summary of
# wo_costs includes the imputed values.
median_val <- median(wo_costs$line_total, na.rm = TRUE)
wo_costs <- wo_costs %>%
  mutate(line_total = coalesce(line_total, median_val))

# Option B (alternative): impute with the mean
# mean_val <- mean(wo_costs$line_total, na.rm = TRUE)
# wo_costs <- wo_costs %>%
#   mutate(line_total = coalesce(line_total, mean_val))

# Full distribution after imputation, outliers included
ggplot(wo_costs, aes(x = line_total)) +
  geom_histogram(binwidth = 500, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Work Order Total Costs (With Imputed NAs)",
       x = "Line Total ($)", y = "Frequency")

```

R Code to Find Asset with Most WOs

```{r}
library(dplyr)

# Asset (unit_no) with the most rows in wo_costs_clean.
# NOTE(review): every row is counted; if one work order can span several rows
# this is a line count rather than a count of distinct wo_nbr - confirm.
top_asset <- wo_costs_clean %>%
  count(unit_no, name = "work_order_count") %>%
  arrange(desc(work_order_count)) %>%
  slice_head(n = 1)

print(top_asset)

# All rows belonging to that asset
top_unit <- top_asset$unit_no
wo_top_asset <- filter(wo_costs_clean, unit_no == top_unit)


```


Most work orders cost less than $5,000.

A small number of WOs are extremely costly (outliers), but they're retained in this view for completeness.

The extreme skew makes it difficult to visually compare the bulk of the data.


# Top Repair Job Types

```{r}
# Ten most frequent job descriptions, drawn as horizontal bars ordered by
# count (largest at the top after the axis flip).
top_jobs <- head(count(wo_costs, job_description, sort = TRUE), 10)

ggplot(top_jobs, aes(x = reorder(job_description, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  labs(title = "Top 10 Job Descriptions", x = "Job Description", y = "Count")

```

# Cost Comparison: Planned vs Unplanned Repairs

```{r}
# Boxplot of line_total for each work-order reason. The cost axis is log10
# because costs span several orders of magnitude.
wo_costs %>%
  ggplot(aes(x = wo_reason_desc, y = line_total)) +
  geom_boxplot(fill = "skyblue") +
  scale_y_log10() +
  labs(title = "Repair Cost by Work Order Type", x = "WO Reason", y = "Line Total (log scale)")

```

Insights:

PLANNED maintenance (i.e., preventive work) has the lowest median cost
and relatively tight interquartile range → indicates more controlled and
predictable expenses.

COLLISION/ORIGAMI and UNPLANNED repairs show higher medians and wider
spread, with more extreme outliers → these are less predictable and
often more expensive.

REMAN CENTER work appears consistently high in cost → may involve heavy
refurbishing.

ROADCALL repairs are similar to UNPLANNED, but with slightly tighter
variation.

The log scale highlights that most costs fall between \$10–\$1,000 but
vary by type.

Implication: Focusing more on planned maintenance could help reduce
costs and variability in repairs.

# Mileage vs Repair Costs

```{r}
# Attach asset attributes to every work-order row by matching on unit_no
# (rows without a match in the other table are dropped by merge()).
merged_data <- merge(asset_details, wo_costs, by = "unit_no")

# Scatter of meter reading against repair cost, cost on a log10 axis
merged_data %>%
  ggplot(aes(x = meter, y = line_total)) +
  geom_point(alpha = 0.3, color = "darkgreen") +
  scale_y_log10() +
  labs(title = "Meter Reading vs Repair Cost", x = "Meter", y = "Line Total ($)")

```

What it shows: This scatter plot shows the relationship between vehicle
usage (meter) and repair cost (line_total).

Insights:

No strong linear trend — higher meter readings don’t consistently result
in higher costs.

A cluster of points appears at very low meter readings (likely new
assets or early failures), yet some of them have high costs → may
indicate early defects or accidents.

Meter readings at regular intervals (2.5M, 5M, etc.) could be system
artifacts or specific maintenance milestones.

The wide spread of costs at all meter levels implies that cost is
influenced by factors beyond mileage (e.g., type of repair, age, company
practices).

# Monthly Work Order Trend

```{r}
library(dplyr)
library(lubridate)
library(ggplot2)

# Number of wo_costs rows per calendar month of the open date.
# open_date is parsed as month/day/year hour:minute (e.g. "01/18/24 04:19")
# and truncated to the first day of its month before counting.
monthly_wo_counts <- count(
  wo_costs,
  month = floor_date(mdy_hm(open_date), "month")
)

ggplot(monthly_wo_counts, aes(x = month, y = n)) +
  geom_col(fill = "darkorange") +
  labs(
    title = "Monthly Work Orders Trend",
    x     = "Month",
    y     = "Work Order Count"
  )

```

Key Observations: Data spikes sharply from January 2024:

Almost no activity in 2023, which may suggest:

The data collection started in Jan 2024, or

Older records were excluded or not relevant for this analysis.

Consistently high volume across 2024:

The monthly work order volume ranges around 33,000–38,000 WOs/month,
indicating stable operational activity.

Slight seasonal fluctuations, but overall volume stays strong and
consistent.

Drop-off after November 2024:

December 2024 to February 2025 shows a sharp decline in work order
counts.

Possible reasons:

Partial or incomplete data for those months.

Seasonal slowdown (e.g., holidays, reduced fleet usage).

Data not fully recorded/uploaded yet.

Implications: 2024 is the main year for analysis and modeling due to
full activity.

Sudden drop in 2025 may require validation—is it missing data or
operational change?

Useful for planning time series forecasting, resource allocation, or
identifying seasonal repair patterns.

```{r}
library(naniar)

# Missing-value map of the full work-order table; the large-data warning of
# vis_miss() is switched off explicitly.
wo_costs %>%
  vis_miss(warn_large_data = FALSE) +
  labs(title = "Missing Data in Full Work Order Table")


```

# Company/Location Cost Comparison

```{r}
# Per-company cost profile: mean and median line_total (missing values
# ignored) and number of rows, listed from highest to lowest mean cost.
company_costs <- summarise(
  group_by(wo_costs, company),
  avg_cost    = mean(line_total, na.rm = TRUE),
  median_cost = median(line_total, na.rm = TRUE),
  total_wo    = n()
)

arrange(company_costs, desc(avg_cost))

```

# Labor vs Part Cost Comparison

```{r}
library(ggplot2)

# Labor amount against part amount for every wo_costs row, both axes log10
wo_costs %>%
  ggplot(aes(x = labor_amount, y = part_amount)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "Labor vs Part Costs", x = "Labor Amount ($)", y = "Part Amount ($)")

```

Insights: Shows a positive spread between labor_amount and part_amount.

Majority of records are clustered between \$10–\$1,000, for both labor
and part.

The log-log scale reveals a triangle-shaped distribution:

Suggests that some WOs are part-heavy, some are labor-heavy, and some
are balanced.

There are a few extreme outliers, with very high labor or part costs,
worth investigating.

Implication: Helps understand which WOs are driven by labor vs material.

Could inform budget allocation, outsourcing vs in-house repairs, or
vendor cost benchmarking.

# Average Cost per Work Order Over Time

```{r}
library(dplyr)
library(lubridate)
library(ggplot2)

# Mean line_total per calendar month of the open date, drawn as a line.
# open_date is parsed as month/day/year hour:minute and truncated to the
# first day of its month; missing costs are ignored in the mean.
wo_costs %>%
  group_by(month = floor_date(mdy_hm(open_date), "month")) %>%
  summarise(avg_cost = mean(line_total, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = avg_cost)) +
  geom_line(color = "darkorange") +
  labs(
    title = "Average Cost per WO Over Time",
    x     = "Month",
    y     = "Average Line Total ($)"
  )


```

Insights: Sharp cost spike around mid-to-late 2023 (\~\$2400 avg per
WO), then a rapid decline.

From early 2024 onward, costs stabilize in the \$300–\$500 range
monthly.

This suggests either:

Historical data includes abnormal cases (accidents, cleanups).

Change in processes, cost recording, or vendor contracts starting in
2024.

```{r}
# Table of row counts per calendar month of the open date, in chronological
# order (open_date parsed as month/day/year hour:minute).
wo_costs %>%
  count(month = lubridate::floor_date(lubridate::mdy_hm(open_date), "month")) %>%
  arrange(month)


```

```{r}
library(dplyr)
library(ggplot2)
library(lubridate)

# Monthly summary: mean and median line_total (missing values ignored) and
# row count per calendar month of the open date. open_date is parsed as
# month/day/year hour:minute.
wo_summary <- wo_costs %>%
  mutate(open_dt = mdy_hm(open_date),
         month = floor_date(open_dt, "month")) %>%
  group_by(month) %>%
  summarise(
    avg_cost = mean(line_total, na.rm = TRUE),
    med_cost = median(line_total, na.rm = TRUE),
    count    = n()
  ) %>%
  ungroup()

# Counts are divided by this factor so the bars share the cost axis; the
# secondary axis multiplies by the same factor to display the real counts.
count_scale <- 10

# Mean (solid) and median (dashed) cost lines over scaled count bars.
# `linewidth` replaces the `size` argument, which ggplot2 deprecated for
# lines. The y-axis title is set once, on the scale; a y label given to
# labs() is not used when the scale already has a name.
ggplot(wo_summary, aes(x = month)) +
  geom_col(aes(y = count / count_scale), fill = "grey80", alpha = 0.5) +
  geom_line(aes(y = avg_cost), color = "orange", linewidth = 1) +
  geom_line(aes(y = med_cost), color = "steelblue", linewidth = 1, linetype = "dashed") +
  scale_y_continuous(
    name = "WO Cost ($)",
    sec.axis = sec_axis(~ . * count_scale, name = "WO Count")
  ) +
  labs(
    title = "WO Cost Trends with Volume",
    x = "Month",
    caption = "Orange = Mean | Blue Dashed = Median | Grey Bars = WO Count (scaled)"
  ) +
  theme_minimal()

```

```{r}
library(dplyr)
library(lubridate)

# Parse open_date (month/day/year hour:minute) into a date-time
wo_costs <- wo_costs %>%
  mutate(open_dt = mdy_hm(open_date))

# Spike period: 1 July to 30 November 2023, both days included
spike_start <- ymd("2023-07-01")
spike_end   <- ymd("2023-11-30")

# open_dt is a date-time while the bounds are Dates. The comparison is done
# on the calendar date of open_dt so that both sides have the same class and
# work orders opened at any time on the last day are kept.
wo_spike <- wo_costs %>%
  filter(as_date(open_dt) >= spike_start,
         as_date(open_dt) <= spike_end)

# The 15 most expensive rows of the spike period
wo_spike %>%
  select(wo_nbr, unit_no, open_date, line_total, wo_reason_desc, job_description) %>%
  arrange(desc(line_total)) %>%
  head(15)

# Option 1: See top 10 in console


```

To compare the average work-order cost during the spike period with the
average over the rest of the data, the chunk below computes both averages
and expresses the spike-period average as a percentage of the rest-period
average.

```{r}
library(dplyr)
library(lubridate)

# Parse open_date (month/day/year hour:minute) into a date-time
wo_costs <- wo_costs %>%
  mutate(open_dt = mdy_hm(open_date))

# Spike period: 1 July to 30 November 2023, both days included
spike_start <- ymd("2023-07-01")
spike_end   <- ymd("2023-11-30")

# open_dt is a date-time while the bounds are Dates. Comparing on the
# calendar date keeps both sides the same class and assigns every work order
# opened on 30 November to the spike period rather than to the rest.
spike_period <- wo_costs %>%
  filter(as_date(open_dt) >= spike_start & as_date(open_dt) <= spike_end)

# Everything outside the spike period (rows with an unparseable open date
# fall in neither group)
rest_period <- wo_costs %>%
  filter(as_date(open_dt) < spike_start | as_date(open_dt) > spike_end)

# Average cost per row in each period, ignoring missing costs
avg_spike_cost <- mean(spike_period$line_total, na.rm = TRUE)
avg_rest_cost  <- mean(rest_period$line_total, na.rm = TRUE)

# Spike-period average as a percentage of the rest-period average.
# This is a ratio, not a bounded score: it exceeds 100 whenever spike-period
# work orders cost more on average.
scaled_score <- (avg_spike_cost / avg_rest_cost) * 100

cat("Average WO cost during spike period: $", round(avg_spike_cost, 2), "\n")
cat("Average WO cost during rest period:  $", round(avg_rest_cost, 2), "\n")
cat("Spike-period average as % of rest-period average: ", round(scaled_score, 1), "%\n")


```

Work orders during the spike period were nearly 3.66x more expensive
than the rest of the time.

This confirms that the spike in average WO cost is not due to volume but
due to a significant increase in per-WO expenses, likely caused by a few
extreme-cost jobs.

This confirms that the spike in average WO cost is not due to volume but
due to a significant increase in per-WO expenses, likely caused by a few
extreme-cost jobs.

```{r}
library(dplyr)
library(ggplot2)
library(lubridate)

# Parse open_date (month/day/year hour:minute) into a date-time
wo_costs <- wo_costs %>%
  mutate(open_dt = mdy_hm(open_date))

# Spike period: 1 July to 30 November 2023, both days included
spike_start <- ymd("2023-07-01")
spike_end   <- ymd("2023-11-30")

# open_dt is a date-time while the bounds are Dates. The comparison is done
# on the calendar date of open_dt so that both sides have the same class and
# work orders opened at any time on the last day are kept.
spike_period <- wo_costs %>%
  filter(as_date(open_dt) >= spike_start & as_date(open_dt) <= spike_end)

# Six most expensive rows of the spike period, labelled "WO <nbr> - <unit>"
top_6_wo <- spike_period %>%
  select(wo_nbr, unit_no, open_date, line_total, wo_reason_desc, job_description) %>%
  arrange(desc(line_total)) %>%
  slice_head(n = 6) %>%
  mutate(WO_Label = paste("WO", wo_nbr, "-", unit_no))

# Bars ordered by cost, each annotated with its rounded dollar amount
ggplot(top_6_wo, aes(x = reorder(WO_Label, line_total), y = line_total)) +
  geom_col(fill = "firebrick") +
  geom_text(aes(label = paste0("$", round(line_total, 0))),
            vjust = -0.5, size = 3.5) +
  labs(
    title = "Top 6 Highest Cost Work Orders (Spike Period)",
    x = "Work Order",
    y = "Line Total ($)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

```

Calculate the contribution of each cost type (labor, parts, external
labor, tax) to the Line Total, grouped by Category Class, in R.

```{r}
library(dplyr)

# Cost mix per category class, scaled so that the four components of each
# class add up to 10 (each score lies between 0 and 10 when all component
# totals are non-negative).
#
# The former version divided each component by total line cost and then by
# the sum of those ratios; the line total cancels out of that calculation, so
# the shares are computed directly from the component totals. This removes the
# row-by-row pass and no longer yields NaN for a class whose line total is 0.
# A class whose four component totals are all 0 still yields NaN (0 / 0).
contribution_scaled <- wo_costs %>%
  group_by(category_class) %>%
  summarise(
    total_labor     = sum(labor_amount, na.rm = TRUE),
    total_parts     = sum(part_amount, na.rm = TRUE),
    total_ext_labor = sum(external_labor_cost, na.rm = TRUE),
    # labor tax and parts tax are reported together as one "tax" component
    total_tax       = sum(labor_tax, na.rm = TRUE) + sum(parts_tax, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    total_components = total_labor + total_parts + total_ext_labor + total_tax,
    scale_labor      = round(total_labor     / total_components * 10, 1),
    scale_parts      = round(total_parts     / total_components * 10, 1),
    scale_ext_labor  = round(total_ext_labor / total_components * 10, 1),
    scale_tax        = round(total_tax       / total_components * 10, 1)
  ) %>%
  select(category_class, scale_labor, scale_parts, scale_ext_labor, scale_tax)

# View the result
print(contribution_scaled)

```

line_total is strongly correlated with:

part_amount

labor_amount

labor_tax

parts_tax

Tax rates (labor_tax_rate, parts_tax_rate) show very weak correlation →
likely constant or low-variance fields.

Heatmap of Scaled Contributions

```{r}
library(dplyr)
library(tidyr)
library(ggplot2)

# Reshape contribution_scaled to one row per category class and cost
# component. The "scale_" prefix is stripped from the component names while
# pivoting, so Cost_Type holds labor / parts / ext_labor / tax.
contribution_long <- pivot_longer(
  contribution_scaled,
  cols = starts_with("scale_"),
  names_to = "Cost_Type",
  names_prefix = "scale_",
  values_to = "Score"
)

# One tile per class and component, shaded from white (low) to red (high)
contribution_long %>%
  ggplot(aes(x = Cost_Type, y = category_class, fill = Score)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "firebrick") +
  labs(title = "Heatmap of Scaled Cost Contributions (1–10)",
       x = "Cost Component", y = "Category Class") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

```

```{r}
#install.packages('fmsb')
library(fmsb)
library(tibble)

# Total line cost per category class, used to rank the classes.
# (Ranking by the sum of the four scaled scores, as before, does not work:
# the scores are shares scaled to add up to 10, so that sum is about 10 for
# every class and the resulting "top 5" is arbitrary and can contain ties.)
class_totals <- wo_costs %>%
  group_by(category_class) %>%
  summarise(total_line = sum(line_total, na.rm = TRUE), .groups = "drop")

# Five classes with the highest total line cost among those that have a
# usable cost mix and a non-missing class (the class becomes a row name below,
# and row names cannot be missing). with_ties = FALSE caps the result at 5.
top5 <- contribution_scaled %>%
  filter(!is.na(scale_labor), !is.na(category_class)) %>%
  inner_join(class_totals, by = "category_class") %>%
  slice_max(order_by = total_line, n = 5, with_ties = FALSE) %>%
  select(category_class, scale_labor, scale_parts, scale_ext_labor, scale_tax)

# Format for radar chart: one row per class, class as row name
radar_data <- top5 %>%
  column_to_rownames("category_class") %>%
  select(scale_labor, scale_parts, scale_ext_labor, scale_tax)

# fmsb requires the axis maximum and minimum as the first two rows
radar_data <- rbind(rep(10, 4), rep(0, 4), radar_data)

# One colour per plotted class (there may be fewer than five)
series_cols <- rainbow(nrow(top5))

# Plot radar
radarchart(radar_data,
           pcol = series_cols,
           pfcol = scales::alpha(series_cols, 0.4),
           plwd = 2,
           axistype = 1,
           title = "Radar Chart: Top 5 Category Classes by Cost Mix")
legend("topright", legend = rownames(radar_data)[-c(1, 2)],
       fill = scales::alpha(series_cols, 0.4), cex = 0.6)

```
Line total from planned and unplanned WOs, by year, WO status, damage flag, and company.

```{r}
library(dplyr)
library(lubridate)

# Add the parsed open date-time, its calendar year, and wo_reason (a copy of
# wo_reason_desc) to wo_costs. wo_status, damage and company are used as they
# are.
wo_costs <- mutate(
  wo_costs,
  open_dt   = mdy_hm(open_date),
  year      = year(open_dt),
  wo_reason = wo_reason_desc
)

# Total line cost for every combination of year, WO reason, WO status,
# damage flag and company (missing costs ignored); the result is ungrouped.
wo_summary <- summarise(
  group_by(wo_costs, year, wo_reason, wo_status, damage, company),
  total_cost = sum(line_total, na.rm = TRUE),
  .groups = "drop"
)

```


Visualizations:

```{r}

library(ggplot2)

# A. Cost trend by WO reason and year
# wo_summary holds one row per year x reason x status x damage x company.
# With dodged bars, several rows that share a year and reason are drawn on
# top of each other rather than added up, so the data is first collapsed to
# one total per year and reason.
cost_by_reason_year <- wo_summary %>%
  filter(!is.na(wo_reason)) %>%
  group_by(year, wo_reason) %>%
  summarise(total_cost = sum(total_cost, na.rm = TRUE), .groups = "drop")

ggplot(cost_by_reason_year,
       aes(x = year, y = total_cost, fill = wo_reason)) +
  geom_col(position = "dodge") +
  labs(title = "Total Line Cost by WO Reason per Year",
       x = "Year", y = "Total Line Cost ($)", fill = "WO Reason") +
  theme_minimal()

# B. Cost by company, stacked by WO status, one panel per WO reason.
# Stacking adds up all rows in a bar, so no pre-aggregation is needed here.
ggplot(wo_summary %>% filter(!is.na(company)),
       aes(x = company, y = total_cost, fill = wo_status)) +
  geom_col(position = "stack") +
  facet_wrap(~ wo_reason, scales = "free_y") +
  labs(title = "Cost by Company & WO Status (Faceted by WO Reason)",
       x = "Company", y = "Total Line Cost") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# C. Damage vs no-damage cost, one panel per WO reason (bars are stacked by
# default, so each bar shows the total for its damage value)
ggplot(wo_summary %>% filter(!is.na(damage)),
       aes(x = damage, y = total_cost, fill = damage)) +
  geom_col() +
  facet_wrap(~ wo_reason) +
  labs(title = "Cost by Damage Flag and WO Reason",
       x = "Damage", y = "Total Line Cost") +
  theme_minimal()


```

```{r}
# Total line cost per work-order status (missing costs ignored)
summarise(
  group_by(wo_costs, wo_status),
  total_cost = sum(line_total, na.rm = TRUE),
  .groups = "drop"
)

```


# Summary Table by Category Class

```{r}
```


```{r}
```


```{r}
```


```{r}
```


```{r}
```


```{r}
```


```{r}
```


```{r}
```


```{r}
# Per category-class description: number of rows, mean and median line_total
# (missing costs ignored), listed from highest to lowest mean cost.
class_cost_summary <- summarise(
  group_by(wo_costs, cat_class_desc),
  total_wos         = n(),
  avg_line_total    = mean(line_total, na.rm = TRUE),
  median_line_total = median(line_total, na.rm = TRUE)
)

arrange(class_cost_summary, desc(avg_line_total))

```

# Check for duplicates

```{r}

library(dplyr)