From 57f04fbc464a32b45d8469f9a5e643b2c670af8b Mon Sep 17 00:00:00 2001
From: Matthew Bernhardt <mjbernha@mit.edu>
Date: Mon, 22 Dec 2025 17:24:56 -0500
Subject: [PATCH 1/2] Move existing normalize method to a vocabulary

** Why are these changes being introduced:

We need a way to clean up messy format / content type values that come
back from Primo and TIMDEX. While ultimately we hope to fix data at the
source, it is also helpful to have a cleanup capability here.

We also want to standardize how these fields are treated across data
sources, because currently Primo, TIMDEX, and GeoData are all slightly
different.

** Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/use-299

** How does this address that need:

We start by moving an existing "normalize_type" method out of the Primo
normalizer model and into a standalone class (a "format" vocabulary).

The new model includes additional entries mentioned in the ticket as
needing to be cleaned up.

There are no changes to the view layer here, because the "format" field
doesn't change shape - only where the work gets performed.

** Document any side effects to this change:

This also changes how the pnx.display.type field is handled, in that
each value is passed through the vocabulary individually, rather than
after having been joined together. It isn't clear to me whether Primo
ever actually returns multiple values, so this change may not impact
anything - but it bears mentioning, and will be needed when we get to
the TIMDEX side of things.
---
 app/models/normalize_primo_record.rb    | 11 +----------
 app/models/vocabularies/format.rb       | 26 +++++++++++++++++++++++++
 test/models/vocabularies/format_test.rb | 15 ++++++++++++++
 3 files changed, 42 insertions(+), 10 deletions(-)
 create mode 100644 app/models/vocabularies/format.rb
 create mode 100644 test/models/vocabularies/format_test.rb

diff --git a/app/models/normalize_primo_record.rb b/app/models/normalize_primo_record.rb
index 35999f34..ba43cbfe 100644
--- a/app/models/normalize_primo_record.rb
+++ b/app/models/normalize_primo_record.rb
@@ -92,7 +92,7 @@ def year
   def format
     return unless @record['pnx']['display']['type']
 
-    normalize_type(@record['pnx']['display']['type'].join)
+    @record['pnx']['display']['type'].map { |term| Vocabularies::Format.lookup(term) }&.join(' ; ')
   end
 
   # While the links object in the Primo response often contains more than the Alma openurl, that is
@@ -277,15 +277,6 @@ def encode_author(author)
     URI.encode_uri_component(author)
   end
 
-  def normalize_type(type)
-    r_types = {
-      'BKSE' => 'eBook',
-      'reference_entry' => 'Reference Entry',
-      'Book_chapter' => 'Book Chapter'
-    }
-    r_types[type] || type.capitalize
-  end
-
   # It's possible we'll encounter records that use a different server,
   # so we want to test against our expected server to guard against
   # malformed URLs. This assumes all URL strings begin with https://.
diff --git a/app/models/vocabularies/format.rb b/app/models/vocabularies/format.rb
new file mode 100644
index 00000000..2018cfbb
--- /dev/null
+++ b/app/models/vocabularies/format.rb
@@ -0,0 +1,26 @@
+module Vocabularies
+  class Format
+    # FORMAT_MAPPINGS is an object listing all the machine-friendly format values we have encountered from TIMDEX or
+    # Primo, and the human-friendly values we want to normalize to. Entries should be alphabetized for easier
+    # maintenance.
+    FORMAT_MAPPINGS = {
+      'bkse' => 'eBook',
+      'book_chapter' => 'Book Chapter',
+      'conference_proceeding' => 'Conference Proceeding',
+      'magazinearticle' => 'Magazine Article',
+      'newsletterarticle' => 'Newsletter Article',
+      'reference_entry' => 'Reference Entry',
+      'researchdatabases' => 'Research Database'
+    }.freeze
+
+    # The lookup method attemps to look up a human-friendly value for any of the format values we get back from our
+    # source systems. The fetch method used allows a default value, which is what happens when a more human-friendly
+    # value isn't found in the FORMAT_MAPPINGS constant.
+    #
+    # @param value [String] A format value to be looked up, if a better version exists.
+    # @return [String, nil] The cleaned up version, or nil if a nil was submited.
+    def self.lookup(value)
+      FORMAT_MAPPINGS.fetch(value.downcase, value&.capitalize)
+    end
+  end
+end
diff --git a/test/models/vocabularies/format_test.rb b/test/models/vocabularies/format_test.rb
new file mode 100644
index 00000000..ceff7daf
--- /dev/null
+++ b/test/models/vocabularies/format_test.rb
@@ -0,0 +1,15 @@
+require 'test_helper'
+
+class VocabularyFormatTest < ActiveSupport::TestCase
+  test 'lookup method returns better values where we know them' do
+    value = 'BKSE'
+    output = Vocabularies::Format.lookup(value)
+    assert_equal output, 'eBook'
+  end
+
+  test 'lookup method returns sentence case as a default' do
+    value = 'UNEXPECTED VALUE'
+    output = Vocabularies::Format.lookup(value)
+    assert_equal output, 'Unexpected value'
+  end
+end

From 81db800e59b9e16706f766ef7f56ce99b2dfb76b Mon Sep 17 00:00:00 2001
From: Matthew Bernhardt <mjbernha@mit.edu>
Date: Mon, 5 Jan 2026 16:51:24 -0500
Subject: [PATCH 2/2] Extend new format vocabulary model to TIMDEX

** Why are these changes being introduced:

With the format vocabulary model in place, we now need to tweak the
TIMDEX normalizer to take advantage of it, for both USE and GeoData
instances.

Additionally, it seems like our codebase doesn't match the current
structure of TIMDEX when it comes to the contentType field. The codebase
assumes a key-value object structure, but TIMDEX is actually returning
just a list of values. This does not cause problems because we use
.each blocks and not .map blocks, so the provided data doesn't get
updated, and just passes to the next code block.

** Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/use-299

** How does this address that need:

The Timdex normalizer gets updated - for both format and content_type
fields - to use the new Vocabularies::Format.lookup method. Similarly
to the Primo normalizer, values are passed individually via a map block,
and then joined together into a single string that is passed to the view
layer.

The view templates are then simplified, for both USE and GeoData, to
render only the single string, without need for an each block or join.

** Document any side effects to this change:

There are a few things to call out:

- The TIMDEX normalizer now has two nearly identical methods: format and
content_type. They differ only when contentType is absent (format
returns an empty string, content_type returns nil). This isn't great,
but cleaning it up feels like it is outside the scope of the current
ticket. I'm not sure how long this rabbit hole will go if I start trying
to collapse down to only using one field, and this ticket has gone on
too long already.

- The existing geodata view template has a conditional branch that is
used by the record view, which does not use the normalizer. This
branch is unchanged here, so the resulting view template has different
logic depending on whether the search or record view is calling that
shared partial. The each block still isn't necessary, but I'm trying
not to spider into different problems on this ticket.

- Is it a code smell that both the Primo and TIMDEX paths use the same
field&.map { ... }&.join() logic? Perhaps. At the moment, I like the
format vocabulary dealing only with single values, leaving the
normalizer to deal with the list as a whole - but I'm open to feedback
here.

- Tangential test mocks and fixtures get updated to match the data I'm
seeing with TIMDEX (switching from a key-value object to a list of
values).
---
 app/models/normalize_timdex_record.rb       | 7 +++++--
 app/views/search/_result.html.erb           | 2 +-
 app/views/shared/_geo_data_info.html.erb    | 3 ++-
 test/controllers/search_controller_test.rb  | 8 ++++----
 test/fixtures/timdex/full_record.json       | 5 +----
 test/models/normalize_timdex_record_test.rb | 5 +----
 6 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/app/models/normalize_timdex_record.rb b/app/models/normalize_timdex_record.rb
index 440fe951..9684a11c 100644
--- a/app/models/normalize_timdex_record.rb
+++ b/app/models/normalize_timdex_record.rb
@@ -102,10 +102,11 @@ def year
     end
   end
 
+  # This is the same as the content_type field below.
   def format
     return '' unless @record['contentType']
 
-    @record['contentType'].map { |type| type['value'] }.join(' ; ')
+    @record['contentType']&.map { |term| Vocabularies::Format.lookup(term) }&.join(' ; ')
   end
 
   def links
@@ -158,8 +159,10 @@ def identifier
   end
 
   # TIMDEX-specific methods
+
+  # This is the same as the format field above.
   def content_type
-    @record['contentType']
+    @record['contentType']&.map { |term| Vocabularies::Format.lookup(term) }&.join(' ; ')
   end
 
   def dates
diff --git a/app/views/search/_result.html.erb b/app/views/search/_result.html.erb
index d0f91bd3..ab39522e 100644
--- a/app/views/search/_result.html.erb
+++ b/app/views/search/_result.html.erb
@@ -11,7 +11,7 @@
     <div class="result-metadata">
 
       <p class="pub-info">
-        <span><%= result[:content_type]&.each { |type| type['value'] }&.join(' ; ') %></span>
+        <span><%= result[:content_type] %></span>
         
         <% if result[:date_range].present? %>
           <span><%= result[:date_range] %></span>
diff --git a/app/views/shared/_geo_data_info.html.erb b/app/views/shared/_geo_data_info.html.erb
index 74254731..5e7b9fe4 100644
--- a/app/views/shared/_geo_data_info.html.erb
+++ b/app/views/shared/_geo_data_info.html.erb
@@ -1,7 +1,8 @@
 <ul class="list-inline">
   <% if metadata[:content_type] %>
-    <li><%= metadata[:content_type]&.each { |type| type['value'] }&.join(' ; ') %></li>
+    <li><%= metadata[:content_type] %></li>
   <% elsif metadata['contentType'] %>
+    <% # This branch is used by the record view, which is not passed through a normalizer. %>
     <li><%= metadata['contentType']&.each { |type| type['value'] }&.join(' ; ') %></li>
   <% end %>
 
diff --git a/test/controllers/search_controller_test.rb b/test/controllers/search_controller_test.rb
index 330ef6f6..d0332447 100644
--- a/test/controllers/search_controller_test.rb
+++ b/test/controllers/search_controller_test.rb
@@ -95,7 +95,7 @@ def mock_timdex_search_success
       'api' => 'timdex',
       'title' => 'Sample TIMDEX Document Title',
       'timdexRecordId' => 'sample-record-123',
-      'contentType' => [{ 'value' => 'Article' }],
+      'contentType' => ['Article'],
       'dates' => [{ 'kind' => 'Publication date', 'value' => '2023' }],
       'contributors' => [{ 'value' => 'Foo Barston', 'kind' => 'Creator' }],
       'highlight' => [
@@ -143,7 +143,7 @@ def mock_timdex_search_all_tab
       'api' => 'timdex',
       'title' => 'Sample TIMDEX Document Title',
       'timdexRecordId' => 'sample-record-123',
-      'contentType' => [{ 'value' => 'Article' }],
+      'contentType' => ['Article'],
       'dates' => [{ 'kind' => 'Publication date', 'value' => '2023' }],
       'contributors' => [{ 'value' => 'Foo Barston', 'kind' => 'Creator' }],
       'highlight' => [
@@ -186,7 +186,7 @@ def mock_timdex_search_with_hits(total_hits)
       {
         'title' => "Sample TIMDEX Document Title #{i}",
         'timdexRecordId' => "sample-record-#{i}",
-        'contentType' => [{ 'value' => 'Article' }],
+        'contentType' => ['Article'],
         'dates' => [{ 'kind' => 'Publication date', 'value' => '2023' }],
         'contributors' => [{ 'value' => "Creator #{i}", 'kind' => 'Creator' }],
         'sourceLink' => "https://example.com/record#{i}"
@@ -448,7 +448,7 @@ def mock_timdex_search_with_hits(total_hits)
       'api' => 'timdex',
       'title' => 'Sample TIMDEX Document Title',
       'timdexRecordId' => 'sample-record-123',
-      'contentType' => [{ 'value' => 'Article' }],
+      'contentType' => ['Article'],
       'dates' => [{ 'kind' => 'Publication date', 'value' => '2023' }],
       'contributors' => [{ 'value' => 'Foo Barston', 'kind' => 'Creator' }],
       'highlight' => [],
diff --git a/test/fixtures/timdex/full_record.json b/test/fixtures/timdex/full_record.json
index 58927aa8..f9ee2f46 100644
--- a/test/fixtures/timdex/full_record.json
+++ b/test/fixtures/timdex/full_record.json
@@ -2,10 +2,7 @@
   "timdexRecordId": "test-record-123",
   "title": "Sample TIMDEX Record for Testing",
   "source": "Test Repository",
-  "contentType": [
-    {"value": "Dataset"},
-    {"value": "Geospatial data"}
-  ],
+  "contentType": ["Dataset", "Geospatial data"],
   "dates": [
     {"kind": "Publication date", "value": "2023-01-15"},
     {"kind": "Coverage", "value": "2020-2023"}
diff --git a/test/models/normalize_timdex_record_test.rb b/test/models/normalize_timdex_record_test.rb
index a166e797..4796364c 100644
--- a/test/models/normalize_timdex_record_test.rb
+++ b/test/models/normalize_timdex_record_test.rb
@@ -185,10 +185,7 @@ def minimal_record
   # Test TIMDEX-specific fields
   test 'includes TIMDEX-specific content_type field' do
     normalized = NormalizeTimdexRecord.new(full_record, 'test').normalize
-    expected_content_type = [
-      { 'value' => 'Dataset' },
-      { 'value' => 'Geospatial data' }
-    ]
+    expected_content_type = 'Dataset ; Geospatial data'
     assert_equal expected_content_type, normalized[:content_type]
   end