class FileClient:
    """Multipart RDF/POST file upload client for LinkedDataHub file resources.

    Files are not Linked Data: request bodies are bytes with a Content-Type
    rather than RDF graphs, so they get their own client surface instead of
    being grafted onto `LinkedDataClient`. Auth and TLS setup duplicate
    `LinkedDataClient` / `SPARQLClient` by convention: each client in this
    module configures its own ssl_context + opener inline.

    Wire format matches LinkedDataHub's `bin/add-file.sh` script: a
    multipart/form-data body using LDH's RDF/POST dialect, where each `pu`
    (predicate URI) form field is paired with the next `ol` (literal object)
    or `ou` (URI object) field, sharing a blank-node subject named via `sb`.
    The file body itself is carried as a multipart file part labelled `ol`
    with the supplied Content-Type. LDH stores the bytes under its built-in
    `/uploads/{sha1}` namespace and appends the file's RDF description to the
    target document.
    """

    _NFO_FILE_NAME = "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName"
    _NFO_FILE_DATA_OBJECT = "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#FileDataObject"
    _DCT_TITLE = "http://purl.org/dc/terms/title"
    _DCT_DESCRIPTION = "http://purl.org/dc/terms/description"
    _RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    # Fallbacks used when the caller supplies no (or an empty) value.
    _DEFAULT_FILENAME = "upload"
    _DEFAULT_CONTENT_TYPE = "application/octet-stream"

    def __init__(
        self,
        cert_pem_path: Optional[str] = None,
        cert_password: Optional[str] = None,
        verify_ssl: bool = True,
    ):
        """Initialize TLS context + opener; mirrors `LinkedDataClient.__init__`.

        :param cert_pem_path: Path to a PEM file holding the client certificate.
        :param cert_password: Password for the PEM file. The client
            certificate is loaded only when BOTH the path and the password
            are provided.
        :param verify_ssl: When False, hostname checking and certificate
            verification are disabled on the TLS context.
        """
        self.ssl_context = ssl.create_default_context()

        if cert_pem_path and cert_password:
            self.ssl_context.load_cert_chain(
                certfile=cert_pem_path, password=cert_password
            )

        if not verify_ssl:
            # check_hostname must be cleared before verify_mode can be set
            # to CERT_NONE; the ssl module rejects the reverse order.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

        self.opener = urllib.request.build_opener(
            urllib.request.HTTPSHandler(context=self.ssl_context),
            HTTPRedirectHandler308(),
            RetryAfterHandler(),
        )

        self.opener.addheaders = [
            (
                "User-Agent",
                "Web-Algebra/1.0 (LinkedData Processing System; https://github.com/atomgraph/Web-Algebra)",
            )
        ]

    def _rdf_post_fields(
        self,
        file_body: bytes,
        content_type: str,
        title: str,
        description: Optional[str],
        filename: Optional[str],
    ) -> list:
        """Build the ordered RDF/POST form fields describing one file.

        Order is significant: each `pu` field is paired with the `ol` / `ou`
        field that immediately follows it. A plain string value becomes a
        form field; a `(filename, body, content_type)` tuple becomes a file
        part.

        :return: List of `(name, value)` tuples, duplicates allowed.
        """
        file_part = (
            filename or self._DEFAULT_FILENAME,
            file_body,
            # An empty content type would leave the file part without a
            # usable Content-Type, so fall back to a generic binary type.
            content_type or self._DEFAULT_CONTENT_TYPE,
        )
        fields = [
            ("rdf", ""),
            ("sb", "file"),
            ("pu", self._NFO_FILE_NAME),
            ("ol", file_part),
            ("pu", self._DCT_TITLE),
            ("ol", title),
            ("pu", self._RDF_TYPE),
            ("ou", self._NFO_FILE_DATA_OBJECT),
        ]
        # An absent or empty description is simply not asserted.
        if description:
            fields.append(("pu", self._DCT_DESCRIPTION))
            fields.append(("ol", description))
        return fields

    def add_file(
        self,
        target_url: str,
        file_body: bytes,
        content_type: str,
        title: str,
        description: Optional[str] = None,
        filename: Optional[str] = None,
    ) -> Tuple[HTTPResponse, str]:
        """RDF/POST a file to `target_url`.

        :param target_url: The document URI the file's RDF description is
            appended to. Note this is *not* the URI the file ends up at;
            LDH stores the bytes under its own `/uploads/{sha1}` namespace
            regardless of `target_url`.
        :param file_body: Raw file bytes.
        :param content_type: MIME type of the file (e.g. `image/png`). An
            empty value falls back to `application/octet-stream`.
        :param title: `dct:title` literal.
        :param description: Optional `dct:description` literal; omitted from
            the request when None or empty.
        :param filename: Optional filename for the multipart part's
            `Content-Disposition`. Defaults to `"upload"` when absent.
        :return: `(HTTPResponse, sha1_hex)`. The sha1 is computed over
            `file_body` client-side so callers can construct the resulting
            `/uploads/{sha1}` URI without parsing the response body. The
            caller owns the response and is responsible for closing it.
        """
        sha1 = hashlib.sha1(file_body).hexdigest()

        # `encode_multipart_formdata` accepts a list of `(name, value)`
        # tuples and preserves their order, which RDF/POST depends on.
        fields = self._rdf_post_fields(
            file_body, content_type, title, description, filename
        )
        body, content_type_header = encode_multipart_formdata(fields)

        request = urllib.request.Request(
            target_url,
            data=body,
            headers={
                "Content-Type": content_type_header,
                "Accept": "text/turtle",
            },
            method="POST",
        )
        response = self.opener.open(request)
        return response, sha1
class AddFile(Operation, MCPTool):
    """RDF/POST a file to a LinkedDataHub document, returning the minted upload URI.

    The file's RDF description (`nfo:FileDataObject` + filename + title, and
    optionally a description) is appended to the target document; the file
    bytes themselves are stored by LDH at `/uploads/{sha1}` under its
    built-in upload namespace, independent of the target document's URI.

    Unlike the rest of the `ldh-Add*` family, this op does not subclass
    `POST`: file upload uses `multipart/form-data` with LDH's RDF/POST
    dialect rather than an N-triples body, so it carries its own
    `FileClient` instance instead of inheriting `LinkedDataClient` plumbing.
    """

    def model_post_init(self, __context: Any) -> None:
        """Create the `FileClient` once the operation's settings are available."""
        # NOTE(review): TLS verification is unconditionally disabled here,
        # while the client certificate (if configured) is still presented.
        # Confirm this matches the sibling operations' convention before
        # using against non-local hosts.
        self.client = FileClient(
            cert_pem_path=getattr(self.settings, "cert_pem_path", None),
            cert_password=getattr(self.settings, "cert_password", None),
            verify_ssl=False,
        )

    @classmethod
    def name(cls) -> str:
        """Return the operation's registered name."""
        return "ldh-AddFile"

    @classmethod
    def description(cls) -> str:
        """Return the human-readable tool description."""
        return """Adds a file to a LinkedDataHub document via multipart RDF/POST.

        Appends `a nfo:FileDataObject ; nfo:fileName ; dct:title ; ...`
        to the target document and stores the file bytes at
        `/uploads/{sha1}` (LDH's built-in upload namespace).

        Arguments:
        - `url` — URI of the target document to add the file's description to.
        - `file` — absolute local file path. The bytes are read and streamed
          to the server.
        - `title` — human-readable title (`dct:title`).
        - `description` — optional description (`dct:description`).
        - `content_type` — optional MIME-type override; auto-detected from
          the file path if absent.

        Returns a result with `url` (the minted `/uploads/{sha1}` URI
        the file resource is now addressable at) and `status` (HTTP status
        code) bindings.
        """

    @classmethod
    def inputSchema(cls) -> dict:
        """Return the JSON schema of the tool's arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Target document URI to add the file's description to.",
                },
                "file": {
                    "type": "string",
                    "description": "Absolute local file path. The bytes are read and uploaded.",
                },
                "title": {
                    "type": "string",
                    "description": "Title of the file (dct:title).",
                },
                "description": {
                    "type": "string",
                    "description": "Optional description (dct:description).",
                },
                "content_type": {
                    "type": "string",
                    "description": "Optional MIME-type override; auto-detected from path if absent.",
                },
            },
            "required": ["url", "file", "title"],
        }

    def execute(
        self,
        url: URIRef,
        file_path: Literal,
        title: Literal,
        description: Optional[Literal] = None,
        content_type: Optional[Literal] = None,
    ) -> Result:
        """Read a local file and RDF/POST it to `url`, using RDFLib terms.

        Argument types are validated before any I/O happens. The whole file
        is read into memory and uploaded in a single request.

        :param url: Target document URI.
        :param file_path: Local path of the file to upload.
        :param title: `dct:title` of the file.
        :param description: Optional `dct:description`.
        :param content_type: Optional MIME-type override; when absent or
            empty, the type is guessed from the path and falls back to
            `application/octet-stream`.
        :return: A single-row result with `status` (HTTP status code) and
            `url` (the reconstructed `/uploads/{sha1}` URI) bindings.
        :raises TypeError: If an argument is not of the expected RDFLib type.
        :raises OSError: If the file cannot be opened or read.
        """
        if not isinstance(url, URIRef):
            raise TypeError(
                f"AddFile.execute expects url to be URIRef, got {type(url)}"
            )
        if not isinstance(file_path, Literal):
            raise TypeError(
                f"AddFile.execute expects file_path to be Literal, got {type(file_path)}"
            )
        if not isinstance(title, Literal):
            raise TypeError(
                f"AddFile.execute expects title to be Literal, got {type(title)}"
            )
        if description is not None and not isinstance(description, Literal):
            raise TypeError(
                f"AddFile.execute expects description to be Literal or None, got {type(description)}"
            )
        if content_type is not None and not isinstance(content_type, Literal):
            raise TypeError(
                f"AddFile.execute expects content_type to be Literal or None, got {type(content_type)}"
            )

        path_str = str(file_path)
        with open(path_str, "rb") as f:
            body = f.read()

        # An empty override is treated like a missing one so the upload
        # never goes out without a usable MIME type.
        ct = str(content_type) if content_type is not None else ""
        if not ct:
            ct = mimetypes.guess_type(path_str)[0] or "application/octet-stream"

        url_str = str(url)
        logging.info(
            "RDF/POSTing file %s (%d bytes, %s) to <%s>",
            path_str, len(body), ct, url_str,
        )

        response, sha1 = self.client.add_file(
            target_url=url_str,
            file_body=body,
            content_type=ct,
            title=str(title),
            description=str(description) if description is not None else None,
            filename=Path(path_str).name,
        )
        # Only the status code is needed; close the response so the
        # underlying connection is released instead of leaking.
        try:
            status = response.status
        finally:
            response.close()

        # The minted file URI is rebuilt as `<scheme>://<host>/uploads/<sha1>`
        # from the target URL's scheme and host, so callers don't need to
        # thread the base URL through separately.
        parsed = urllib.parse.urlparse(url_str)
        file_uri = f"{parsed.scheme}://{parsed.netloc}/uploads/{sha1}"

        logging.info("AddFile status %s → <%s>", status, file_uri)

        return JSONResult(
            vars=["status", "url"],
            bindings=[
                {
                    "status": Literal(status, datatype=XSD.integer),
                    "url": URIRef(file_uri),
                }
            ],
        )

    def execute_json(
        self, arguments: dict, variable_stack: Optional[list] = None
    ) -> Result:
        """JSON execution: process arguments with strict type checking.

        :param arguments: Operation arguments; `url`, `file` and `title` are
            required, `description` and `content_type` are optional.
        :param variable_stack: Variable scopes passed through to
            `Operation.process_json`; a fresh list is used when omitted.
        :raises TypeError: If `url` does not evaluate to a `URIRef`.
        """
        # Avoid a shared mutable default: each call gets its own list.
        stack = [] if variable_stack is None else variable_stack

        url_data = Operation.process_json(
            self.settings, arguments["url"], self.context, stack
        )
        if not isinstance(url_data, URIRef):
            raise TypeError(
                f"ldh-AddFile expects 'url' to be URIRef, got {type(url_data)}"
            )

        file_data = Operation.process_json(
            self.settings, arguments["file"], self.context, stack
        )
        file_literal = self.to_string_literal(file_data)

        title_data = Operation.process_json(
            self.settings, arguments["title"], self.context, stack
        )
        title_literal = self.to_string_literal(title_data)

        description_literal: Optional[Literal] = None
        if "description" in arguments:
            description_data = Operation.process_json(
                self.settings, arguments["description"], self.context, stack
            )
            description_literal = self.to_string_literal(description_data)

        content_type_literal: Optional[Literal] = None
        if "content_type" in arguments:
            content_type_data = Operation.process_json(
                self.settings, arguments["content_type"], self.context, stack
            )
            content_type_literal = self.to_string_literal(content_type_data)

        return self.execute(
            url_data,
            file_literal,
            title_literal,
            description_literal,
            content_type_literal,
        )

    def mcp_run(self, arguments: dict, context: Any = None) -> Any:
        """MCP execution: plain args → plain results.

        Optional arguments that are missing or explicitly null are treated
        as absent rather than being stringified.
        """
        url = URIRef(arguments["url"])
        file_path = Literal(arguments["file"], datatype=XSD.string)
        title = Literal(arguments["title"], datatype=XSD.string)

        raw_description = arguments.get("description")
        description = (
            Literal(raw_description, datatype=XSD.string)
            if raw_description is not None
            else None
        )
        raw_content_type = arguments.get("content_type")
        content_type = (
            Literal(raw_content_type, datatype=XSD.string)
            if raw_content_type is not None
            else None
        )

        result = self.execute(url, file_path, title, description, content_type)
        url_binding = result.bindings[0]["url"]
        return [types.TextContent(type="text", text=f"File added: {url_binding}")]
class TestLDHAddFilePure:
    """Argument type validation of `ldh-AddFile.execute`.

    Every case passes one argument of the wrong RDFLib term type and expects
    a `TypeError`; no file is read and no request is sent.
    """

    @staticmethod
    def _make_op(settings):
        """Instantiate the operation registered under `ldh-AddFile`."""
        op_class = Operation.get("ldh-AddFile")
        return op_class(settings=settings)

    def test_wrong_url_type_raises(self, settings):
        add_file = self._make_op(settings)
        bad_url = Literal("not-a-uri")
        with pytest.raises(TypeError):
            add_file.execute(bad_url, Literal("/abs/path.png"), Literal("Title"))

    def test_wrong_file_path_type_raises(self, settings):
        add_file = self._make_op(settings)
        bad_path = URIRef("not-a-literal")
        with pytest.raises(TypeError):
            add_file.execute(URIRef("https://example.org/"), bad_path, Literal("Title"))

    def test_wrong_title_type_raises(self, settings):
        add_file = self._make_op(settings)
        bad_title = URIRef("not-a-literal")
        with pytest.raises(TypeError):
            add_file.execute(
                URIRef("https://example.org/"), Literal("/abs/path.png"), bad_title
            )

    def test_wrong_description_type_raises(self, settings):
        add_file = self._make_op(settings)
        bad_description = URIRef("not-a-literal")
        with pytest.raises(TypeError):
            add_file.execute(
                URIRef("https://example.org/"),
                Literal("/abs/path.png"),
                Literal("Title"),
                description=bad_description,
            )

    def test_wrong_content_type_type_raises(self, settings):
        add_file = self._make_op(settings)
        bad_content_type = URIRef("not-a-literal")
        with pytest.raises(TypeError):
            add_file.execute(
                URIRef("https://example.org/"),
                Literal("/abs/path.png"),
                Literal("Title"),
                content_type=bad_content_type,
            )


@pytest.mark.ldh
class TestLDHAddFileLive:
    """Placeholder for the live LinkedDataHub test; currently skipped."""

    @pytest.mark.skip(reason="UNCLEAR(spec): return type `Any`. Covered by integration LDH composition fixture instead.")
    def test_basic(self, settings_with_auth):
        pass
upload-time = "2026-05-07T16:13:17.151Z" }, +] + [[package]] name = "uvicorn" version = "0.35.0" @@ -879,13 +888,14 @@ wheels = [ [[package]] name = "web-algebra" -version = "1.0.0" +version = "1.1.0" source = { editable = "." } dependencies = [ { name = "mcp", extra = ["cli"] }, { name = "openai" }, { name = "pydantic-settings" }, { name = "rdflib" }, + { name = "urllib3" }, ] [package.dev-dependencies] @@ -900,6 +910,7 @@ requires-dist = [ { name = "openai" }, { name = "pydantic-settings" }, { name = "rdflib", specifier = "==7.0.0" }, + { name = "urllib3" }, ] [package.metadata.requires-dev]