Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions formal-semantics.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,12 @@ Abstract: URI × Literal × Maybe Literal × Maybe Literal × Maybe Literal →
Python: def execute(self, url: URIRef, value: Literal, title: Literal = None, description: Literal = None, fragment: Literal = None) -> Any
```

**ldh-AddFile** - Add file (binary) to LinkedDataHub document via multipart RDF/POST
```
Abstract: URI × Literal × Literal × Maybe Literal × Maybe Literal → Any
Python: def execute(self, url: URIRef, file_path: Literal, title: Literal, description: Literal = None, content_type: Literal = None) -> Any
```

**ldh-RemoveBlock** - Remove content block from LinkedDataHub document
```
Abstract: URI × Maybe URI → Any
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"openai",
"mcp[cli]==1.10.1",
"pydantic-settings",
"urllib3",
]

[project.urls]
Expand Down
122 changes: 121 additions & 1 deletion src/web_algebra/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Optional
from typing import Optional, Tuple
import hashlib
import ssl
import json
import time
Expand All @@ -10,6 +11,7 @@
from http.client import HTTPResponse
from rdflib import Graph
from rdflib.plugins.sparql.parser import parseQuery
from urllib3.filepost import encode_multipart_formdata


MEDIA_TYPES = {
Expand Down Expand Up @@ -194,6 +196,124 @@ def patch(self, url: str, sparql_update: str) -> HTTPResponse:
return self.opener.open(request)


class FileClient:
"""Multipart RDF/POST file upload for LinkedDataHub file resources.

Files are not Linked Data — request bodies are bytes with a Content-Type
rather than RDF graphs — so they get their own client surface instead
of being grafted onto `LinkedDataClient`. Auth and TLS setup duplicate
`LinkedDataClient` / `SPARQLClient` by convention: each client in this
module configures its own ssl_context + opener inline.

Wire format matches LinkedDataHub's `bin/add-file.sh` script: a
multipart/form-data body using LDH's RDF/POST dialect where each
`pu=<predicate>` form field is paired with the next `ol=<literal>` or
`ou=<uri>` field, sharing a blank-node subject named via `sb=`. The
file body itself is carried as a multipart file part labelled `ol`
with the supplied Content-Type. LDH stores the bytes under its
built-in `/uploads/{sha1}` namespace and appends the file's RDF
description (filename, MIME type, sha1, title) to the target document.
"""

_NFO_FILE_NAME = "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName"
_NFO_FILE_DATA_OBJECT = "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#FileDataObject"
_DCT_TITLE = "http://purl.org/dc/terms/title"
_DCT_DESCRIPTION = "http://purl.org/dc/terms/description"
_RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

def __init__(
self,
cert_pem_path: Optional[str] = None,
cert_password: Optional[str] = None,
verify_ssl: bool = True,
):
"""Initialize TLS context + opener; mirrors `LinkedDataClient.__init__`."""
self.ssl_context = ssl.create_default_context()

if cert_pem_path and cert_password:
self.ssl_context.load_cert_chain(
certfile=cert_pem_path, password=cert_password
)

if not verify_ssl:
self.ssl_context.check_hostname = False
self.ssl_context.verify_mode = ssl.CERT_NONE

self.opener = urllib.request.build_opener(
urllib.request.HTTPSHandler(context=self.ssl_context),
HTTPRedirectHandler308(),
RetryAfterHandler(),
)

self.opener.addheaders = [
(
"User-Agent",
"Web-Algebra/1.0 (LinkedData Processing System; https://github.com/atomgraph/Web-Algebra)",
)
]

def add_file(
self,
target_url: str,
file_body: bytes,
content_type: str,
title: str,
description: Optional[str] = None,
filename: Optional[str] = None,
) -> Tuple[HTTPResponse, str]:
"""RDF/POST a file to `target_url`.

:param target_url: The document URI the file's RDF description is
appended to. Note this is *not* the URI the file ends up at —
LDH stores the bytes under its own `/uploads/{sha1}` namespace
regardless of `target_url`.
:param file_body: Raw file bytes.
:param content_type: MIME type of the file (e.g. `image/png`).
:param title: `dct:title` literal.
:param description: Optional `dct:description` literal.
:param filename: Optional filename for the multipart part's
`Content-Disposition`. Defaults to `"upload"` when absent;
LDH does not depend on this value for URI minting.
:return: `(HTTPResponse, sha1_hex)`. The sha1 is computed over
`file_body` client-side so callers can construct the resulting
`<base>/uploads/{sha1}` URI without parsing the response body.
"""
sha1 = hashlib.sha1(file_body).hexdigest()

# `encode_multipart_formdata` accepts a list of `(name, value)`
# tuples — duplicates allowed, order preserved. A plain string/bytes
# value becomes a form field; a `(filename, body, content_type)`
# tuple becomes a file part. RDF/POST relies on this ordering
# because each `pu=<predicate>` field is paired with the next
# `ol=<literal>` / `ou=<uri>` field by LDH's parser.
fields: list[tuple[str, object]] = [
("rdf", ""),
("sb", "file"),
("pu", self._NFO_FILE_NAME),
("ol", (filename or "upload", file_body, content_type)),
("pu", self._DCT_TITLE),
("ol", title),
("pu", self._RDF_TYPE),
("ou", self._NFO_FILE_DATA_OBJECT),
]
if description:
fields.extend([
("pu", self._DCT_DESCRIPTION),
("ol", description),
])

body, content_type_header = encode_multipart_formdata(fields)
headers = {
"Content-Type": content_type_header,
"Accept": "text/turtle",
}
request = urllib.request.Request(
target_url, data=body, headers=headers, method="POST"
)
response = self.opener.open(request)
return response, sha1


class SPARQLClient:
def __init__(
self,
Expand Down
228 changes: 228 additions & 0 deletions src/web_algebra/operations/linkeddatahub/add_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
from typing import Any, Optional
import logging
import mimetypes
import urllib.parse
from pathlib import Path

from mcp import types
from rdflib import Literal, URIRef
from rdflib.namespace import XSD
from rdflib.query import Result

from web_algebra.client import FileClient
from web_algebra.json_result import JSONResult
from web_algebra.mcp_tool import MCPTool
from web_algebra.operation import Operation


class AddFile(Operation, MCPTool):
"""RDF/POST a file to a LinkedDataHub document, returning the minted upload URI.

The file's RDF description (`nfo:FileDataObject` + filename + MIME type +
sha1 + title) is appended to the target document; the file bytes
themselves are stored by LDH at `<base>/uploads/{sha1}` under its
built-in upload namespace, independent of the target document's URI.

Unlike the rest of the `ldh-Add*` family, this op does not subclass
`POST` — file upload uses `multipart/form-data` with LDH's RDF/POST
dialect rather than an N-triples body, so it carries its own
`FileClient` instance instead of inheriting `LinkedDataClient` plumbing.
"""

def model_post_init(self, __context: Any) -> None:
self.client = FileClient(
cert_pem_path=getattr(self.settings, "cert_pem_path", None),
cert_password=getattr(self.settings, "cert_password", None),
verify_ssl=False,
)

@classmethod
def name(cls):
return "ldh-AddFile"

@classmethod
def description(cls) -> str:
return """Adds a file to a LinkedDataHub document via multipart RDF/POST.

Appends `a nfo:FileDataObject ; nfo:fileName ; dct:title ; ...`
to the target document and stores the file bytes at
`<base>/uploads/{sha1}` (LDH's built-in upload namespace).

Arguments:
- `url` — URI of the target document to add the file's description to.
- `file` — absolute local file path. The bytes are read and streamed
to the server.
- `title` — human-readable title (`dct:title`).
- `description` — optional description (`dct:description`).
- `content_type` — optional MIME-type override; auto-detected from
the file path if absent.

Returns a result with `url` (the minted `<base>/uploads/{sha1}` URI
the file resource is now addressable at) and `status` (HTTP status
code) bindings.
"""

@classmethod
def inputSchema(cls) -> dict:
return {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "Target document URI to add the file's description to.",
},
"file": {
"type": "string",
"description": "Absolute local file path. The bytes are read and uploaded.",
},
"title": {
"type": "string",
"description": "Title of the file (dct:title).",
},
"description": {
"type": "string",
"description": "Optional description (dct:description).",
},
"content_type": {
"type": "string",
"description": "Optional MIME-type override; auto-detected from path if absent.",
},
},
"required": ["url", "file", "title"],
}

def execute(
self,
url: URIRef,
file_path: Literal,
title: Literal,
description: Optional[Literal] = None,
content_type: Optional[Literal] = None,
) -> Result:
"""Pure function: RDF/POST a file from disk with RDFLib terms."""
if not isinstance(url, URIRef):
raise TypeError(
f"AddFile.execute expects url to be URIRef, got {type(url)}"
)
if not isinstance(file_path, Literal):
raise TypeError(
f"AddFile.execute expects file_path to be Literal, got {type(file_path)}"
)
if not isinstance(title, Literal):
raise TypeError(
f"AddFile.execute expects title to be Literal, got {type(title)}"
)
if description is not None and not isinstance(description, Literal):
raise TypeError(
f"AddFile.execute expects description to be Literal or None, got {type(description)}"
)
if content_type is not None and not isinstance(content_type, Literal):
raise TypeError(
f"AddFile.execute expects content_type to be Literal or None, got {type(content_type)}"
)

path_str = str(file_path)
with open(path_str, "rb") as f:
body = f.read()

ct: Optional[str] = str(content_type) if content_type is not None else None
if ct is None:
ct, _ = mimetypes.guess_type(path_str)
if ct is None:
ct = "application/octet-stream"

url_str = str(url)
logging.info(
"RDF/POSTing file %s (%d bytes, %s) to <%s>",
path_str, len(body), ct, url_str,
)

response, sha1 = self.client.add_file(
target_url=url_str,
file_body=body,
content_type=ct,
title=str(title),
description=str(description) if description is not None else None,
filename=Path(path_str).name,
)

# The minted file URI lives at `<scheme>://<host>/uploads/<sha1>`
# regardless of which target document we RDF/POSTed to. Reconstruct
# from the target URL's host so callers don't need to thread the
# base URL through separately.
parsed = urllib.parse.urlparse(url_str)
file_uri = f"{parsed.scheme}://{parsed.netloc}/uploads/{sha1}"

logging.info("AddFile status %s → <%s>", response.status, file_uri)

return JSONResult(
vars=["status", "url"],
bindings=[
{
"status": Literal(response.status, datatype=XSD.integer),
"url": URIRef(file_uri),
}
],
)

def execute_json(self, arguments: dict, variable_stack: list = []) -> Result:
"""JSON execution: process arguments with strict type checking."""
url_data = Operation.process_json(
self.settings, arguments["url"], self.context, variable_stack
)
if not isinstance(url_data, URIRef):
raise TypeError(
f"ldh-AddFile expects 'url' to be URIRef, got {type(url_data)}"
)

file_data = Operation.process_json(
self.settings, arguments["file"], self.context, variable_stack
)
file_literal = self.to_string_literal(file_data)

title_data = Operation.process_json(
self.settings, arguments["title"], self.context, variable_stack
)
title_literal = self.to_string_literal(title_data)

description_literal: Optional[Literal] = None
if "description" in arguments:
description_data = Operation.process_json(
self.settings, arguments["description"], self.context, variable_stack
)
description_literal = self.to_string_literal(description_data)

content_type_literal: Optional[Literal] = None
if "content_type" in arguments:
content_type_data = Operation.process_json(
self.settings, arguments["content_type"], self.context, variable_stack
)
content_type_literal = self.to_string_literal(content_type_data)

return self.execute(
url_data,
file_literal,
title_literal,
description_literal,
content_type_literal,
)

def mcp_run(self, arguments: dict, context: Any = None) -> Any:
"""MCP execution: plain args → plain results."""
url = URIRef(arguments["url"])
file_path = Literal(arguments["file"], datatype=XSD.string)
title = Literal(arguments["title"], datatype=XSD.string)
description = (
Literal(arguments["description"], datatype=XSD.string)
if "description" in arguments
else None
)
content_type = (
Literal(arguments["content_type"], datatype=XSD.string)
if "content_type" in arguments
else None
)

result = self.execute(url, file_path, title, description, content_type)
url_binding = result.bindings[0]["url"]
return [types.TextContent(type="text", text=f"File added: {url_binding}")]
Loading
Loading