Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD
- Optional filter to specify which source compression format should be converted. Use with `--convert-to` to convert only files with a specific compression format. Example: `--convert-to gz --convert-from bz2` converts only `.bz2` files to `.gz`, leaving other formats unchanged.
- `--validate-checksum`
- Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted.
- **Filters (Pipe syntax)**
- You can filter files within a version/artifact/group using a pipe-separated syntax: `$URI|filter1|filter2`.
- Content variants: `key=value` (e.g. `lang=en`) or just `value` (e.g. `en`) to match any variant.
- Format: `.extension` (e.g. `.ttl`).
- Compression: `..compression` (e.g. `..gz`).
- Example: `databusclient download "https://.../version|lang=en|.ttl|..gz"`

**Help and further information on download command:**
```bash
Expand Down Expand Up @@ -337,6 +343,7 @@ Options:
https://cloud.example.com/remote.php/webdav)
--remote TEXT rclone remote name (e.g., 'nextcloud')
--path TEXT Remote path on Nextcloud (e.g., 'datasets/mydataset')
--dry-run Generate and print the JSON-LD without deploying (preview only)
--help Show this message and exit.
```

Expand Down
144 changes: 114 additions & 30 deletions databusclient/api/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,22 +752,24 @@ def _download_version(
convert_to: str = None,
convert_from: str = None,
validate_checksum: bool = False,
filters: List[str] = None,
) -> None:
"""Download all files in a databus artifact version.
"""Download matching files in a databus artifact version.

Args:
uri: The full databus artifact version URI.
localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory.
vault_token_file: Path to Vault refresh token file for protected downloads.
databus_key: Databus API key for protected downloads.
uri: The full databus artifact version URI (base URI without filters).
localDir: Local directory to download files to.
vault_token_file: Path to Vault refresh token file.
databus_key: Databus API key.
auth_url: Keycloak token endpoint URL.
client_id: Client ID for token exchange.
convert_to: Target compression format for on-the-fly conversion.
convert_to: Target compression format.
convert_from: Optional source compression format filter.
validate_checksum: Whether to validate checksums after downloading.
validate_checksum: Whether to validate checksums.
filters: Optional list of filters (content variants, format, compression).
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str, filters=filters)
# build url -> checksum mapping from JSON-LD when available
checksums: dict = {}
try:
Expand Down Expand Up @@ -800,20 +802,22 @@ def _download_artifact(
convert_to: str = None,
convert_from: str = None,
validate_checksum: bool = False,
filters: List[str] = None,
) -> None:
"""Download files in a databus artifact.

Args:
uri: The full databus artifact URI.
localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory.
all_versions: If True, download all versions of the artifact; otherwise, only download the latest version.
vault_token_file: Path to Vault refresh token file for protected downloads.
databus_key: Databus API key for protected downloads.
localDir: Local directory to download files to.
all_versions: If True, download all versions; otherwise, only latest.
vault_token_file: Path to Vault refresh token file.
databus_key: Databus API key.
auth_url: Keycloak token endpoint URL.
client_id: Client ID for token exchange.
convert_to: Target compression format for on-the-fly conversion.
convert_to: Target compression format.
convert_from: Optional source compression format filter.
validate_checksum: Whether to validate checksums after downloading.
validate_checksum: Whether to validate checksums.
filters: Optional list of filters to apply to each version's files.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
versions = _get_databus_versions_of_artifact(json_str, all_versions=all_versions)
Expand All @@ -822,7 +826,9 @@ def _download_artifact(
for version_uri in versions:
print(f"Downloading version: {version_uri}")
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
file_urls = _get_file_download_urls_from_artifact_jsonld(
json_str, filters=filters
)
# extract checksums for this version
checksums: dict = {}
try:
Expand Down Expand Up @@ -882,14 +888,73 @@ def _get_databus_versions_of_artifact(
return version_urls[0]


def _get_file_download_urls_from_artifact_jsonld(json_str: str) -> List[str]:
def _matches_filters(node: dict, filters: List[str]) -> bool:
"""Check if a JSON-LD node matches the given filters.

Filters can be:
- .extension (e.g. .ttl)
- ..compression (e.g. ..gz)
- key=value (content variant)
- value (match any content variant value)
"""
if not filters:
return True

for f in filters:
if f.startswith(".."):
# Compression filter
expected = f[2:].lower()
actual = str(node.get("compression", "")).lower()
if actual != expected:
return False
elif f.startswith("."):
# Format extension filter
expected = f[1:].lower()
actual = str(node.get("formatExtension", "")).lower()
if actual != expected:
return False
Comment on lines +903 to +915
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Fall back to the file name when formatExtension or compression is missing.

file_format and compression are optional in published metadata, so .ttl / ..gz currently fail on otherwise valid Part nodes that only expose file. That makes the new filter syntax silently skip matching files on those datasets.

💡 Localized fix
     for f in filters:
         if f.startswith(".."):
             # Compression filter
             expected = f[2:].lower()
-            actual = str(node.get("compression", "")).lower()
+            actual = str(node.get("compression", "")).lower()
+            if not actual:
+                actual = _detect_compression_format(str(node.get("file", ""))) or ""
             if actual != expected:
                 return False
         elif f.startswith("."):
             # Format extension filter
             expected = f[1:].lower()
-            actual = str(node.get("formatExtension", "")).lower()
+            actual = str(node.get("formatExtension", "")).lower()
+            if not actual:
+                path = urlparse(str(node.get("file", ""))).path
+                basename = os.path.basename(path).lower()
+                basename = re.sub(r"\.(bz2|gz|xz)$", "", basename)
+                actual = basename.rsplit(".", 1)[-1] if "." in basename else ""
             if actual != expected:
                 return False
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@databusclient/api/download.py` around lines 903 - 915, The filter logic in
the for-loop that checks f.startswith(".") and f.startswith("..") currently
reads formatExtension and compression from node.get(...) only, causing valid
Part nodes with only a "file" field to fail; update the checks in that loop to
fall back to extracting the extension and compression from node["file"] when
node.get("formatExtension") or node.get("compression") are empty—use the file
name (node.get("file") or node["file"]) to derive the format extension (e.g.,
the suffix after the last '.') and the compression (e.g., a trailing
.gz/.bz2/etc.), then compare those derived values (lowercased) against expected
in the existing f.startswith(".") and f.startswith("..") branches so the filters
match when metadata fields are absent.

elif "=" in f:
# Specific content variant key=value
key, val = f.split("=", 1)
# Try various common prefixes
actual = None
for prefix in ["dcv:", "dataid-cv:", ""]:
potential_val = node.get(f"{prefix}{key}")
if potential_val is not None:
if isinstance(potential_val, dict):
actual = potential_val.get("@value")
else:
actual = potential_val
break
if str(actual) != val:
return False
else:
# Match any content variant value
found = False
for k, v in node.items():
if k.startswith("dcv:") or k.startswith("dataid-cv:"):
actual_val = v
if isinstance(v, dict):
actual_val = v.get("@value")
if str(actual_val) == f:
found = True
break
if not found:
return False
return True


def _get_file_download_urls_from_artifact_jsonld(
json_str: str, filters: List[str] = None
) -> List[str]:
"""Parse the JSON-LD of a databus artifact version to extract download URLs.

Args:
json_str: JSON-LD string of the databus artifact version.
filters: Optional list of filters to apply to the files.

Returns:
List of all file download URLs in the artifact version.
List of matching file download URLs in the artifact version.
"""

databusIdUrl: List[str] = []
Expand All @@ -898,6 +963,9 @@ def _get_file_download_urls_from_artifact_jsonld(json_str: str) -> List[str]:
graph = json_dict.get("@graph", [])
for node in graph:
if node.get("@type") == "Part":
if not _matches_filters(node, filters):
continue

file_uri = node.get("file")
if not isinstance(file_uri, str):
continue
Expand All @@ -916,20 +984,22 @@ def _download_group(
convert_to: str = None,
convert_from: str = None,
validate_checksum: bool = False,
filters: List[str] = None,
) -> None:
"""Download files in a databus group.

Args:
uri: The full databus group URI.
localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory.
all_versions: If True, download all versions of each artifact in the group; otherwise, only download the latest version.
vault_token_file: Path to Vault refresh token file for protected downloads.
databus_key: Databus API key for protected downloads.
localDir: Local directory to download files to.
all_versions: If True, download all versions; otherwise, only latest.
vault_token_file: Path to Vault refresh token file.
databus_key: Databus API key.
auth_url: Keycloak token endpoint URL.
client_id: Client ID for token exchange.
convert_to: Target compression format for on-the-fly conversion.
convert_to: Target compression format.
convert_from: Optional source compression format filter.
validate_checksum: Whether to validate checksums after downloading.
validate_checksum: Whether to validate checksums.
filters: Optional list of filters to apply to each file.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
artifacts = _get_databus_artifacts_of_group(json_str)
Expand All @@ -946,6 +1016,7 @@ def _download_group(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)


Expand Down Expand Up @@ -1013,8 +1084,18 @@ def download(
validate_checksum: Whether to validate checksums after downloading.
"""
for databusURI in databusURIs:
# Support pipe-separated filters for version/artifact/group URIs
# Syntax: https://.../version|key1=val1|.format|..compression
filters = []
base_uri = databusURI
if databusURI.startswith("http://") or databusURI.startswith("https://"):
if "|" in databusURI:
parts = databusURI.split("|")
base_uri = parts[0]
filters = parts[1:]

host, account, group, artifact, version, file = (
get_databus_id_parts_from_file_url(databusURI)
get_databus_id_parts_from_file_url(base_uri)
)

# Determine endpoint per-URI if not explicitly provided
Expand Down Expand Up @@ -1064,9 +1145,9 @@ def download(
expected_checksum=expected,
)
elif version is not None:
print(f"Downloading version: {databusURI}")
print(f"Downloading version: {base_uri}")
_download_version(
databusURI,
base_uri,
localDir,
vault_token_file=token,
databus_key=databus_key,
Expand All @@ -1075,13 +1156,14 @@ def download(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)
elif artifact is not None:
print(
f"Downloading {'all' if all_versions else 'latest'} version(s) of artifact: {databusURI}"
f"Downloading {'all' if all_versions else 'latest'} version(s) of artifact: {base_uri}"
)
_download_artifact(
databusURI,
base_uri,
localDir,
all_versions=all_versions,
vault_token_file=token,
Expand All @@ -1091,13 +1173,14 @@ def download(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)
elif group is not None and group != "collections":
print(
f"Downloading group and all its artifacts and versions: {databusURI}"
f"Downloading group and all its artifacts and versions: {base_uri}"
)
_download_group(
databusURI,
base_uri,
localDir,
all_versions=all_versions,
vault_token_file=token,
Expand All @@ -1107,6 +1190,7 @@ def download(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)
elif account is not None:
print("accountId not supported yet") # TODO
Expand Down
51 changes: 50 additions & 1 deletion databusclient/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def app():
)
@click.option("--remote", help="rclone remote name (e.g., 'nextcloud')")
@click.option("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')")
@click.option(
"--dry-run", is_flag=True, help="Generate and print JSON-LD without deploying"
)
@click.argument("distributions", nargs=-1)
def deploy(
version_id,
Expand All @@ -73,6 +76,7 @@ def deploy(
webdav_url,
remote,
path,
dry_run,
distributions: List[str],
):
"""
Expand Down Expand Up @@ -105,6 +109,12 @@ def deploy(
license_url=license_url,
distributions=distributions,
)

if dry_run:
click.echo("[DRY-RUN] Generated DataID JSON-LD:")
click.echo(json.dumps(dataid, indent=2))
return

api_deploy.deploy(dataid=dataid, api_key=apikey)
return

Expand All @@ -113,6 +123,21 @@ def deploy(
click.echo(f"[MODE] Deploy from metadata file: {metadata_file}")
with open(metadata_file, "r") as f:
metadata = json.load(f)

if dry_run:
click.echo("[DRY-RUN] Would deploy from metadata file")
# We could still generate the full DataID here to show it
dataid = api_deploy.create_dataset(
version_id=version_id,
artifact_version_title=title,
artifact_version_abstract=abstract,
artifact_version_description=description,
license_url=license_url,
distributions=api_deploy._create_distributions_from_metadata(metadata),
)
click.echo(json.dumps(dataid, indent=2))
return

api_deploy.deploy_from_metadata(
metadata, version_id, title, abstract, description, license_url, apikey
)
Expand All @@ -134,7 +159,17 @@ def deploy(

click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud")
click.echo(f"→ Uploading to: {remote}:{path}")
metadata = webdav.upload_to_webdav(distributions, remote, path, webdav_url)
if dry_run:
click.echo("[DRY-RUN] Skipping WebDAV upload")
metadata = []
else:
metadata = webdav.upload_to_webdav(distributions, remote, path, webdav_url)

if dry_run:
click.echo("[DRY-RUN] Generated metadata (partial):")
click.echo(json.dumps(metadata, indent=2))
return
Comment on lines +162 to +171
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

--dry-run in WebDAV mode never builds the DataID preview.

This branch returns after printing [], so it skips the same dataset-construction path that real metadata deploys use. The result is that deploy --dry-run --webdav-url ... does not actually simulate the deploy or surface metadata/DataID errors until a real run.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@databusclient/cli.py` around lines 162 - 171, The current dry-run branch
returns before building the dataset/DataID preview so `--dry-run --webdav-url`
prints an empty list; fix by ensuring metadata is constructed even in dry-run
mode: extract or reuse the same metadata-generation logic used for real deploys
(the code that produces `metadata` from `distributions`, `remote`, `path`, and
`webdav_url`) and call it regardless of `dry_run`, but only skip the actual
upload side-effect when `dry_run` is true (i.e., call `webdav.upload_to_webdav`
or a new `build_metadata` helper in both cases or pass a dry_run flag to
`webdav.upload_to_webdav`), and remove the premature `return` so the
DataID/metadata preview is printed for dry runs.


api_deploy.deploy_from_metadata(
metadata, version_id, title, abstract, description, license_url, apikey
)
Expand Down Expand Up @@ -254,5 +289,19 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool)
)


@app.command()
@click.argument("shell", type=click.Choice(["bash", "zsh", "fish"]))
def completion(shell):
    """Print a shell completion script for the given shell.

    Delegates to Click's built-in completion support by re-invoking the
    CLI with the ``_DATABUSCLIENT_COMPLETE`` environment variable set to
    ``<shell>_source``.
    """
    import os

    # `shell` is constrained by click.Choice to bash/zsh/fish, so it is
    # safe to interpolate into the command line; one call replaces the
    # three previously duplicated branches.
    os.system(f"_DATABUSCLIENT_COMPLETE={shell}_source databusclient")


if __name__ == "__main__":
app()
Loading
Loading