diff --git a/.coveragerc b/.coveragerc index 04092257a..e78e7a931 100644 --- a/.coveragerc +++ b/.coveragerc @@ -9,6 +9,6 @@ omit = google/cloud/bigquery_v2/* # Legacy proto-based types. exclude_lines = # Re-enable the standard pragma - pragma: NO COVER + pragma: (no cover|NO COVER) # Ignore debug-only repr def __repr__ diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml deleted file mode 100644 index 6d064ddb9..000000000 --- a/.github/.OwlBot.lock.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-docker: - image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:94bb690db96e6242b2567a4860a94d48fa48696d092e51b0884a1a2c0a79a407 -# created: 2024-07-31T14:52:44.926548819Z diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6763f258c..c7478150e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -4,8 +4,8 @@ # For syntax help see: # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-syntax -# The @googleapis/api-bigquery is the default owner for changes in this repo -* @googleapis/api-bigquery @googleapis/yoshi-python +# The @googleapis/python-core-client-libraries is the default owner for changes in this repo +* @googleapis/python-core-client-libraries @googleapis/yoshi-python # The python-samples-reviewers team is the default owner for samples changes -/samples/ @googleapis/api-bigquery @googleapis/python-samples-owners @googleapis/yoshi-python +/samples/ @googleapis/python-core-client-libraries @googleapis/python-samples-owners @googleapis/yoshi-python diff --git a/.github/auto-approve.yml b/.github/auto-approve.yml deleted file mode 100644 index 311ebbb85..000000000 --- a/.github/auto-approve.yml +++ /dev/null @@ -1,3 +0,0 @@ -# https://github.com/googleapis/repo-automation-bots/tree/main/packages/auto-approve -processes: - - "OwlBotTemplateChanges" diff --git a/.github/blunderbuss.yml b/.github/blunderbuss.yml index 5b7383dc7..ff168399d 100644 --- a/.github/blunderbuss.yml +++ b/.github/blunderbuss.yml @@ -4,14 +4,14 @@ # Note: This file is autogenerated. To make changes to the assignee # team, please update `codeowner_team` in `.repo-metadata.json`. 
assign_issues: - - googleapis/api-bigquery + - googleapis/python-core-client-libraries assign_issues_by: - labels: - "samples" to: - googleapis/python-samples-reviewers - - googleapis/api-bigquery + - googleapis/python-core-client-libraries assign_prs: - - googleapis/api-bigquery + - googleapis/python-core-client-libraries diff --git a/.github/release-please.yml b/.github/release-please.yml deleted file mode 100644 index 5161ab347..000000000 --- a/.github/release-please.yml +++ /dev/null @@ -1,14 +0,0 @@ -releaseType: python -handleGHRelease: true -# NOTE: this section is generated by synthtool.languages.python -# See https://github.com/googleapis/synthtool/blob/master/synthtool/languages/python.py -branches: -- branch: v2 - handleGHRelease: true - releaseType: python -- branch: v1 - handleGHRelease: true - releaseType: python -- branch: v0 - handleGHRelease: true - releaseType: python diff --git a/.github/release-trigger.yml b/.github/release-trigger.yml deleted file mode 100644 index d4ca94189..000000000 --- a/.github/release-trigger.yml +++ /dev/null @@ -1 +0,0 @@ -enabled: true diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml deleted file mode 100644 index 6543d5285..000000000 --- a/.github/sync-repo-settings.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# https://github.com/googleapis/repo-automation-bots/tree/main/packages/sync-repo-settings -mergeCommitAllowed: false -# Rules for main branch protection -branchProtectionRules: -# Identifies the protection rule pattern. Name of the branch to be protected. 
-# Defaults to `main` -- pattern: main - requiresLinearHistory: true - requiresCodeOwnerReviews: true - requiresStrictStatusChecks: true - requiredStatusCheckContexts: - - 'Kokoro' - - 'Kokoro snippets-3.8' - - 'Kokoro snippets-3.12' - - 'Kokoro system-3.8' - - 'Kokoro system-3.12' - - 'cla/google' - - 'Samples - Lint' - - 'Samples - Python 3.7' - - 'Samples - Python 3.8' - - 'Samples - Python 3.9' - - 'Samples - Python 3.10' - - 'Samples - Python 3.11' - - 'Samples - Python 3.12' -- pattern: v2 - requiresLinearHistory: true - requiresCodeOwnerReviews: true - requiresStrictStatusChecks: true - requiredStatusCheckContexts: - - 'Kokoro' - - 'Kokoro snippets-3.8' - - 'cla/google' - - 'Samples - Lint' - - 'Samples - Python 3.7' - - 'Samples - Python 3.8' diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..9372faac2 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,39 @@ +on: + pull_request: + branches: + - main +name: docs +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run docs session + run: | + nox -s docs-3.10 + + docfx: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run docfx session + run: | + nox -s docfx-3.10 diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml new file mode 100644 index 000000000..550724076 --- /dev/null +++ b/.github/workflows/unittest.yml @@ -0,0 +1,88 @@ +on: + pull_request: + branches: + - main +name: unittest +jobs: + unit: + runs-on: ubuntu-latest + 
strategy: + matrix: + python: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14'] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run unit tests + env: + COVERAGE_FILE: .coverage-${{ matrix.python }} + run: | + nox -s unit-${{ matrix.python }} + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-artifact-${{ matrix.python }} + path: .coverage-${{ matrix.python }} + include-hidden-files: true + + unit_noextras: + # Use `ubuntu-latest` runner. + runs-on: ubuntu-latest + strategy: + matrix: + python: ['3.9', '3.14'] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run unit_noextras tests + env: + COVERAGE_FILE: .coverage-unit-noextras-${{ matrix.python }} + run: | + nox -s unit_noextras-${{ matrix.python }} + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-artifact-unit-noextras-${{ matrix.python }} + path: .coverage-unit-noextras-${{ matrix.python }} + include-hidden-files: true + + cover: + runs-on: ubuntu-latest + needs: + - unit + - unit_noextras + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + - name: Install coverage + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install coverage + - name: Download coverage results + uses: actions/download-artifact@v4 + with: + path: .coverage-results/ + - name: Report coverage results + run: | + find .coverage-results -type f -name '*.zip' -exec unzip {} \; + 
coverage combine .coverage-results/**/.coverage* + coverage report --show-missing --fail-under=100 diff --git a/.kokoro/build.sh b/.kokoro/build.sh index e4da2e2a7..d41b45aa1 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -15,11 +15,13 @@ set -eo pipefail +CURRENT_DIR=$(dirname "${BASH_SOURCE[0]}") + if [[ -z "${PROJECT_ROOT:-}" ]]; then - PROJECT_ROOT="github/python-bigquery" + PROJECT_ROOT=$(realpath "${CURRENT_DIR}/..") fi -cd "${PROJECT_ROOT}" +pushd "${PROJECT_ROOT}" # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 @@ -28,10 +30,16 @@ export PYTHONUNBUFFERED=1 env | grep KOKORO # Setup service account credentials. -export GOOGLE_APPLICATION_CREDENTIALS=${KOKORO_GFILE_DIR}/service-account.json +if [[ -f "${KOKORO_GFILE_DIR}/service-account.json" ]] +then + export GOOGLE_APPLICATION_CREDENTIALS=${KOKORO_GFILE_DIR}/service-account.json +fi # Setup project id. -export PROJECT_ID=$(cat "${KOKORO_GFILE_DIR}/project-id.json") +if [[ -f "${KOKORO_GFILE_DIR}/project-id.json" ]] +then + export PROJECT_ID=$(cat "${KOKORO_GFILE_DIR}/project-id.json") +fi # If this is a continuous build, send the test log to the FlakyBot. # See https://github.com/googleapis/repo-automation-bots/tree/main/packages/flakybot. @@ -46,7 +54,7 @@ fi # If NOX_SESSION is set, it only runs the specified session, # otherwise run all the sessions. if [[ -n "${NOX_SESSION:-}" ]]; then - python3 -m nox -s ${NOX_SESSION:-} + python3 -m nox -s ${NOX_SESSION:-} else - python3 -m nox + python3 -m nox fi diff --git a/.kokoro/continuous/prerelease-deps-3.12.cfg b/.kokoro/continuous/prerelease-deps-3.13.cfg similarity index 77% rename from .kokoro/continuous/prerelease-deps-3.12.cfg rename to .kokoro/continuous/prerelease-deps-3.13.cfg index ece962a17..99a1e7150 100644 --- a/.kokoro/continuous/prerelease-deps-3.12.cfg +++ b/.kokoro/continuous/prerelease-deps-3.13.cfg @@ -3,5 +3,5 @@ # Only run this nox session. 
env_vars: { key: "NOX_SESSION" - value: "prerelease_deps-3.12" + value: "prerelease_deps-3.13" } diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile deleted file mode 100644 index e5410e296..000000000 --- a/.kokoro/docker/docs/Dockerfile +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ubuntu:24.04 - -ENV DEBIAN_FRONTEND noninteractive - -# Ensure local Python is preferred over distribution Python. -ENV PATH /usr/local/bin:$PATH - -# Install dependencies. 
-RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - apt-transport-https \ - build-essential \ - ca-certificates \ - curl \ - dirmngr \ - git \ - gpg-agent \ - graphviz \ - libbz2-dev \ - libdb5.3-dev \ - libexpat1-dev \ - libffi-dev \ - liblzma-dev \ - libreadline-dev \ - libsnappy-dev \ - libssl-dev \ - libsqlite3-dev \ - portaudio19-dev \ - redis-server \ - software-properties-common \ - ssh \ - sudo \ - tcl \ - tcl-dev \ - tk \ - tk-dev \ - uuid-dev \ - wget \ - zlib1g-dev \ - && add-apt-repository universe \ - && apt-get update \ - && apt-get -y install jq \ - && apt-get clean autoclean \ - && apt-get autoremove -y \ - && rm -rf /var/lib/apt/lists/* \ - && rm -f /var/cache/apt/archives/*.deb - - -###################### Install python 3.10.14 for docs/docfx session - -# Download python 3.10.14 -RUN wget https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz - -# Extract files -RUN tar -xvf Python-3.10.14.tgz - -# Install python 3.10.14 -RUN ./Python-3.10.14/configure --enable-optimizations -RUN make altinstall - -ENV PATH /usr/local/bin/python3.10:$PATH - -###################### Install pip -RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \ - && python3.10 /tmp/get-pip.py \ - && rm /tmp/get-pip.py - -# Test pip -RUN python3.10 -m pip - -# Install build requirements -COPY requirements.txt /requirements.txt -RUN python3.10 -m pip install --require-hashes -r requirements.txt - -CMD ["python3.10"] diff --git a/.kokoro/docker/docs/fetch_gpg_keys.sh b/.kokoro/docker/docs/fetch_gpg_keys.sh deleted file mode 100755 index d653dd868..000000000 --- a/.kokoro/docker/docs/fetch_gpg_keys.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# A script to fetch gpg keys with retry. -# Avoid jinja parsing the file. -# - -function retry { - if [[ "${#}" -le 1 ]]; then - echo "Usage: ${0} retry_count commands.." - exit 1 - fi - local retries=${1} - local command="${@:2}" - until [[ "${retries}" -le 0 ]]; do - $command && return 0 - if [[ $? -ne 0 ]]; then - echo "command failed, retrying" - ((retries--)) - fi - done - return 1 -} - -# 3.6.9, 3.7.5 (Ned Deily) -retry 3 gpg --keyserver ha.pool.sks-keyservers.net --recv-keys \ - 0D96DF4D4110E5C43FBFB17F2D347EA6AA65421D - -# 3.8.0 (Łukasz Langa) -retry 3 gpg --keyserver ha.pool.sks-keyservers.net --recv-keys \ - E3FF2839C048B25C084DEBE9B26995E310250568 - -# diff --git a/.kokoro/docker/docs/requirements.in b/.kokoro/docker/docs/requirements.in deleted file mode 100644 index 816817c67..000000000 --- a/.kokoro/docker/docs/requirements.in +++ /dev/null @@ -1 +0,0 @@ -nox diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt deleted file mode 100644 index 7129c7715..000000000 --- a/.kokoro/docker/docs/requirements.txt +++ /dev/null @@ -1,42 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --allow-unsafe --generate-hashes requirements.in -# -argcomplete==3.4.0 \ - --hash=sha256:69a79e083a716173e5532e0fa3bef45f793f4e61096cf52b5a42c0211c8b8aa5 \ - --hash=sha256:c2abcdfe1be8ace47ba777d4fce319eb13bf8ad9dace8d085dcad6eded88057f - # via nox -colorlog==6.8.2 \ - --hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ - 
--hash=sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33 - # via nox -distlib==0.3.8 \ - --hash=sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 \ - --hash=sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64 - # via virtualenv -filelock==3.15.4 \ - --hash=sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb \ - --hash=sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 - # via virtualenv -nox==2024.4.15 \ - --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ - --hash=sha256:ecf6700199cdfa9e5ea0a41ff5e6ef4641d09508eda6edb89d9987864115817f - # via -r requirements.in -packaging==24.1 \ - --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ - --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 - # via nox -platformdirs==4.2.2 \ - --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ - --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 - # via virtualenv -tomli==2.0.1 \ - --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ - --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f - # via nox -virtualenv==20.26.3 \ - --hash=sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a \ - --hash=sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 - # via nox diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg deleted file mode 100644 index 41b86fc29..000000000 --- a/.kokoro/docs/common.cfg +++ /dev/null @@ -1,66 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. 
-build_file: "python-bigquery/.kokoro/trampoline_v2.sh" - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-lib-docs" -} -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery/.kokoro/publish-docs.sh" -} - -env_vars: { - key: "STAGING_BUCKET" - value: "docs-staging" -} - -env_vars: { - key: "V2_STAGING_BUCKET" - # Push google cloud library docs to the Cloud RAD bucket `docs-staging-v2` - value: "docs-staging-v2" -} - -# It will upload the docker image after successful builds. -env_vars: { - key: "TRAMPOLINE_IMAGE_UPLOAD" - value: "true" -} - -# It will always build the docker image. -env_vars: { - key: "TRAMPOLINE_DOCKERFILE" - value: ".kokoro/docker/docs/Dockerfile" -} - -# Fetch the token needed for reporting release status to GitHub -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "yoshi-automation-github-key" - } - } -} - -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "docuploader_service_account" - } - } -} \ No newline at end of file diff --git a/.kokoro/docs/docs-presubmit.cfg b/.kokoro/docs/docs-presubmit.cfg deleted file mode 100644 index 08adb2e28..000000000 --- a/.kokoro/docs/docs-presubmit.cfg +++ /dev/null @@ -1,28 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "STAGING_BUCKET" - value: "gcloud-python-test" -} - -env_vars: { - key: "V2_STAGING_BUCKET" - value: "gcloud-python-test" -} - -# We only upload the image in the main `docs` build. -env_vars: { - key: "TRAMPOLINE_IMAGE_UPLOAD" - value: "false" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery/.kokoro/build.sh" -} - -# Only run this nox session. 
-env_vars: { - key: "NOX_SESSION" - value: "docs docfx" -} diff --git a/.kokoro/docs/docs.cfg b/.kokoro/docs/docs.cfg deleted file mode 100644 index 8f43917d9..000000000 --- a/.kokoro/docs/docs.cfg +++ /dev/null @@ -1 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto \ No newline at end of file diff --git a/.kokoro/presubmit/presubmit.cfg b/.kokoro/presubmit/presubmit.cfg deleted file mode 100644 index ac4cc5847..000000000 --- a/.kokoro/presubmit/presubmit.cfg +++ /dev/null @@ -1,7 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Disable system tests. -env_vars: { - key: "NOX_SESSION" - value: "unit_noextras unit cover docs docfx" -} diff --git a/.kokoro/presubmit/snippets-3.12.cfg b/.kokoro/presubmit/snippets-3.13.cfg similarity index 81% rename from .kokoro/presubmit/snippets-3.12.cfg rename to .kokoro/presubmit/snippets-3.13.cfg index 1381e8323..0b89f0863 100644 --- a/.kokoro/presubmit/snippets-3.12.cfg +++ b/.kokoro/presubmit/snippets-3.13.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "snippets-3.12" + value: "snippets-3.13" } diff --git a/.kokoro/presubmit/system-3.12.cfg b/.kokoro/presubmit/snippets-3.9.cfg similarity index 82% rename from .kokoro/presubmit/system-3.12.cfg rename to .kokoro/presubmit/snippets-3.9.cfg index 789455bd6..d1de209a2 100644 --- a/.kokoro/presubmit/system-3.12.cfg +++ b/.kokoro/presubmit/snippets-3.9.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system-3.12" + value: "snippets-3.9" } diff --git a/.kokoro/presubmit/snippets-3.8.cfg b/.kokoro/presubmit/system-3.13.cfg similarity index 80% rename from .kokoro/presubmit/snippets-3.8.cfg rename to .kokoro/presubmit/system-3.13.cfg index 840d9e716..a0e9a0108 100644 --- a/.kokoro/presubmit/snippets-3.8.cfg +++ b/.kokoro/presubmit/system-3.13.cfg @@ -3,5 +3,5 @@ # Only run this nox session. 
env_vars: { key: "NOX_SESSION" - value: "snippets-3.8" -} + value: "system-3.13" +} \ No newline at end of file diff --git a/.kokoro/presubmit/system-3.8.cfg b/.kokoro/presubmit/system-3.9.cfg similarity index 83% rename from .kokoro/presubmit/system-3.8.cfg rename to .kokoro/presubmit/system-3.9.cfg index f4bcee3db..b8ae66b37 100644 --- a/.kokoro/presubmit/system-3.8.cfg +++ b/.kokoro/presubmit/system-3.9.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system-3.8" + value: "system-3.9" } \ No newline at end of file diff --git a/.kokoro/publish-docs.sh b/.kokoro/publish-docs.sh deleted file mode 100755 index 233205d58..000000000 --- a/.kokoro/publish-docs.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eo pipefail - -# Disable buffering, so that the logs stream through. 
-export PYTHONUNBUFFERED=1 - -export PATH="${HOME}/.local/bin:${PATH}" - -# Install nox -python3.10 -m pip install --require-hashes -r .kokoro/requirements.txt -python3.10 -m nox --version - -# build docs -nox -s docs - -# create metadata -python3.10 -m docuploader create-metadata \ - --name=$(jq --raw-output '.name // empty' .repo-metadata.json) \ - --version=$(python3.10 setup.py --version) \ - --language=$(jq --raw-output '.language // empty' .repo-metadata.json) \ - --distribution-name=$(python3.10 setup.py --name) \ - --product-page=$(jq --raw-output '.product_documentation // empty' .repo-metadata.json) \ - --github-repository=$(jq --raw-output '.repo // empty' .repo-metadata.json) \ - --issue-tracker=$(jq --raw-output '.issue_tracker // empty' .repo-metadata.json) - -cat docs.metadata - -# upload docs -python3.10 -m docuploader upload docs/_build/html --metadata-file docs.metadata --staging-bucket "${STAGING_BUCKET}" - - -# docfx yaml files -nox -s docfx - -# create metadata. -python3.10 -m docuploader create-metadata \ - --name=$(jq --raw-output '.name // empty' .repo-metadata.json) \ - --version=$(python3.10 setup.py --version) \ - --language=$(jq --raw-output '.language // empty' .repo-metadata.json) \ - --distribution-name=$(python3.10 setup.py --name) \ - --product-page=$(jq --raw-output '.product_documentation // empty' .repo-metadata.json) \ - --github-repository=$(jq --raw-output '.repo // empty' .repo-metadata.json) \ - --issue-tracker=$(jq --raw-output '.issue_tracker // empty' .repo-metadata.json) - -cat docs.metadata - -# upload docs -python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}" diff --git a/.kokoro/release.sh b/.kokoro/release.sh deleted file mode 100755 index 81cee716e..000000000 --- a/.kokoro/release.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eo pipefail - -# Start the releasetool reporter -python3 -m pip install --require-hashes -r github/python-bigquery/.kokoro/requirements.txt -python3 -m releasetool publish-reporter-script > /tmp/publisher-script; source /tmp/publisher-script - -# Disable buffering, so that the logs stream through. -export PYTHONUNBUFFERED=1 - -# Move into the package, build the distribution and upload. -TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-1") -cd github/python-bigquery -python3 setup.py sdist bdist_wheel -twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/* diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg deleted file mode 100644 index cb8bbaa2e..000000000 --- a/.kokoro/release/common.cfg +++ /dev/null @@ -1,49 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. -build_file: "python-bigquery/.kokoro/trampoline.sh" - -# Configure the docker image for kokoro-trampoline. 
-env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-multi" -} -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery/.kokoro/release.sh" -} - -# Fetch PyPI password -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "google-cloud-pypi-token-keystore-1" - } - } -} - -# Tokens needed to report release status back to GitHub -env_vars: { - key: "SECRET_MANAGER_KEYS" - value: "releasetool-publish-reporter-app,releasetool-publish-reporter-googleapis-installation,releasetool-publish-reporter-pem" -} - -# Store the packages we uploaded to PyPI. That way, we have a record of exactly -# what we published, which we can use to generate SBOMs and attestations. -action { - define_artifacts { - regex: "github/python-bigquery/**/*.tar.gz" - strip_prefix: "github/python-bigquery" - } -} diff --git a/.kokoro/release/release.cfg b/.kokoro/release/release.cfg deleted file mode 100644 index 8f43917d9..000000000 --- a/.kokoro/release/release.cfg +++ /dev/null @@ -1 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto \ No newline at end of file diff --git a/.kokoro/requirements.in b/.kokoro/requirements.in deleted file mode 100644 index fff4d9ce0..000000000 --- a/.kokoro/requirements.in +++ /dev/null @@ -1,11 +0,0 @@ -gcp-docuploader -gcp-releasetool>=2 # required for compatibility with cryptography>=42.x -importlib-metadata -typing-extensions -twine -wheel -setuptools -nox>=2022.11.21 # required to remove dependency on py -charset-normalizer<3 -click<8.1.0 -cryptography>=42.0.5 diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt deleted file mode 100644 index 9622baf0b..000000000 --- a/.kokoro/requirements.txt +++ /dev/null @@ -1,537 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --allow-unsafe --generate-hashes requirements.in -# -argcomplete==3.4.0 \ - 
--hash=sha256:69a79e083a716173e5532e0fa3bef45f793f4e61096cf52b5a42c0211c8b8aa5 \ - --hash=sha256:c2abcdfe1be8ace47ba777d4fce319eb13bf8ad9dace8d085dcad6eded88057f - # via nox -attrs==23.2.0 \ - --hash=sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30 \ - --hash=sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1 - # via gcp-releasetool -backports-tarfile==1.2.0 \ - --hash=sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34 \ - --hash=sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991 - # via jaraco-context -cachetools==5.3.3 \ - --hash=sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945 \ - --hash=sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105 - # via google-auth -certifi==2024.7.4 \ - --hash=sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b \ - --hash=sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 - # via requests -cffi==1.16.0 \ - --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ - --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ - --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ - --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ - --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ - --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ - --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ - --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ - --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ - --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ - --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ - 
--hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ - --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ - --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ - --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ - --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ - --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ - --hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ - --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ - --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ - --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ - --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ - --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ - --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ - --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ - --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ - --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ - --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ - --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ - --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ - --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ - --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ - --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ - --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ - --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ - 
--hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ - --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ - --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ - --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ - --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ - --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ - --hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ - --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ - --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ - --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ - --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ - --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ - --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ - --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ - --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ - --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ - --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 - # via cryptography -charset-normalizer==2.1.1 \ - --hash=sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845 \ - --hash=sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f - # via - # -r requirements.in - # requests -click==8.0.4 \ - --hash=sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1 \ - --hash=sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb - # via - # -r requirements.in - # gcp-docuploader - # gcp-releasetool -colorlog==6.8.2 \ - 
--hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ - --hash=sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33 - # via - # gcp-docuploader - # nox -cryptography==42.0.8 \ - --hash=sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad \ - --hash=sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583 \ - --hash=sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b \ - --hash=sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c \ - --hash=sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1 \ - --hash=sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648 \ - --hash=sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949 \ - --hash=sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba \ - --hash=sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c \ - --hash=sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9 \ - --hash=sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d \ - --hash=sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c \ - --hash=sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e \ - --hash=sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2 \ - --hash=sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d \ - --hash=sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7 \ - --hash=sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70 \ - --hash=sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2 \ - --hash=sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7 \ - --hash=sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14 \ - --hash=sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe \ - 
--hash=sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e \ - --hash=sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71 \ - --hash=sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961 \ - --hash=sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7 \ - --hash=sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c \ - --hash=sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28 \ - --hash=sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842 \ - --hash=sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902 \ - --hash=sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801 \ - --hash=sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a \ - --hash=sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e - # via - # -r requirements.in - # gcp-releasetool - # secretstorage -distlib==0.3.8 \ - --hash=sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 \ - --hash=sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64 - # via virtualenv -docutils==0.21.2 \ - --hash=sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f \ - --hash=sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 - # via readme-renderer -filelock==3.15.4 \ - --hash=sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb \ - --hash=sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 - # via virtualenv -gcp-docuploader==0.6.5 \ - --hash=sha256:30221d4ac3e5a2b9c69aa52fdbef68cc3f27d0e6d0d90e220fc024584b8d2318 \ - --hash=sha256:b7458ef93f605b9d46a4bf3a8dc1755dad1f31d030c8679edf304e343b347eea - # via -r requirements.in -gcp-releasetool==2.0.1 \ - --hash=sha256:34314a910c08e8911d9c965bd44f8f2185c4f556e737d719c33a41f6a610de96 \ - --hash=sha256:b0d5863c6a070702b10883d37c4bdfd74bf930fe417f36c0c965d3b7c779ae62 - 
# via -r requirements.in -google-api-core==2.19.1 \ - --hash=sha256:f12a9b8309b5e21d92483bbd47ce2c445861ec7d269ef6784ecc0ea8c1fa6125 \ - --hash=sha256:f4695f1e3650b316a795108a76a1c416e6afb036199d1c1f1f110916df479ffd - # via - # google-cloud-core - # google-cloud-storage -google-auth==2.31.0 \ - --hash=sha256:042c4702efa9f7d3c48d3a69341c209381b125faa6dbf3ebe56bc7e40ae05c23 \ - --hash=sha256:87805c36970047247c8afe614d4e3af8eceafc1ebba0c679fe75ddd1d575e871 - # via - # gcp-releasetool - # google-api-core - # google-cloud-core - # google-cloud-storage -google-cloud-core==2.4.1 \ - --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ - --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 - # via google-cloud-storage -google-cloud-storage==2.17.0 \ - --hash=sha256:49378abff54ef656b52dca5ef0f2eba9aa83dc2b2c72c78714b03a1a95fe9388 \ - --hash=sha256:5b393bc766b7a3bc6f5407b9e665b2450d36282614b7945e570b3480a456d1e1 - # via gcp-docuploader -google-crc32c==1.5.0 \ - --hash=sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a \ - --hash=sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876 \ - --hash=sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c \ - --hash=sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289 \ - --hash=sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298 \ - --hash=sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02 \ - --hash=sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f \ - --hash=sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2 \ - --hash=sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a \ - --hash=sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb \ - --hash=sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210 \ - 
--hash=sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5 \ - --hash=sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee \ - --hash=sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c \ - --hash=sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a \ - --hash=sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314 \ - --hash=sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd \ - --hash=sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65 \ - --hash=sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37 \ - --hash=sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4 \ - --hash=sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13 \ - --hash=sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894 \ - --hash=sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31 \ - --hash=sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e \ - --hash=sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709 \ - --hash=sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740 \ - --hash=sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc \ - --hash=sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d \ - --hash=sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c \ - --hash=sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c \ - --hash=sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d \ - --hash=sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906 \ - --hash=sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61 \ - --hash=sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57 \ - --hash=sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c \ - 
--hash=sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a \ - --hash=sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438 \ - --hash=sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946 \ - --hash=sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7 \ - --hash=sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96 \ - --hash=sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091 \ - --hash=sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae \ - --hash=sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d \ - --hash=sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88 \ - --hash=sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2 \ - --hash=sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd \ - --hash=sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541 \ - --hash=sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728 \ - --hash=sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178 \ - --hash=sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968 \ - --hash=sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346 \ - --hash=sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8 \ - --hash=sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93 \ - --hash=sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7 \ - --hash=sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273 \ - --hash=sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462 \ - --hash=sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94 \ - --hash=sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd \ - --hash=sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e \ - 
--hash=sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57 \ - --hash=sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b \ - --hash=sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9 \ - --hash=sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a \ - --hash=sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100 \ - --hash=sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325 \ - --hash=sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183 \ - --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ - --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.1 \ - --hash=sha256:103ebc4ba331ab1bfdac0250f8033627a2cd7cde09e7ccff9181e31ba4315b2c \ - --hash=sha256:eae451a7b2e2cdbaaa0fd2eb00cc8a1ee5e95e16b55597359cbc3d27d7d90e33 - # via google-cloud-storage -googleapis-common-protos==1.63.2 \ - --hash=sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945 \ - --hash=sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87 - # via google-api-core -idna==3.7 \ - --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ - --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 - # via requests -importlib-metadata==8.0.0 \ - --hash=sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f \ - --hash=sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812 - # via - # -r requirements.in - # keyring - # twine -jaraco-classes==3.4.0 \ - --hash=sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd \ - --hash=sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790 - # via keyring -jaraco-context==5.3.0 \ - 
--hash=sha256:3e16388f7da43d384a1a7cd3452e72e14732ac9fe459678773a3608a812bf266 \ - --hash=sha256:c2f67165ce1f9be20f32f650f25d8edfc1646a8aeee48ae06fb35f90763576d2 - # via keyring -jaraco-functools==4.0.1 \ - --hash=sha256:3b24ccb921d6b593bdceb56ce14799204f473976e2a9d4b15b04d0f2c2326664 \ - --hash=sha256:d33fa765374c0611b52f8b3a795f8900869aa88c84769d4d1746cd68fb28c3e8 - # via keyring -jeepney==0.8.0 \ - --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \ - --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 - # via - # keyring - # secretstorage -jinja2==3.1.4 \ - --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \ - --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d - # via gcp-releasetool -keyring==25.2.1 \ - --hash=sha256:2458681cdefc0dbc0b7eb6cf75d0b98e59f9ad9b2d4edd319d18f68bdca95e50 \ - --hash=sha256:daaffd42dbda25ddafb1ad5fec4024e5bbcfe424597ca1ca452b299861e49f1b - # via - # gcp-releasetool - # twine -markdown-it-py==3.0.0 \ - --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ - --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb - # via rich -markupsafe==2.1.5 \ - --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ - --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ - --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \ - --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \ - --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \ - --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \ - --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \ - --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \ - --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 
\ - --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \ - --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \ - --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \ - --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \ - --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \ - --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \ - --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \ - --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \ - --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \ - --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \ - --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \ - --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \ - --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \ - --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \ - --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \ - --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \ - --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \ - --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \ - --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \ - --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \ - --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \ - --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \ - --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \ - --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \ - 
--hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \ - --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \ - --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \ - --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \ - --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \ - --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \ - --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \ - --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \ - --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \ - --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \ - --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \ - --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \ - --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \ - --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \ - --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \ - --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \ - --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \ - --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \ - --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \ - --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \ - --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \ - --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \ - --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \ - --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \ - 
--hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \ - --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \ - --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68 - # via jinja2 -mdurl==0.1.2 \ - --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ - --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba - # via markdown-it-py -more-itertools==10.3.0 \ - --hash=sha256:e5d93ef411224fbcef366a6e8ddc4c5781bc6359d43412a65dd5964e46111463 \ - --hash=sha256:ea6a02e24a9161e51faad17a8782b92a0df82c12c1c8886fec7f0c3fa1a1b320 - # via - # jaraco-classes - # jaraco-functools -nh3==0.2.18 \ - --hash=sha256:0411beb0589eacb6734f28d5497ca2ed379eafab8ad8c84b31bb5c34072b7164 \ - --hash=sha256:14c5a72e9fe82aea5fe3072116ad4661af5cf8e8ff8fc5ad3450f123e4925e86 \ - --hash=sha256:19aaba96e0f795bd0a6c56291495ff59364f4300d4a39b29a0abc9cb3774a84b \ - --hash=sha256:34c03fa78e328c691f982b7c03d4423bdfd7da69cd707fe572f544cf74ac23ad \ - --hash=sha256:36c95d4b70530b320b365659bb5034341316e6a9b30f0b25fa9c9eff4c27a204 \ - --hash=sha256:3a157ab149e591bb638a55c8c6bcb8cdb559c8b12c13a8affaba6cedfe51713a \ - --hash=sha256:42c64511469005058cd17cc1537578eac40ae9f7200bedcfd1fc1a05f4f8c200 \ - --hash=sha256:5f36b271dae35c465ef5e9090e1fdaba4a60a56f0bb0ba03e0932a66f28b9189 \ - --hash=sha256:6955369e4d9f48f41e3f238a9e60f9410645db7e07435e62c6a9ea6135a4907f \ - --hash=sha256:7b7c2a3c9eb1a827d42539aa64091640bd275b81e097cd1d8d82ef91ffa2e811 \ - --hash=sha256:8ce0f819d2f1933953fca255db2471ad58184a60508f03e6285e5114b6254844 \ - --hash=sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4 \ - --hash=sha256:a7f1b5b2c15866f2db413a3649a8fe4fd7b428ae58be2c0f6bca5eefd53ca2be \ - --hash=sha256:c8b3a1cebcba9b3669ed1a84cc65bf005728d2f0bc1ed2a6594a992e817f3a50 \ - --hash=sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307 \ - 
--hash=sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe - # via readme-renderer -nox==2024.4.15 \ - --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ - --hash=sha256:ecf6700199cdfa9e5ea0a41ff5e6ef4641d09508eda6edb89d9987864115817f - # via -r requirements.in -packaging==24.1 \ - --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ - --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 - # via - # gcp-releasetool - # nox -pkginfo==1.10.0 \ - --hash=sha256:5df73835398d10db79f8eecd5cd86b1f6d29317589ea70796994d49399af6297 \ - --hash=sha256:889a6da2ed7ffc58ab5b900d888ddce90bce912f2d2de1dc1c26f4cb9fe65097 - # via twine -platformdirs==4.2.2 \ - --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ - --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 - # via virtualenv -proto-plus==1.24.0 \ - --hash=sha256:30b72a5ecafe4406b0d339db35b56c4059064e69227b8c3bda7462397f966445 \ - --hash=sha256:402576830425e5f6ce4c2a6702400ac79897dab0b4343821aa5188b0fab81a12 - # via google-api-core -protobuf==5.27.2 \ - --hash=sha256:0e341109c609749d501986b835f667c6e1e24531096cff9d34ae411595e26505 \ - --hash=sha256:176c12b1f1c880bf7a76d9f7c75822b6a2bc3db2d28baa4d300e8ce4cde7409b \ - --hash=sha256:354d84fac2b0d76062e9b3221f4abbbacdfd2a4d8af36bab0474f3a0bb30ab38 \ - --hash=sha256:4fadd8d83e1992eed0248bc50a4a6361dc31bcccc84388c54c86e530b7f58863 \ - --hash=sha256:54330f07e4949d09614707c48b06d1a22f8ffb5763c159efd5c0928326a91470 \ - --hash=sha256:610e700f02469c4a997e58e328cac6f305f649826853813177e6290416e846c6 \ - --hash=sha256:7fc3add9e6003e026da5fc9e59b131b8f22b428b991ccd53e2af8071687b4fce \ - --hash=sha256:9e8f199bf7f97bd7ecebffcae45ebf9527603549b2b562df0fbc6d4d688f14ca \ - --hash=sha256:a109916aaac42bff84702fb5187f3edadbc7c97fc2c99c5ff81dd15dcce0d1e5 \ - --hash=sha256:b848dbe1d57ed7c191dfc4ea64b8b004a3f9ece4bf4d0d80a367b76df20bf36e \ - 
--hash=sha256:f3ecdef226b9af856075f28227ff2c90ce3a594d092c39bee5513573f25e2714 - # via - # gcp-docuploader - # gcp-releasetool - # google-api-core - # googleapis-common-protos - # proto-plus -pyasn1==0.6.0 \ - --hash=sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c \ - --hash=sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.0 \ - --hash=sha256:831dbcea1b177b28c9baddf4c6d1013c24c3accd14a1873fffaa6a2e905f17b6 \ - --hash=sha256:be04f15b66c206eed667e0bb5ab27e2b1855ea54a842e5037738099e8ca4ae0b - # via google-auth -pycparser==2.22 \ - --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \ - --hash=sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc - # via cffi -pygments==2.18.0 \ - --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ - --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a - # via - # readme-renderer - # rich -pyjwt==2.8.0 \ - --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ - --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 - # via gcp-releasetool -pyperclip==1.9.0 \ - --hash=sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310 - # via gcp-releasetool -python-dateutil==2.9.0.post0 \ - --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ - --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 - # via gcp-releasetool -readme-renderer==44.0 \ - --hash=sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151 \ - --hash=sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1 - # via twine -requests==2.32.3 \ - --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ - --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 - # via - # gcp-releasetool - 
# google-api-core - # google-cloud-storage - # requests-toolbelt - # twine -requests-toolbelt==1.0.0 \ - --hash=sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6 \ - --hash=sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 - # via twine -rfc3986==2.0.0 \ - --hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ - --hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c - # via twine -rich==13.7.1 \ - --hash=sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222 \ - --hash=sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432 - # via twine -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth -secretstorage==3.3.3 \ - --hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \ - --hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99 - # via keyring -six==1.16.0 \ - --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ - --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 - # via - # gcp-docuploader - # python-dateutil -tomli==2.0.1 \ - --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ - --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f - # via nox -twine==5.1.1 \ - --hash=sha256:215dbe7b4b94c2c50a7315c0275d2258399280fbb7d04182c7e55e24b5f93997 \ - --hash=sha256:9aa0825139c02b3434d913545c7b847a21c835e11597f5255842d457da2322db - # via -r requirements.in -typing-extensions==4.12.2 \ - --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ - --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 - # via -r requirements.in -urllib3==2.2.2 \ - 
--hash=sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 \ - --hash=sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168 - # via - # requests - # twine -virtualenv==20.26.3 \ - --hash=sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a \ - --hash=sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 - # via nox -wheel==0.43.0 \ - --hash=sha256:465ef92c69fa5c5da2d1cf8ac40559a8c940886afcef87dcf14b9470862f1d85 \ - --hash=sha256:55c570405f142630c6b9f72fe09d9b67cf1477fcf543ae5b8dcb1f5b7377da81 - # via -r requirements.in -zipp==3.19.2 \ - --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ - --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c - # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -setuptools==70.2.0 \ - --hash=sha256:b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05 \ - --hash=sha256:bd63e505105011b25c3c11f753f7e3b8465ea739efddaccef8f0efac2137bac1 - # via -r requirements.in diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.13/common.cfg similarity index 87% rename from .kokoro/samples/python3.7/common.cfg rename to .kokoro/samples/python3.13/common.cfg index d30dc6018..ee9688995 100644 --- a/.kokoro/samples/python3.7/common.cfg +++ b/.kokoro/samples/python3.13/common.cfg @@ -10,13 +10,13 @@ action { # Specify which tests to run env_vars: { key: "RUN_TESTS_SESSION" - value: "py-3.7" + value: "py-3.13" } # Declare build specific Cloud project. env_vars: { key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py37" + value: "python-docs-samples-tests-313" } env_vars: { @@ -37,4 +37,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. 
-build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" diff --git a/.kokoro/samples/python3.7/continuous.cfg b/.kokoro/samples/python3.13/continuous.cfg similarity index 100% rename from .kokoro/samples/python3.7/continuous.cfg rename to .kokoro/samples/python3.13/continuous.cfg diff --git a/.kokoro/samples/python3.7/periodic-head.cfg b/.kokoro/samples/python3.13/periodic-head.cfg similarity index 100% rename from .kokoro/samples/python3.7/periodic-head.cfg rename to .kokoro/samples/python3.13/periodic-head.cfg diff --git a/.kokoro/samples/python3.7/periodic.cfg b/.kokoro/samples/python3.13/periodic.cfg similarity index 100% rename from .kokoro/samples/python3.7/periodic.cfg rename to .kokoro/samples/python3.13/periodic.cfg diff --git a/.kokoro/samples/python3.7/presubmit.cfg b/.kokoro/samples/python3.13/presubmit.cfg similarity index 100% rename from .kokoro/samples/python3.7/presubmit.cfg rename to .kokoro/samples/python3.13/presubmit.cfg diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.14/common.cfg similarity index 87% rename from .kokoro/samples/python3.8/common.cfg rename to .kokoro/samples/python3.14/common.cfg index 46759c6d6..d2fcee553 100644 --- a/.kokoro/samples/python3.8/common.cfg +++ b/.kokoro/samples/python3.14/common.cfg @@ -10,13 +10,13 @@ action { # Specify which tests to run env_vars: { key: "RUN_TESTS_SESSION" - value: "py-3.8" + value: "py-3.14" } # Declare build specific Cloud project. env_vars: { key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py38" + value: "python-docs-samples-tests-314" } env_vars: { @@ -37,4 +37,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. 
-build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" diff --git a/.kokoro/samples/python3.8/continuous.cfg b/.kokoro/samples/python3.14/continuous.cfg similarity index 100% rename from .kokoro/samples/python3.8/continuous.cfg rename to .kokoro/samples/python3.14/continuous.cfg diff --git a/.kokoro/samples/python3.8/periodic-head.cfg b/.kokoro/samples/python3.14/periodic-head.cfg similarity index 100% rename from .kokoro/samples/python3.8/periodic-head.cfg rename to .kokoro/samples/python3.14/periodic-head.cfg diff --git a/.kokoro/samples/python3.8/periodic.cfg b/.kokoro/samples/python3.14/periodic.cfg similarity index 100% rename from .kokoro/samples/python3.8/periodic.cfg rename to .kokoro/samples/python3.14/periodic.cfg diff --git a/.kokoro/samples/python3.8/presubmit.cfg b/.kokoro/samples/python3.14/presubmit.cfg similarity index 100% rename from .kokoro/samples/python3.8/presubmit.cfg rename to .kokoro/samples/python3.14/presubmit.cfg diff --git a/.kokoro/test-samples-impl.sh b/.kokoro/test-samples-impl.sh index 55910c8ba..40e248822 100755 --- a/.kokoro/test-samples-impl.sh +++ b/.kokoro/test-samples-impl.sh @@ -33,7 +33,7 @@ export PYTHONUNBUFFERED=1 env | grep KOKORO # Install nox -python3.9 -m pip install --upgrade --quiet nox +python3.9 -m pip install --upgrade --quiet nox virtualenv # Use secrets acessor service account to get secrets if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then diff --git a/.librarian/state.yaml b/.librarian/state.yaml new file mode 100644 index 000000000..efce633f2 --- /dev/null +++ b/.librarian/state.yaml @@ -0,0 +1,11 @@ +image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620 +libraries: + - id: google-cloud-bigquery + version: 3.40.1 + last_generated_commit: "" + apis: [] + source_roots: + - . 
+ preserve_regex: [] + remove_regex: [] + tag_format: v{version} diff --git a/.repo-metadata.json b/.repo-metadata.json index d1be7ec4d..82a1684ca 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -12,7 +12,7 @@ "api_id": "bigquery.googleapis.com", "requires_billing": false, "default_version": "v2", - "codeowner_team": "@googleapis/api-bigquery", + "codeowner_team": "@googleapis/python-core-client-libraries", "api_shortname": "bigquery", "api_description": "is a fully managed, NoOps, low cost data analytics service.\nData can be streamed into BigQuery at millions of rows per second to enable real-time analysis.\nWith BigQuery you can easily deploy Petabyte-scale Databases." } diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a089b8b4..083dbfc4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,295 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +## [3.40.1](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-v3.40.0...google-cloud-bigquery-v3.40.1) (2026-02-12) + + +### Documentation + +* clarify that only jobs.query and jobs.getQueryResults are affec
 (#2349) ([73228432a3c821db05d898ea4a4788adf15b033d](https://github.com/googleapis/google-cloud-python/commit/73228432a3c821db05d898ea4a4788adf15b033d)) + + +### Bug Fixes + +* updates timeout/retry code to respect hanging server (#2408) ([24d45d0d5bf89762f253ba6bd6fdbee9d5993422](https://github.com/googleapis/google-cloud-python/commit/24d45d0d5bf89762f253ba6bd6fdbee9d5993422)) +* add timeout parameter to to_dataframe and to_arrow methods
 (#2354) ([4f67ba20b49159e81f645ed98e401b9bb1359c1a](https://github.com/googleapis/google-cloud-python/commit/4f67ba20b49159e81f645ed98e401b9bb1359c1a)) + +## [3.40.0](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-v3.39.0...google-cloud-bigquery-v3.40.0) (2026-01-08) + + +### Features + +* support load_table and list_rows with picosecond timestamp (#2351) ([46764a59ca7a21ed14ad2c91eb7f98c302736c22](https://github.com/googleapis/google-cloud-python/commit/46764a59ca7a21ed14ad2c91eb7f98c302736c22)) +* support timestamp_precision in table schema (#2333) ([8d5785aea50b9f9e5b13bd4c91e8a08d6dac7778](https://github.com/googleapis/google-cloud-python/commit/8d5785aea50b9f9e5b13bd4c91e8a08d6dac7778)) + +## [3.39.0](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-v3.38.0...google-cloud-bigquery-v3.39.0) (2025-12-12) + + +### Documentation + +* remove experimental annotations from GA features (#2303) ([1f1f9d41e8a2c9016198d848ad3f1cbb88cf77b0](https://github.com/googleapis/google-cloud-python/commit/1f1f9d41e8a2c9016198d848ad3f1cbb88cf77b0)) + + +### Features + +* adds support for Python runtime 3.14 (#2322) ([6065e14c448cb430189982dd70025fa0575777ca](https://github.com/googleapis/google-cloud-python/commit/6065e14c448cb430189982dd70025fa0575777ca)) +* Add ExternalRuntimeOptions to BigQuery routine (#2311) ([fa76e310a16ea6cba0071ff1d767ca1c71514da7](https://github.com/googleapis/google-cloud-python/commit/fa76e310a16ea6cba0071ff1d767ca1c71514da7)) + + +### Bug Fixes + +* include `io.Base` in the `PathType` (#2323) ([b11e09cb6ee32e451b37eda66bece2220b9ceaba](https://github.com/googleapis/google-cloud-python/commit/b11e09cb6ee32e451b37eda66bece2220b9ceaba)) +* honor custom `retry` in `job.result()` (#2302) ([e118b029bbc89a5adbab83f39858c356c23665bf](https://github.com/googleapis/google-cloud-python/commit/e118b029bbc89a5adbab83f39858c356c23665bf)) +* remove ambiguous error codes from query retries (#2308) 
([8bbd3d01026c493dfa5903b397d2b01c0e9bf43b](https://github.com/googleapis/google-cloud-python/commit/8bbd3d01026c493dfa5903b397d2b01c0e9bf43b)) + + +## [3.38.0](https://github.com/googleapis/python-bigquery/compare/v3.37.0...v3.38.0) (2025-09-15) + + +### Features + +* Add additional query stats ([#2270](https://github.com/googleapis/python-bigquery/issues/2270)) ([7b1b718](https://github.com/googleapis/python-bigquery/commit/7b1b718123afd80c0f68212946e4179bcd6db67f)) + +## [3.37.0](https://github.com/googleapis/python-bigquery/compare/v3.36.0...v3.37.0) (2025-09-08) + + +### Features + +* Updates to fastpath query execution ([#2268](https://github.com/googleapis/python-bigquery/issues/2268)) ([ef2740a](https://github.com/googleapis/python-bigquery/commit/ef2740a158199633b5543a7b6eb19587580792cd)) + + +### Bug Fixes + +* Remove deepcopy while setting properties for _QueryResults ([#2280](https://github.com/googleapis/python-bigquery/issues/2280)) ([33ea296](https://github.com/googleapis/python-bigquery/commit/33ea29616c06a2e2a106a785d216e784737ae386)) + + +### Documentation + +* Clarify that the presence of `XyzJob.errors` doesn't necessarily mean that the job has not completed or was unsuccessful ([#2278](https://github.com/googleapis/python-bigquery/issues/2278)) ([6e88d7d](https://github.com/googleapis/python-bigquery/commit/6e88d7dbe42ebfc35986da665d656b49ac481db4)) +* Clarify the api_method arg for client.query() ([#2277](https://github.com/googleapis/python-bigquery/issues/2277)) ([8a13c12](https://github.com/googleapis/python-bigquery/commit/8a13c12905ffcb3dbb6086a61df37556f0c2cd31)) + +## [3.36.0](https://github.com/googleapis/python-bigquery/compare/v3.35.1...v3.36.0) (2025-08-20) + + +### Features + +* Add created/started/ended properties to RowIterator. 
([#2260](https://github.com/googleapis/python-bigquery/issues/2260)) ([0a95b24](https://github.com/googleapis/python-bigquery/commit/0a95b24192395cc3ccf801aa9bc318999873a2bf)) +* Retry query jobs if `jobBackendError` or `jobInternalError` are encountered ([#2256](https://github.com/googleapis/python-bigquery/issues/2256)) ([3deff1d](https://github.com/googleapis/python-bigquery/commit/3deff1d963980800e8b79fa3aaf5b712d4fd5062)) + + +### Documentation + +* Add a TROUBLESHOOTING.md file with tips for logging ([#2262](https://github.com/googleapis/python-bigquery/issues/2262)) ([b684832](https://github.com/googleapis/python-bigquery/commit/b68483227693ea68f6b12eacca2be1803cffb1d1)) +* Update README to break infinite redirect loop ([#2254](https://github.com/googleapis/python-bigquery/issues/2254)) ([8f03166](https://github.com/googleapis/python-bigquery/commit/8f031666114a826da2ad965f8ecd4727466cb480)) + +## [3.35.1](https://github.com/googleapis/python-bigquery/compare/v3.35.0...v3.35.1) (2025-07-21) + + +### Documentation + +* Specify the inherited-members directive for job classes ([#2244](https://github.com/googleapis/python-bigquery/issues/2244)) ([d207f65](https://github.com/googleapis/python-bigquery/commit/d207f6539b7a4c248a5de5719d7f384abbe20abe)) + +## [3.35.0](https://github.com/googleapis/python-bigquery/compare/v3.34.0...v3.35.0) (2025-07-15) + + +### Features + +* Add null_markers property to LoadJobConfig and CSVOptions ([#2239](https://github.com/googleapis/python-bigquery/issues/2239)) ([289446d](https://github.com/googleapis/python-bigquery/commit/289446dd8c356d11a0b63b8e6275629b1ae5dc08)) +* Add total slot ms to RowIterator ([#2233](https://github.com/googleapis/python-bigquery/issues/2233)) ([d44bf02](https://github.com/googleapis/python-bigquery/commit/d44bf0231e6e96369e4e03667a3f96618fb664e2)) +* Add UpdateMode to update_dataset ([#2204](https://github.com/googleapis/python-bigquery/issues/2204)) 
([eb9c2af](https://github.com/googleapis/python-bigquery/commit/eb9c2aff242c5107f968bbd8b6a9d30cecc877f6)) +* Adds dataset_view parameter to get_dataset method ([#2198](https://github.com/googleapis/python-bigquery/issues/2198)) ([28a5750](https://github.com/googleapis/python-bigquery/commit/28a5750d455f0381548df6f9b1f7661823837d81)) +* Adds date_format to load job and external config ([#2231](https://github.com/googleapis/python-bigquery/issues/2231)) ([7d31828](https://github.com/googleapis/python-bigquery/commit/7d3182802deccfceb0646b87fc8d12275d0a569b)) +* Adds datetime_format as an option ([#2236](https://github.com/googleapis/python-bigquery/issues/2236)) ([54d3dc6](https://github.com/googleapis/python-bigquery/commit/54d3dc66244d50a031e3c80d43d372d2743ecbc3)) +* Adds source_column_match and associated tests ([#2227](https://github.com/googleapis/python-bigquery/issues/2227)) ([6d5d236](https://github.com/googleapis/python-bigquery/commit/6d5d23685cd457d85955356705c1101e9ec3cdcd)) +* Adds time_format and timestamp_format and associated tests ([#2238](https://github.com/googleapis/python-bigquery/issues/2238)) ([371ad29](https://github.com/googleapis/python-bigquery/commit/371ad292df537278767dba71d81822ed57dd8e7d)) +* Adds time_zone to external config and load job ([#2229](https://github.com/googleapis/python-bigquery/issues/2229)) ([b2300d0](https://github.com/googleapis/python-bigquery/commit/b2300d032843512b7e4a5703377632fe60ef3f8d)) + + +### Bug Fixes + +* Adds magics.context.project to eliminate issues with unit tests 
 ([#2228](https://github.com/googleapis/python-bigquery/issues/2228)) ([27ff3a8](https://github.com/googleapis/python-bigquery/commit/27ff3a89a5f97305fa3ff673aa9183baa7df200f)) +* Fix rows returned when both start_index and page_size are provided ([#2181](https://github.com/googleapis/python-bigquery/issues/2181)) ([45643a2](https://github.com/googleapis/python-bigquery/commit/45643a2e20ce5d503118522dd195aeca00dec3bc)) +* Make AccessEntry equality consistent with from_api_repr ([#2218](https://github.com/googleapis/python-bigquery/issues/2218)) ([4941de4](https://github.com/googleapis/python-bigquery/commit/4941de441cb32cabeb55ec0320f305fb62551155)) +* Update type hints for various BigQuery files ([#2206](https://github.com/googleapis/python-bigquery/issues/2206)) ([b863291](https://github.com/googleapis/python-bigquery/commit/b86329188ba35e61871db82ae1d95d2a576eed1b)) + + +### Documentation + +* Improve clarity of "Output Only" fields in Dataset class ([#2201](https://github.com/googleapis/python-bigquery/issues/2201)) ([bd5aba8](https://github.com/googleapis/python-bigquery/commit/bd5aba8ba40c2f35fb672a68eed11d6baedb304f)) + +## [3.34.0](https://github.com/googleapis/python-bigquery/compare/v3.33.0...v3.34.0) (2025-05-27) + + +### Features + +* Job creation mode GA ([#2190](https://github.com/googleapis/python-bigquery/issues/2190)) ([64cd39f](https://github.com/googleapis/python-bigquery/commit/64cd39fb395c4a03ef6d2ec8261e1709477b2186)) + + +### Bug Fixes + +* **deps:** Update all dependencies ([#2184](https://github.com/googleapis/python-bigquery/issues/2184)) ([12490f2](https://github.com/googleapis/python-bigquery/commit/12490f2f03681516465fc34217dcdf57000f6fdd)) + + +### Documentation + +* Update query.py ([#2192](https://github.com/googleapis/python-bigquery/issues/2192)) ([9b5ee78](https://github.com/googleapis/python-bigquery/commit/9b5ee78f046d9ca3f758eeca6244b8485fe35875)) +* Use query_and_wait in the array parameters sample 
([#2202](https://github.com/googleapis/python-bigquery/issues/2202)) ([28a9994](https://github.com/googleapis/python-bigquery/commit/28a9994792ec90a6a4d16835faf2137c09c0fb02)) + +## [3.33.0](https://github.com/googleapis/python-bigquery/compare/v3.32.0...v3.33.0) (2025-05-19) + + +### Features + +* Add ability to set autodetect_schema query param in update_table ([#2171](https://github.com/googleapis/python-bigquery/issues/2171)) ([57f940d](https://github.com/googleapis/python-bigquery/commit/57f940d957613b4d80fb81ea40a1177b73856189)) +* Add dtype parameters to to_geodataframe functions ([#2176](https://github.com/googleapis/python-bigquery/issues/2176)) ([ebfd0a8](https://github.com/googleapis/python-bigquery/commit/ebfd0a83d43bcb96f65f5669437220aa6138b766)) +* Support job reservation ([#2186](https://github.com/googleapis/python-bigquery/issues/2186)) ([cb646ce](https://github.com/googleapis/python-bigquery/commit/cb646ceea172bf199f366ae0592546dff2d3bcb2)) + + +### Bug Fixes + +* Ensure AccessEntry equality and repr uses the correct `entity_type` ([#2182](https://github.com/googleapis/python-bigquery/issues/2182)) ([0217637](https://github.com/googleapis/python-bigquery/commit/02176377d5e2fc25b5cd4f46aa6ebfb1b6a960a6)) +* Ensure SchemaField.field_dtype returns a string ([#2188](https://github.com/googleapis/python-bigquery/issues/2188)) ([7ec2848](https://github.com/googleapis/python-bigquery/commit/7ec2848379d5743bbcb36700a1153540c451e0e0)) + +## [3.32.0](https://github.com/googleapis/python-bigquery/compare/v3.31.0...v3.32.0) (2025-05-12) + + +### Features + +* Add dataset access policy version attribute ([#2169](https://github.com/googleapis/python-bigquery/issues/2169)) ([b7656b9](https://github.com/googleapis/python-bigquery/commit/b7656b97c1bd6c204d0508b1851d114719686655)) +* Add preview support for incremental results ([#2145](https://github.com/googleapis/python-bigquery/issues/2145)) 
([22b80bb](https://github.com/googleapis/python-bigquery/commit/22b80bba9d0bed319fd3102e567906c9b458dd02)) +* Add WRITE_TRUNCATE_DATA enum ([#2166](https://github.com/googleapis/python-bigquery/issues/2166)) ([4692747](https://github.com/googleapis/python-bigquery/commit/46927479085f13fd326e3f2388f60dfdd37f7f69)) +* Adds condition class and assoc. unit tests ([#2159](https://github.com/googleapis/python-bigquery/issues/2159)) ([a69d6b7](https://github.com/googleapis/python-bigquery/commit/a69d6b796d2edb6ba453980c9553bc9b206c5a6e)) +* Support BigLakeConfiguration (managed Iceberg tables) ([#2162](https://github.com/googleapis/python-bigquery/issues/2162)) ([a1c8e9a](https://github.com/googleapis/python-bigquery/commit/a1c8e9aaf60986924868d54a0ab0334e77002a39)) +* Update the AccessEntry class with a new condition attribute and unit tests ([#2163](https://github.com/googleapis/python-bigquery/issues/2163)) ([7301667](https://github.com/googleapis/python-bigquery/commit/7301667272dfbdd04b1a831418a9ad2d037171fb)) + + +### Bug Fixes + +* `query()` now warns when `job_id` is set and the default `job_retry` is ignored ([#2167](https://github.com/googleapis/python-bigquery/issues/2167)) ([ca1798a](https://github.com/googleapis/python-bigquery/commit/ca1798aaee2d5905fe688d3097f8ee5c989da333)) +* Empty record dtypes ([#2147](https://github.com/googleapis/python-bigquery/issues/2147)) ([77d7173](https://github.com/googleapis/python-bigquery/commit/77d71736fcc006d3ab8f8ba17955ad5f06e21876)) +* Table iterator should not use bqstorage when page_size is not None ([#2154](https://github.com/googleapis/python-bigquery/issues/2154)) ([e89a707](https://github.com/googleapis/python-bigquery/commit/e89a707b162182ededbf94cc9a0f7594bc2be475)) + +## [3.31.0](https://github.com/googleapis/python-bigquery/compare/v3.30.0...v3.31.0) (2025-03-20) + + +### Features + +* Add query text and total bytes processed to RowIterator ([#2140](https://github.com/googleapis/python-bigquery/issues/2140)) 
([2d5f932](https://github.com/googleapis/python-bigquery/commit/2d5f9320d7103bc64c7ba496ba54bb0ef52b5605)) +* Add support for Python 3.13 ([0842aa1](https://github.com/googleapis/python-bigquery/commit/0842aa10967b1d8395cfb43e52c8ea091b381870)) + + +### Bug Fixes + +* Adding property setter for table constraints, [#1990](https://github.com/googleapis/python-bigquery/issues/1990) ([#2092](https://github.com/googleapis/python-bigquery/issues/2092)) ([f8572dd](https://github.com/googleapis/python-bigquery/commit/f8572dd86595361bae82c3232b2c0d159690a7b7)) +* Allow protobuf 6.x ([0842aa1](https://github.com/googleapis/python-bigquery/commit/0842aa10967b1d8395cfb43e52c8ea091b381870)) +* Avoid "Unable to determine type" warning with JSON columns in `to_dataframe` ([#1876](https://github.com/googleapis/python-bigquery/issues/1876)) ([968020d](https://github.com/googleapis/python-bigquery/commit/968020d5be9d2a30b90d046eaf52f91bb2c70911)) +* Remove setup.cfg configuration for creating universal wheels ([#2146](https://github.com/googleapis/python-bigquery/issues/2146)) ([d7f7685](https://github.com/googleapis/python-bigquery/commit/d7f76853d598c354bfd2e65f5dde28dae97da0ec)) + + +### Dependencies + +* Remove Python 3.7 and 3.8 as supported runtimes ([#2133](https://github.com/googleapis/python-bigquery/issues/2133)) ([fb7de39](https://github.com/googleapis/python-bigquery/commit/fb7de398cb2ad000b80a8a702d1f6539dc03d8e0)) + +## [3.30.0](https://github.com/googleapis/python-bigquery/compare/v3.29.0...v3.30.0) (2025-02-26) + + +### Features + +* Add roundingmode enum, wiring, and tests ([#2121](https://github.com/googleapis/python-bigquery/issues/2121)) ([3a48948](https://github.com/googleapis/python-bigquery/commit/3a4894827f6e73a4a88cb22933c2004697dabcc7)) +* Adds foreign_type_info attribute to table class and adds unit tests. 
([#2126](https://github.com/googleapis/python-bigquery/issues/2126)) ([2c19681](https://github.com/googleapis/python-bigquery/commit/2c1968115bef8e1dc84e0125615f551b9b011a4b)) +* Support resource_tags for table ([#2093](https://github.com/googleapis/python-bigquery/issues/2093)) ([d4070ca](https://github.com/googleapis/python-bigquery/commit/d4070ca21b5797e900a9e87b966837ee1c278217)) + + +### Bug Fixes + +* Avoid blocking in download thread when using BQ Storage API ([#2034](https://github.com/googleapis/python-bigquery/issues/2034)) ([54c8d07](https://github.com/googleapis/python-bigquery/commit/54c8d07f06a8ae460c9e0fb1614e1fbc21efb5df)) +* Retry 404 errors in `Client.query(...)` ([#2135](https://github.com/googleapis/python-bigquery/issues/2135)) ([c6d5f8a](https://github.com/googleapis/python-bigquery/commit/c6d5f8aaec21ab8f17436407aded4bc2316323fd)) + + +### Dependencies + +* Updates required checks list in github ([#2136](https://github.com/googleapis/python-bigquery/issues/2136)) ([fea49ff](https://github.com/googleapis/python-bigquery/commit/fea49ffbf8aa1d53451864ceb7fd73189b6661cb)) +* Use pandas-gbq to determine schema in `load_table_from_dataframe` ([#2095](https://github.com/googleapis/python-bigquery/issues/2095)) ([7603bd7](https://github.com/googleapis/python-bigquery/commit/7603bd71d60592ef2a551d9eea09987b218edc73)) + + +### Documentation + +* Update magics.rst ([#2125](https://github.com/googleapis/python-bigquery/issues/2125)) ([b5bcfb3](https://github.com/googleapis/python-bigquery/commit/b5bcfb303d27015b747a3b0747ecd7f7ed0ed557)) + +## [3.29.0](https://github.com/googleapis/python-bigquery/compare/v3.28.0...v3.29.0) (2025-01-21) + + +### Features + +* Add ExternalCatalogTableOptions class and tests ([#2116](https://github.com/googleapis/python-bigquery/issues/2116)) ([cdc1a6e](https://github.com/googleapis/python-bigquery/commit/cdc1a6e1623b8305c6a6a1a481b3365e866a073d)) + + +### Bug Fixes + +* Add default value in SchemaField.from_api_repr() 
([#2115](https://github.com/googleapis/python-bigquery/issues/2115)) ([7de6822](https://github.com/googleapis/python-bigquery/commit/7de6822e1c556a68cb8d50e90664c094697cca1d)) + +## [3.28.0](https://github.com/googleapis/python-bigquery/compare/v3.27.0...v3.28.0) (2025-01-15) + + +### Features + +* Add property for `allowNonIncrementalDefinition` for materialized view ([#2084](https://github.com/googleapis/python-bigquery/issues/2084)) ([3359ef3](https://github.com/googleapis/python-bigquery/commit/3359ef37b90243bea2d9e68bb996fe5d736f304c)) +* Add property for maxStaleness in table definitions ([#2087](https://github.com/googleapis/python-bigquery/issues/2087)) ([729322c](https://github.com/googleapis/python-bigquery/commit/729322c2288a30464f2f135ba18b9c4aa7d2f0da)) +* Add type hints to Client ([#2044](https://github.com/googleapis/python-bigquery/issues/2044)) ([40529de](https://github.com/googleapis/python-bigquery/commit/40529de923e25c41c6728c121b9c82a042967ada)) +* Adds ExternalCatalogDatasetOptions and tests ([#2111](https://github.com/googleapis/python-bigquery/issues/2111)) ([b929a90](https://github.com/googleapis/python-bigquery/commit/b929a900d49e2c15897134209ed9de5fc7f238cd)) +* Adds ForeignTypeInfo class and tests ([#2110](https://github.com/googleapis/python-bigquery/issues/2110)) ([55ca63c](https://github.com/googleapis/python-bigquery/commit/55ca63c23fcb56573e2de67e4f7899939628c4a1)) +* Adds new input validation function similar to isinstance. 
([#2107](https://github.com/googleapis/python-bigquery/issues/2107)) ([a2bebb9](https://github.com/googleapis/python-bigquery/commit/a2bebb95c5ef32ac7c7cbe19c3e7a9412cbee60d)) +* Adds StorageDescriptor and tests ([#2109](https://github.com/googleapis/python-bigquery/issues/2109)) ([6be0272](https://github.com/googleapis/python-bigquery/commit/6be0272ff25dac97a38ae4ee5aa02016dc82a0d8)) +* Adds the SerDeInfo class and tests ([#2108](https://github.com/googleapis/python-bigquery/issues/2108)) ([62960f2](https://github.com/googleapis/python-bigquery/commit/62960f255d05b15940a8d2cdc595592175fada11)) +* Migrate to pyproject.toml ([#2041](https://github.com/googleapis/python-bigquery/issues/2041)) ([1061611](https://github.com/googleapis/python-bigquery/commit/106161180ead01aca1ead909cf06ca559f68666d)) +* Preserve unknown fields from the REST API representation in `SchemaField` ([#2097](https://github.com/googleapis/python-bigquery/issues/2097)) ([aaf1eb8](https://github.com/googleapis/python-bigquery/commit/aaf1eb85ada95ab866be0199812ea7f5c7f50766)) +* Resource tags in dataset ([#2090](https://github.com/googleapis/python-bigquery/issues/2090)) ([3e13016](https://github.com/googleapis/python-bigquery/commit/3e130166f43dcc06704fe90edf9068dfd44842a6)) +* Support setting max_stream_count when fetching query result ([#2051](https://github.com/googleapis/python-bigquery/issues/2051)) ([d461297](https://github.com/googleapis/python-bigquery/commit/d4612979b812d2a835e47200f27a87a66bcb856a)) + + +### Bug Fixes + +* Allow geopandas 1.x ([#2065](https://github.com/googleapis/python-bigquery/issues/2065)) ([f2ab8cb](https://github.com/googleapis/python-bigquery/commit/f2ab8cbfe00d442ad3b40683ecfec320e53b4688)) + + +### Documentation + +* Render fields correctly for update calls ([#2055](https://github.com/googleapis/python-bigquery/issues/2055)) ([a4d9534](https://github.com/googleapis/python-bigquery/commit/a4d9534a900f13ae7355904cda05097d781f27e3)) + +## 
[3.27.0](https://github.com/googleapis/python-bigquery/compare/v3.26.0...v3.27.0) (2024-11-01) + + +### Features + +* Updates to allow users to set max_stream_count ([#2039](https://github.com/googleapis/python-bigquery/issues/2039)) ([7372ad6](https://github.com/googleapis/python-bigquery/commit/7372ad659fd3316a602e90f224e9a3304d4c1419)) + +## [3.26.0](https://github.com/googleapis/python-bigquery/compare/v3.25.0...v3.26.0) (2024-09-25) + + +### Features + +* Include LegacyPandasError in init imports ([#2014](https://github.com/googleapis/python-bigquery/issues/2014)) ([3ab5e95](https://github.com/googleapis/python-bigquery/commit/3ab5e95984ad521027a4e1efd9f16767403e668d)) +* Use `bigquery-magics` package for the `%%bigquery` magic ([#1965](https://github.com/googleapis/python-bigquery/issues/1965)) ([60128a5](https://github.com/googleapis/python-bigquery/commit/60128a522375823422f238312521a2ce356d9177)) + + +### Bug Fixes + +* Add docfx to the presubmit configuration and delete docs-presubmit ([#1995](https://github.com/googleapis/python-bigquery/issues/1995)) ([bd83cfd](https://github.com/googleapis/python-bigquery/commit/bd83cfd2eb25cec58d59af8048f5188d748b083d)) +* Add warning when encountering unknown field types ([#1989](https://github.com/googleapis/python-bigquery/issues/1989)) ([8f5a41d](https://github.com/googleapis/python-bigquery/commit/8f5a41d283a965ca161019588d3a3b2947b04b5b)) +* Allow protobuf 5.x; require protobuf >=3.20.2; proto-plus >=1.22.3 ([#1976](https://github.com/googleapis/python-bigquery/issues/1976)) ([57bf873](https://github.com/googleapis/python-bigquery/commit/57bf873474382cc2cb34243b704bc928fa1b64c6)) +* Do not set job timeout extra property if None ([#1987](https://github.com/googleapis/python-bigquery/issues/1987)) ([edcb79c](https://github.com/googleapis/python-bigquery/commit/edcb79ca69dba30d8102abebb9d53bc76e4882ee)) +* Set pyarrow field nullable to False for a BigQuery field in REPEATED mode 
([#1999](https://github.com/googleapis/python-bigquery/issues/1999)) ([5352870](https://github.com/googleapis/python-bigquery/commit/5352870283ca7d4652aefc73f12645bcf6e1363c)) + + +### Dependencies + +* Bump min version of google-api-core and google-cloud-core to 2.x ([#1972](https://github.com/googleapis/python-bigquery/issues/1972)) ([a958732](https://github.com/googleapis/python-bigquery/commit/a958732aed7d9bd51ffde3dc0e6cae9ad7455b54)) + + +### Documentation + +* Add short mode query sample & test ([#1978](https://github.com/googleapis/python-bigquery/issues/1978)) ([ba61a8a](https://github.com/googleapis/python-bigquery/commit/ba61a8ab0da541ba1940211875d7ea2e9e17dfa8)) +* Improve QueryJobConfig.destination docstring ([#2016](https://github.com/googleapis/python-bigquery/issues/2016)) ([1b4cca0](https://github.com/googleapis/python-bigquery/commit/1b4cca0a3cc788a4570705572d5f04172f6b4b24)) ## [3.25.0](https://github.com/googleapis/python-bigquery/compare/v3.24.0...v3.25.0) (2024-06-17) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7be61e6b6..3f8653f4b 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: - 3.7, 3.8, 3.9, 3.10, 3.11 and 3.12 on both UNIX and Windows. + 3.9, 3.10, 3.11, 3.12, 3.13 and 3.14 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests. - To run a single unit test:: - $ nox -s unit-3.12 -- -k + $ nox -s unit-3.13 -- -k .. note:: @@ -143,12 +143,12 @@ Running System Tests $ nox -s system # Run a single system test - $ nox -s system-3.8 -- -k + $ nox -s system-3.13 -- -k .. note:: - System tests are only configured to run under Python 3.8. + System tests are only configured to run under Python 3.9 and 3.13. 
For expediency, we do not run them in older versions of Python 3. This alone will not run the tests. You'll need to change some local @@ -195,11 +195,11 @@ configure them just like the System Tests. # Run all tests in a folder $ cd samples/snippets - $ nox -s py-3.8 + $ nox -s py-3.9 # Run a single sample test $ cd samples/snippets - $ nox -s py-3.8 -- -k + $ nox -s py-3.9 -- -k ******************************************** Note About ``README`` as it pertains to PyPI @@ -221,19 +221,19 @@ Supported Python Versions We support: -- `Python 3.7`_ -- `Python 3.8`_ - `Python 3.9`_ - `Python 3.10`_ - `Python 3.11`_ - `Python 3.12`_ +- `Python 3.13`_ +- `Python 3.14`_ -.. _Python 3.7: https://docs.python.org/3.7/ -.. _Python 3.8: https://docs.python.org/3.8/ .. _Python 3.9: https://docs.python.org/3.9/ .. _Python 3.10: https://docs.python.org/3.10/ .. _Python 3.11: https://docs.python.org/3.11/ .. _Python 3.12: https://docs.python.org/3.12/ +.. _Python 3.13: https://docs.python.org/3.13/ +.. _Python 3.14: https://docs.python.org/3.14/ Supported versions can be found in our ``noxfile.py`` `config`_. @@ -241,7 +241,7 @@ Supported versions can be found in our ``noxfile.py`` `config`_. .. _config: https://github.com/googleapis/python-bigquery/blob/main/noxfile.py -We also explicitly decided to support Python 3 beginning with version 3.7. +We also explicitly decided to support Python 3 beginning with version 3.9. Reasons for this include: - Encouraging use of newest versions of Python 3 diff --git a/README.rst b/README.rst index f81adc4b9..5f8650b2b 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ +:**NOTE**: **This GitHub repository is archived. The repository contents and history have moved to** `google-cloud-python`_. + +.. _google-cloud-python: https://github.com/googleapis/google-cloud-python/tree/main/packages/google-cloud-bigquery + + Python Client for Google BigQuery ================================= @@ -18,7 +23,7 @@ processing power of Google's infrastructure. 
.. |versions| image:: https://img.shields.io/pypi/pyversions/google-cloud-bigquery.svg :target: https://pypi.org/project/google-cloud-bigquery/ .. _BigQuery: https://cloud.google.com/bigquery/what-is-bigquery -.. _Client Library Documentation: https://googleapis.dev/python/bigquery/latest +.. _Client Library Documentation: https://cloud.google.com/python/docs/reference/bigquery/latest/summary_overview .. _Product Documentation: https://cloud.google.com/bigquery/docs/reference/v2/ Quick Start @@ -52,11 +57,11 @@ dependencies. Supported Python Versions ^^^^^^^^^^^^^^^^^^^^^^^^^ -Python >= 3.7 +Python >= 3.9 Unsupported Python Versions ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Python == 2.7, Python == 3.5, Python == 3.6. +Python == 2.7, Python == 3.5, Python == 3.6, Python == 3.7, and Python == 3.8. The last version of this library compatible with Python 2.7 and 3.5 is `google-cloud-bigquery==1.28.0`. diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 000000000..7da12c440 --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,34 @@ +# Troubleshooting steps + +## Enable logging of BQ Storage Read API session creation + +It can be helpful to get the BQ Storage Read API session to allow the BigQuery +backend team to debug cases of API instability. The logs that share the session +creation are in a module-specific logger. 
To enable the logs, refer to the +following code sample: + +```python +import logging +import google.cloud.bigquery + +# Configure the basic logging to show DEBUG level messages +log_formatter = logging.Formatter( + '%(asctime)s - %(levelname)s - %(message)s' +) +handler = logging.StreamHandler() +handler.setFormatter(log_formatter) +default_logger = logging.getLogger() +default_logger.setLevel(logging.DEBUG) +default_logger.addHandler(handler) +to_dataframe_logger = logging.getLogger("google.cloud.bigquery._pandas_helpers") +to_dataframe_logger.setLevel(logging.DEBUG) +to_dataframe_logger.addHandler(handler) + +# Example code that touches the BQ Storage Read API. +bqclient = google.cloud.bigquery.Client() +results = bqclient.query_and_wait("SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`") +print(results.to_dataframe().head()) +``` + +In particular, watch for the text "with BQ Storage API session" in the logs +to get the streaming API session ID to share with your support person. diff --git a/docs/bigquery/legacy_proto_types.rst b/docs/bigquery/legacy_proto_types.rst index bc1e93715..36e9984b9 100644 --- a/docs/bigquery/legacy_proto_types.rst +++ b/docs/bigquery/legacy_proto_types.rst @@ -3,7 +3,7 @@ Legacy proto-based Types for Google Cloud Bigquery v2 API .. warning:: These types are provided for backward compatibility only, and are not maintained - anymore. They might also differ from the types uspported on the backend. It is + anymore. They might also differ from the types supported on the backend. It is therefore strongly advised to migrate to the types found in :doc:`standard_sql`. Also see the :doc:`3.0.0 Migration Guide<../UPGRADING>` for more information. 
diff --git a/docs/conf.py b/docs/conf.py index 826298090..df1c18b68 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # autodoc/autosummary flags autoclass_content = "both" -autodoc_default_options = {"members": True, "inherited-members": True} +autodoc_default_options = {"members": True} autosummary_generate = True @@ -109,7 +109,6 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [ - "google/cloud/bigquery_v2/**", # Legacy proto-based types. "_build", "**/.nox/**/*", "samples/AUTHORING_GUIDE.md", diff --git a/docs/magics.rst b/docs/magics.rst index aa14c6bfa..549d67f76 100644 --- a/docs/magics.rst +++ b/docs/magics.rst @@ -6,7 +6,7 @@ in a Jupyter notebook cell. .. code:: - %load_ext google.cloud.bigquery + %load_ext bigquery_magics This makes the ``%%bigquery`` magic available. @@ -27,8 +27,9 @@ Running a parameterized query: :start-after: [START bigquery_jupyter_query_params_scalars] :end-before: [END bigquery_jupyter_query_params_scalars] -API Reference -------------- +BigQuery Magics Reference +------------------------- -.. automodule:: google.cloud.bigquery.magics.magics - :members: +- `BigQuery Magics Documentation`_ + +.. _BigQuery Magics Documentation: https://googleapis.dev/python/bigquery-magics/latest diff --git a/docs/reference.rst b/docs/reference.rst index 6c00df077..d24a73596 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -22,6 +22,7 @@ Job === .. automodule:: google.cloud.bigquery.job + :inherited-members: .. 
toctree:: :maxdepth: 2 diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index e80907ec9..904bea3d4 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -44,6 +44,7 @@ from google.cloud.bigquery.enums import SqlTypeNames from google.cloud.bigquery.enums import StandardSqlTypeNames from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError +from google.cloud.bigquery.exceptions import LegacyPandasError from google.cloud.bigquery.exceptions import LegacyPyarrowError from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions @@ -97,6 +98,7 @@ from google.cloud.bigquery.routine import RoutineReference from google.cloud.bigquery.routine import RoutineType from google.cloud.bigquery.routine import RemoteFunctionOptions +from google.cloud.bigquery.routine import ExternalRuntimeOptions from google.cloud.bigquery.schema import PolicyTagList from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.schema import FieldElementType @@ -114,12 +116,25 @@ from google.cloud.bigquery.table import TimePartitioningType from google.cloud.bigquery.table import TimePartitioning from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration +from google.cloud.bigquery import _versions_helpers try: import bigquery_magics # type: ignore except ImportError: bigquery_magics = None +sys_major, sys_minor, sys_micro = _versions_helpers.extract_runtime_version() + +if sys_major == 3 and sys_minor in (7, 8): + warnings.warn( + "The python-bigquery library no longer supports Python 3.7 " + "and Python 3.8. " + f"Your Python version is {sys_major}.{sys_minor}.{sys_micro}. We " + "recommend that you update soon to ensure ongoing support. 
For " + "more details, see: [Google Cloud Client Libraries Supported Python Versions policy](https://cloud.google.com/python/docs/supported-python-versions)", + FutureWarning, + ) + __all__ = [ "__version__", "Client", @@ -167,6 +182,7 @@ "RoutineArgument", "RoutineReference", "RemoteFunctionOptions", + "ExternalRuntimeOptions", # Shared helpers "SchemaField", "FieldElementType", diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 1eda80712..a35fe1677 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -21,8 +21,9 @@ import math import re import os +import textwrap import warnings -from typing import Optional, Union +from typing import Any, Optional, Tuple, Type, Union from dateutil import relativedelta from google.cloud._helpers import UTC # type: ignore @@ -31,6 +32,8 @@ from google.cloud._helpers import _RFC3339_MICROS from google.cloud._helpers import _RFC3339_NO_FRACTION from google.cloud._helpers import _to_bytes +from google.cloud.bigquery import enums + from google.auth import credentials as ga_credentials # type: ignore from google.api_core import client_options as client_options_lib @@ -133,243 +136,324 @@ def _not_null(value, field): return value is not None or (field is not None and field.mode != "NULLABLE") -def _int_from_json(value, field): - """Coerce 'value' to an int, if set or not nullable.""" - if _not_null(value, field): - return int(value) - +class CellDataParser: + """Converter from BigQuery REST resource to Python value for RowIterator and similar classes. 
-def _interval_from_json( - value: Optional[str], field -) -> Optional[relativedelta.relativedelta]: - """Coerce 'value' to an interval, if set or not nullable.""" - if not _not_null(value, field): - return None - if value is None: - raise TypeError(f"got {value} for REQUIRED field: {repr(field)}") - - parsed = _INTERVAL_PATTERN.match(value) - if parsed is None: - raise ValueError(f"got interval: '{value}' with unexpected format") - - calendar_sign = -1 if parsed.group("calendar_sign") == "-" else 1 - years = calendar_sign * int(parsed.group("years")) - months = calendar_sign * int(parsed.group("months")) - days = int(parsed.group("days")) - time_sign = -1 if parsed.group("time_sign") == "-" else 1 - hours = time_sign * int(parsed.group("hours")) - minutes = time_sign * int(parsed.group("minutes")) - seconds = time_sign * int(parsed.group("seconds")) - fraction = parsed.group("fraction") - microseconds = time_sign * int(fraction.ljust(6, "0")[:6]) if fraction else 0 - - return relativedelta.relativedelta( - years=years, - months=months, - days=days, - hours=hours, - minutes=minutes, - seconds=seconds, - microseconds=microseconds, - ) - - -def _float_from_json(value, field): - """Coerce 'value' to a float, if set or not nullable.""" - if _not_null(value, field): - return float(value) + See: "rows" field of + https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list and + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/getQueryResults. 
+ """ + def to_py(self, resource, field): + def default_converter(value, field): + _warn_unknown_field_type(field) + return value -def _decimal_from_json(value, field): - """Coerce 'value' to a Decimal, if set or not nullable.""" - if _not_null(value, field): - return decimal.Decimal(value) + converter = getattr( + self, f"{field.field_type.lower()}_to_py", default_converter + ) + if field.mode == "REPEATED": + return [converter(item["v"], field) for item in resource] + else: + return converter(resource, field) + + def bool_to_py(self, value, field): + """Coerce 'value' to a bool, if set or not nullable.""" + if _not_null(value, field): + # TODO(tswast): Why does _not_null care if the field is NULLABLE or + # REQUIRED? Do we actually need such client-side validation? + if value is None: + raise TypeError(f"got None for required boolean field {field}") + return value.lower() in ("t", "true", "1") + + def boolean_to_py(self, value, field): + """Coerce 'value' to a bool, if set or not nullable.""" + return self.bool_to_py(value, field) + + def integer_to_py(self, value, field): + """Coerce 'value' to an int, if set or not nullable.""" + if _not_null(value, field): + return int(value) + + def int64_to_py(self, value, field): + """Coerce 'value' to an int, if set or not nullable.""" + return self.integer_to_py(value, field) + + def interval_to_py( + self, value: Optional[str], field + ) -> Optional[relativedelta.relativedelta]: + """Coerce 'value' to an interval, if set or not nullable.""" + if not _not_null(value, field): + return None + if value is None: + raise TypeError(f"got {value} for REQUIRED field: {repr(field)}") + + parsed = _INTERVAL_PATTERN.match(value) + if parsed is None: + raise ValueError( + textwrap.dedent( + f""" + Got interval: '{value}' with unexpected format. + Expected interval in canonical format of "[sign]Y-M [sign]D [sign]H:M:S[.F]". 
+ See: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#interval_type + for more information. + """ + ), + ) + calendar_sign = -1 if parsed.group("calendar_sign") == "-" else 1 + years = calendar_sign * int(parsed.group("years")) + months = calendar_sign * int(parsed.group("months")) + days = int(parsed.group("days")) + time_sign = -1 if parsed.group("time_sign") == "-" else 1 + hours = time_sign * int(parsed.group("hours")) + minutes = time_sign * int(parsed.group("minutes")) + seconds = time_sign * int(parsed.group("seconds")) + fraction = parsed.group("fraction") + microseconds = time_sign * int(fraction.ljust(6, "0")[:6]) if fraction else 0 + + return relativedelta.relativedelta( + years=years, + months=months, + days=days, + hours=hours, + minutes=minutes, + seconds=seconds, + microseconds=microseconds, + ) -def _bool_from_json(value, field): - """Coerce 'value' to a bool, if set or not nullable.""" - if _not_null(value, field): - return value.lower() in ["t", "true", "1"] + def float_to_py(self, value, field): + """Coerce 'value' to a float, if set or not nullable.""" + if _not_null(value, field): + return float(value) + def float64_to_py(self, value, field): + """Coerce 'value' to a float, if set or not nullable.""" + return self.float_to_py(value, field) -def _string_from_json(value, _): - """NOOP string -> string coercion""" - return value + def numeric_to_py(self, value, field): + """Coerce 'value' to a Decimal, if set or not nullable.""" + if _not_null(value, field): + return decimal.Decimal(value) + def bignumeric_to_py(self, value, field): + """Coerce 'value' to a Decimal, if set or not nullable.""" + return self.numeric_to_py(value, field) -def _bytes_from_json(value, field): - """Base64-decode value""" - if _not_null(value, field): - return base64.standard_b64decode(_to_bytes(value)) + def string_to_py(self, value, _): + """NOOP string -> string coercion""" + return value + def geography_to_py(self, value, _): + """NOOP 
string -> string coercion""" + return value -def _timestamp_from_json(value, field): - """Coerce 'value' to a datetime, if set or not nullable.""" - if _not_null(value, field): - # value will be a integer in seconds, to microsecond precision, in UTC. - return _datetime_from_microseconds(int(value)) + def bytes_to_py(self, value, field): + """Base64-decode value""" + if _not_null(value, field): + return base64.standard_b64decode(_to_bytes(value)) + def timestamp_to_py(self, value, field) -> Union[datetime.datetime, str, None]: + """Coerce 'value' to a datetime, if set or not nullable. If timestamp + is of picosecond precision, preserve the string format.""" + if field.timestamp_precision == enums.TimestampPrecision.PICOSECOND: + return value + if _not_null(value, field): + # value will be a integer in seconds, to microsecond precision, in UTC. + return _datetime_from_microseconds(int(value)) + return None -def _timestamp_query_param_from_json(value, field): - """Coerce 'value' to a datetime, if set or not nullable. + def datetime_to_py(self, value, field): + """Coerce 'value' to a datetime, if set or not nullable. + + Args: + value (str): The timestamp. + field (google.cloud.bigquery.schema.SchemaField): + The field corresponding to the value. + + Returns: + Optional[datetime.datetime]: + The parsed datetime object from + ``value`` if the ``field`` is not null (otherwise it is + :data:`None`). + """ + if _not_null(value, field): + if "." in value: + # YYYY-MM-DDTHH:MM:SS.ffffff + return datetime.datetime.strptime(value, _RFC3339_MICROS_NO_ZULU) + else: + # YYYY-MM-DDTHH:MM:SS + return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION) + else: + return None - Args: - value (str): The timestamp. + def date_to_py(self, value, field): + """Coerce 'value' to a datetime date, if set or not nullable""" + if _not_null(value, field): + # value will be a string, in YYYY-MM-DD form. 
+ return _date_from_iso8601_date(value) + + def time_to_py(self, value, field): + """Coerce 'value' to a datetime date, if set or not nullable""" + if _not_null(value, field): + if len(value) == 8: # HH:MM:SS + fmt = _TIMEONLY_WO_MICROS + elif len(value) == 15: # HH:MM:SS.micros + fmt = _TIMEONLY_W_MICROS + else: + raise ValueError( + textwrap.dedent( + f""" + Got {repr(value)} with unknown time format. + Expected HH:MM:SS or HH:MM:SS.micros. See + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type + for more information. + """ + ), + ) + return datetime.datetime.strptime(value, fmt).time() + + def record_to_py(self, value, field): + """Coerce 'value' to a mapping, if set or not nullable.""" + if _not_null(value, field): + record = {} + record_iter = zip(field.fields, value["f"]) + for subfield, cell in record_iter: + record[subfield.name] = self.to_py(cell["v"], subfield) + return record + + def struct_to_py(self, value, field): + """Coerce 'value' to a mapping, if set or not nullable.""" + return self.record_to_py(value, field) + + def json_to_py(self, value, field): + """Coerce 'value' to a Pythonic JSON representation.""" + if _not_null(value, field): + return json.loads(value) + else: + return None - field (google.cloud.bigquery.schema.SchemaField): - The field corresponding to the value. + def _range_element_to_py(self, value, field_element_type): + """Coerce 'value' to a range element value.""" + # Avoid circular imports by importing here. + from google.cloud.bigquery import schema - Returns: - Optional[datetime.datetime]: - The parsed datetime object from - ``value`` if the ``field`` is not null (otherwise it is - :data:`None`). - """ - if _not_null(value, field): - # Canonical formats for timestamps in BigQuery are flexible. See: - # g.co/cloud/bigquery/docs/reference/standard-sql/data-types#timestamp-type - # The separator between the date and time can be 'T' or ' '. 
- value = value.replace(" ", "T", 1) - # The UTC timezone may be formatted as Z or +00:00. - value = value.replace("Z", "") - value = value.replace("+00:00", "") - - if "." in value: - # YYYY-MM-DDTHH:MM:SS.ffffff - return datetime.datetime.strptime(value, _RFC3339_MICROS_NO_ZULU).replace( - tzinfo=UTC + if value == "UNBOUNDED": + return None + if field_element_type.element_type in _SUPPORTED_RANGE_ELEMENTS: + return self.to_py( + value, + schema.SchemaField("placeholder", field_element_type.element_type), ) else: - # YYYY-MM-DDTHH:MM:SS - return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION).replace( - tzinfo=UTC + raise ValueError( + textwrap.dedent( + f""" + Got unsupported range element type: {field_element_type.element_type}. + Exptected one of {repr(_SUPPORTED_RANGE_ELEMENTS)}. See: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declare_a_range_type + for more information. + """ + ), ) - else: - return None - - -def _datetime_from_json(value, field): - """Coerce 'value' to a datetime, if set or not nullable. - - Args: - value (str): The timestamp. - field (google.cloud.bigquery.schema.SchemaField): - The field corresponding to the value. - - Returns: - Optional[datetime.datetime]: - The parsed datetime object from - ``value`` if the ``field`` is not null (otherwise it is - :data:`None`). - """ - if _not_null(value, field): - if "." in value: - # YYYY-MM-DDTHH:MM:SS.ffffff - return datetime.datetime.strptime(value, _RFC3339_MICROS_NO_ZULU) - else: - # YYYY-MM-DDTHH:MM:SS - return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION) - else: - return None - -def _date_from_json(value, field): - """Coerce 'value' to a datetime date, if set or not nullable""" - if _not_null(value, field): - # value will be a string, in YYYY-MM-DD form. - return _date_from_iso8601_date(value) + def range_to_py(self, value, field): + """Coerce 'value' to a range, if set or not nullable. 
+ + Args: + value (str): The literal representation of the range. + field (google.cloud.bigquery.schema.SchemaField): + The field corresponding to the value. + + Returns: + Optional[dict]: + The parsed range object from ``value`` if the ``field`` is not + null (otherwise it is :data:`None`). + """ + if _not_null(value, field): + if _RANGE_PATTERN.match(value): + start, end = value[1:-1].split(", ") + start = self._range_element_to_py(start, field.range_element_type) + end = self._range_element_to_py(end, field.range_element_type) + return {"start": start, "end": end} + else: + raise ValueError( + textwrap.dedent( + f""" + Got unknown format for range value: {value}. + Expected format '[lower_bound, upper_bound)'. See: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_with_literal + for more information. + """ + ), + ) -def _time_from_json(value, field): - """Coerce 'value' to a datetime date, if set or not nullable""" - if _not_null(value, field): - if len(value) == 8: # HH:MM:SS - fmt = _TIMEONLY_WO_MICROS - elif len(value) == 15: # HH:MM:SS.micros - fmt = _TIMEONLY_W_MICROS - else: - raise ValueError("Unknown time format: {}".format(value)) - return datetime.datetime.strptime(value, fmt).time() +CELL_DATA_PARSER = CellDataParser() -def _record_from_json(value, field): - """Coerce 'value' to a mapping, if set or not nullable.""" - if _not_null(value, field): - record = {} - record_iter = zip(field.fields, value["f"]) - for subfield, cell in record_iter: - record[subfield.name] = _field_from_json(cell["v"], subfield) - return record +class DataFrameCellDataParser(CellDataParser): + """Override of CellDataParser to handle differences in expression of values in DataFrame-like outputs. + This is used to turn the output of the REST API into a pyarrow Table, + emulating the serialized arrow from the BigQuery Storage Read API. 
+ """ -def _json_from_json(value, field): - """Coerce 'value' to a Pythonic JSON representation.""" - if _not_null(value, field): - return json.loads(value) - else: - return None - + def json_to_py(self, value, _): + """No-op because DataFrame expects string for JSON output.""" + return value -def _range_element_from_json(value, field): - """Coerce 'value' to a range element value.""" - if value == "UNBOUNDED": - return None - if field.element_type in _SUPPORTED_RANGE_ELEMENTS: - return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type) - else: - raise ValueError(f"Unsupported range element type: {field.element_type}") +DATA_FRAME_CELL_DATA_PARSER = DataFrameCellDataParser() -def _range_from_json(value, field): - """Coerce 'value' to a range, if set or not nullable. - Args: - value (str): The literal representation of the range. - field (google.cloud.bigquery.schema.SchemaField): - The field corresponding to the value. +class ScalarQueryParamParser(CellDataParser): + """Override of CellDataParser to handle the differences in the response from query params. - Returns: - Optional[dict]: - The parsed range object from ``value`` if the ``field`` is not - null (otherwise it is :data:`None`). + See: "value" field of + https://cloud.google.com/bigquery/docs/reference/rest/v2/QueryParameter#QueryParameterValue """ - if _not_null(value, field): - if _RANGE_PATTERN.match(value): - start, end = value[1:-1].split(", ") - start = _range_element_from_json(start, field.range_element_type) - end = _range_element_from_json(end, field.range_element_type) - return {"start": start, "end": end} - else: - raise ValueError(f"Unknown format for range value: {value}") - else: - return None + def timestamp_to_py(self, value, field): + """Coerce 'value' to a datetime, if set or not nullable. + + Args: + value (str): The timestamp. + + field (google.cloud.bigquery.schema.SchemaField): + The field corresponding to the value. 
+ + Returns: + Optional[datetime.datetime]: + The parsed datetime object from + ``value`` if the ``field`` is not null (otherwise it is + :data:`None`). + """ + if _not_null(value, field): + # Canonical formats for timestamps in BigQuery are flexible. See: + # g.co/cloud/bigquery/docs/reference/standard-sql/data-types#timestamp-type + # The separator between the date and time can be 'T' or ' '. + value = value.replace(" ", "T", 1) + # The UTC timezone may be formatted as Z or +00:00. + value = value.replace("Z", "") + value = value.replace("+00:00", "") + + if "." in value: + # YYYY-MM-DDTHH:MM:SS.ffffff + return datetime.datetime.strptime( + value, _RFC3339_MICROS_NO_ZULU + ).replace(tzinfo=UTC) + else: + # YYYY-MM-DDTHH:MM:SS + return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION).replace( + tzinfo=UTC + ) + else: + return None -# Parse BigQuery API response JSON into a Python representation. -_CELLDATA_FROM_JSON = { - "INTEGER": _int_from_json, - "INT64": _int_from_json, - "INTERVAL": _interval_from_json, - "FLOAT": _float_from_json, - "FLOAT64": _float_from_json, - "NUMERIC": _decimal_from_json, - "BIGNUMERIC": _decimal_from_json, - "BOOLEAN": _bool_from_json, - "BOOL": _bool_from_json, - "STRING": _string_from_json, - "GEOGRAPHY": _string_from_json, - "BYTES": _bytes_from_json, - "TIMESTAMP": _timestamp_from_json, - "DATETIME": _datetime_from_json, - "DATE": _date_from_json, - "TIME": _time_from_json, - "RECORD": _record_from_json, - "JSON": _json_from_json, - "RANGE": _range_from_json, -} -_QUERY_PARAMS_FROM_JSON = dict(_CELLDATA_FROM_JSON) -_QUERY_PARAMS_FROM_JSON["TIMESTAMP"] = _timestamp_query_param_from_json +SCALAR_QUERY_PARAM_PARSER = ScalarQueryParamParser() def _field_to_index_mapping(schema): @@ -377,18 +461,6 @@ def _field_to_index_mapping(schema): return {f.name: i for i, f in enumerate(schema)} -def _field_from_json(resource, field): - def default_converter(value, field): - _warn_unknown_field_type(field) - return value - - converter = 
_CELLDATA_FROM_JSON.get(field.field_type, default_converter) - if field.mode == "REPEATED": - return [converter(item["v"], field) for item in resource] - else: - return converter(resource, field) - - def _row_tuple_from_json(row, schema): """Convert JSON row data to row with appropriate types. @@ -410,7 +482,7 @@ def _row_tuple_from_json(row, schema): row_data = [] for field, cell in zip(schema, row["f"]): - row_data.append(_field_from_json(cell["v"], field)) + row_data.append(CELL_DATA_PARSER.to_py(cell["v"], field)) return tuple(row_data) @@ -978,11 +1050,11 @@ def _build_resource_from_properties(obj, filter_fields): """ partial = {} for filter_field in filter_fields: - api_field = obj._PROPERTY_TO_API_FIELD.get(filter_field) + api_field = _get_sub_prop(obj._PROPERTY_TO_API_FIELD, filter_field) if api_field is None and filter_field not in obj._properties: raise ValueError("No property %s" % filter_field) elif api_field is not None: - partial[api_field] = obj._properties.get(api_field) + _set_sub_prop(partial, api_field, _get_sub_prop(obj._properties, api_field)) else: # allows properties that are not defined in the library # and properties that have the same name as API resource key @@ -1004,3 +1076,33 @@ def _verify_job_config_type(job_config, expected_type, param_name="job_config"): job_config=job_config, ) ) + + +def _isinstance_or_raise( + value: Any, + dtype: Union[Type, Tuple[Type, ...]], + none_allowed: Optional[bool] = False, +) -> Any: + """Determine whether a value type matches a given datatype or None. + Args: + value (Any): Value to be checked. + dtype (type): Expected data type or tuple of data types. + none_allowed Optional(bool): whether value is allowed to be None. Default + is False. + Returns: + Any: Returns the input value if the type check is successful. + Raises: + TypeError: If the input value's type does not match the expected data type(s). 
+ """ + if none_allowed and value is None: + return value + + if isinstance(value, dtype): + return value + + or_none = "" + if none_allowed: + or_none = " (or None)" + + msg = f"Pass {value} as a '{dtype}'{or_none}. Got {type(value)}." + raise TypeError(msg) diff --git a/google/cloud/bigquery/_job_helpers.py b/google/cloud/bigquery/_job_helpers.py index e66ab2763..30f89759e 100644 --- a/google/cloud/bigquery/_job_helpers.py +++ b/google/cloud/bigquery/_job_helpers.py @@ -35,18 +35,26 @@ predicates where it is safe to generate a new query ID. """ +from __future__ import annotations + import copy +import dataclasses +import datetime import functools -import os import uuid -from typing import Any, Dict, Optional, TYPE_CHECKING, Union +import textwrap +from typing import Any, Callable, Dict, Optional, TYPE_CHECKING, Union +import warnings import google.api_core.exceptions as core_exceptions from google.api_core import retry as retries +from google.cloud.bigquery import enums from google.cloud.bigquery import job +import google.cloud.bigquery.job.query import google.cloud.bigquery.query from google.cloud.bigquery import table +import google.cloud.bigquery.retry from google.cloud.bigquery.retry import POLLING_DEFAULT_VALUE # Avoid circular imports @@ -114,14 +122,21 @@ def query_jobs_insert( retry: Optional[retries.Retry], timeout: Optional[float], job_retry: Optional[retries.Retry], + *, + callback: Callable = lambda _: None, ) -> job.QueryJob: """Initiate a query using jobs.insert. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + + Args: + callback (Callable): + A callback function used by bigframes to report query progress. 
""" job_id_given = job_id is not None job_id_save = job_id job_config_save = job_config + query_sent_factory = QuerySentEventFactory() def do_query(): # Make a copy now, so that original doesn't get changed by the process @@ -134,6 +149,16 @@ def do_query(): try: query_job._begin(retry=retry, timeout=timeout) + if job_config is not None and not job_config.dry_run: + callback( + query_sent_factory( + query=query, + billing_project=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + request_id=None, + ) + ) except core_exceptions.Conflict as create_exc: # The thought is if someone is providing their own job IDs and they get # their job ID generation wrong, this could end up returning results for @@ -142,12 +167,28 @@ def do_query(): raise create_exc try: + # Sometimes we get a 404 after a Conflict. In this case, we + # have pretty high confidence that by retrying the 404, we'll + # (hopefully) eventually recover the job. + # https://github.com/googleapis/python-bigquery/issues/2134 + # + # Allow users who want to completely disable retries to + # continue to do so by setting retry to None. + get_job_retry = retry + if retry is not None: + # TODO(tswast): Amend the user's retry object with allowing + # 404 to retry when there's a public way to do so. + # https://github.com/googleapis/python-api-core/issues/796 + get_job_retry = ( + google.cloud.bigquery.retry._DEFAULT_GET_JOB_CONFLICT_RETRY + ) + query_job = client.get_job( job_id, project=project, location=location, - retry=retry, - timeout=timeout, + retry=get_job_retry, + timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, ) except core_exceptions.GoogleAPIError: # (includes RetryError) raise @@ -156,7 +197,13 @@ def do_query(): else: return query_job + # Allow users who want to completely disable retries to + # continue to do so by setting job_retry to None. 
+ if job_retry is not None: + do_query = google.cloud.bigquery.retry._DEFAULT_QUERY_JOB_INSERT_RETRY(do_query) + future = do_query() + # The future might be in a failed state now, but if it's # unrecoverable, we'll find out when we ask for it's result, at which # point, we may retry. @@ -175,12 +222,51 @@ def _validate_job_config(request_body: Dict[str, Any], invalid_key: str): raise ValueError(f"got unexpected key {repr(invalid_key)} in job_config") +def validate_job_retry(job_id: Optional[str], job_retry: Optional[retries.Retry]): + """Catch common mistakes, such as setting a job_id and job_retry at the same + time. + """ + if job_id is not None and job_retry is not None: + # TODO(tswast): To avoid breaking changes but still allow a default + # query job retry, we currently only raise if they explicitly set a + # job_retry other than the default. In a future version, we may want to + # avoid this check for DEFAULT_JOB_RETRY and always raise. + if job_retry is not google.cloud.bigquery.retry.DEFAULT_JOB_RETRY: + raise TypeError( + textwrap.dedent( + """ + `job_retry` was provided, but the returned job is + not retryable, because a custom `job_id` was + provided. To customize the job ID and allow for job + retries, set job_id_prefix, instead. + """ + ).strip() + ) + else: + warnings.warn( + textwrap.dedent( + """ + job_retry must be explicitly set to None if job_id is set. + BigQuery cannot retry a failed job by using the exact + same ID. Setting job_id without explicitly disabling + job_retry will raise an error in the future. To avoid this + warning, either use job_id_prefix instead (preferred) or + set job_retry=None. 
+ """ + ).strip(), + category=FutureWarning, + # user code -> client.query / client.query_and_wait -> validate_job_retry + stacklevel=3, + ) + + def _to_query_request( job_config: Optional[job.QueryJobConfig] = None, *, query: str, location: Optional[str] = None, timeout: Optional[float] = None, + timestamp_precision: Optional[enums.TimestampPrecision] = None, ) -> Dict[str, Any]: """Transform from Job resource to QueryRequest resource. @@ -201,10 +287,15 @@ def _to_query_request( # Default to standard SQL. request_body.setdefault("useLegacySql", False) - # Since jobs.query can return results, ensure we use the lossless timestamp - # format. See: https://github.com/googleapis/python-bigquery/issues/395 request_body.setdefault("formatOptions", {}) - request_body["formatOptions"]["useInt64Timestamp"] = True # type: ignore + + # Cannot specify both use_int64_timestamp and timestamp_output_format. + if timestamp_precision == enums.TimestampPrecision.PICOSECOND: + request_body["formatOptions"]["timestampOutputFormat"] = "ISO8601_STRING" # type: ignore + else: + # Since jobs.query can return results, ensure we use the lossless + # timestamp format. See: https://github.com/googleapis/python-bigquery/issues/395 + request_body["formatOptions"]["useInt64Timestamp"] = True # type: ignore if timeout is not None: # Subtract a buffer for context switching, network latency, etc. @@ -285,7 +376,8 @@ def query_jobs_query( project: str, retry: retries.Retry, timeout: Optional[float], - job_retry: retries.Retry, + job_retry: Optional[retries.Retry], + timestamp_precision: Optional[enums.TimestampPrecision] = None, ) -> job.QueryJob: """Initiate a query using jobs.query with jobCreationMode=JOB_CREATION_REQUIRED. 
@@ -293,7 +385,11 @@ def query_jobs_query( """ path = _to_query_path(project) request_body = _to_query_request( - query=query, job_config=job_config, location=location, timeout=timeout + query=query, + job_config=job_config, + location=location, + timeout=timeout, + timestamp_precision=timestamp_precision, ) def do_query(): @@ -334,15 +430,10 @@ def query_and_wait( job_retry: Optional[retries.Retry], page_size: Optional[int] = None, max_results: Optional[int] = None, + callback: Callable = lambda _: None, ) -> table.RowIterator: """Run the query, wait for it to finish, and return the results. - While ``jobCreationMode=JOB_CREATION_OPTIONAL`` is in preview in the - ``jobs.query`` REST API, use the default ``jobCreationMode`` unless - the environment variable ``QUERY_PREVIEW_ENABLED=true``. After - ``jobCreationMode`` is GA, this method will always use - ``jobCreationMode=JOB_CREATION_OPTIONAL``. See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query Args: client: @@ -359,9 +450,8 @@ def query_and_wait( location (Optional[str]): Location where to run the job. Must match the location of the table used in the query as well as the destination table. - project (Optional[str]): - Project ID of the project of where to run the job. Defaults - to the client's project. + project (str): + Project ID of the project of where to run the job. api_timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. @@ -385,6 +475,8 @@ def query_and_wait( request. Non-positive values are ignored. max_results (Optional[int]): The maximum total number of rows from this request. + callback (Callable): + A callback function used by bigframes to report query progress. 
Returns: google.cloud.bigquery.table.RowIterator: @@ -423,12 +515,14 @@ def query_and_wait( retry=retry, timeout=api_timeout, job_retry=job_retry, + callback=callback, ), api_timeout=api_timeout, wait_timeout=wait_timeout, retry=retry, page_size=page_size, max_results=max_results, + callback=callback, ) path = _to_query_path(project) @@ -437,14 +531,27 @@ def query_and_wait( request_body["maxResults"] = min(page_size, max_results) elif page_size is not None or max_results is not None: request_body["maxResults"] = page_size or max_results + if client.default_job_creation_mode: + request_body["jobCreationMode"] = client.default_job_creation_mode - if os.getenv("QUERY_PREVIEW_ENABLED", "").casefold() == "true": - request_body["jobCreationMode"] = "JOB_CREATION_OPTIONAL" + query_sent_factory = QuerySentEventFactory() def do_query(): - request_body["requestId"] = make_job_id() + request_id = make_job_id() + request_body["requestId"] = request_id span_attributes = {"path": path} + if "dryRun" not in request_body: + callback( + query_sent_factory( + query=query, + billing_project=project, + location=location, + job_id=None, + request_id=request_id, + ) + ) + # For easier testing, handle the retries ourselves. 
if retry is not None: response = retry(client._call_api)( @@ -487,8 +594,25 @@ def do_query(): retry=retry, page_size=page_size, max_results=max_results, + callback=callback, ) + if "dryRun" not in request_body: + callback( + QueryFinishedEvent( + billing_project=project, + location=query_results.location, + query_id=query_results.query_id, + job_id=query_results.job_id, + total_rows=query_results.total_rows, + total_bytes_processed=query_results.total_bytes_processed, + slot_millis=query_results.slot_millis, + destination=None, + created=query_results.created, + started=query_results.started, + ended=query_results.ended, + ) + ) return table.RowIterator( client=client, api_request=functools.partial(client._call_api, retry, timeout=api_timeout), @@ -503,6 +627,12 @@ def do_query(): query_id=query_results.query_id, project=query_results.project, num_dml_affected_rows=query_results.num_dml_affected_rows, + query=query, + total_bytes_processed=query_results.total_bytes_processed, + slot_millis=query_results.slot_millis, + created=query_results.created, + started=query_results.started, + ended=query_results.ended, ) if job_retry is not None: @@ -539,6 +669,10 @@ def _supported_by_jobs_query(request_body: Dict[str, Any]) -> bool: "maximumBytesBilled", "requestId", "createSession", + "writeIncrementalResults", + "jobTimeoutMs", + "reservation", + "maxSlots", } unsupported_keys = request_keys - keys_allowlist @@ -552,6 +686,8 @@ def _wait_or_cancel( retry: Optional[retries.Retry], page_size: Optional[int], max_results: Optional[int], + *, + callback: Callable = lambda _: None, ) -> table.RowIterator: """Wait for a job to complete and return the results. @@ -559,12 +695,43 @@ def _wait_or_cancel( the job. 
""" try: - return job.result( + if not job.dry_run: + callback( + QueryReceivedEvent( + billing_project=job.project, + location=job.location, + job_id=job.job_id, + statement_type=job.statement_type, + state=job.state, + query_plan=job.query_plan, + created=job.created, + started=job.started, + ended=job.ended, + ) + ) + query_results = job.result( page_size=page_size, max_results=max_results, retry=retry, timeout=wait_timeout, ) + if not job.dry_run: + callback( + QueryFinishedEvent( + billing_project=job.project, + location=query_results.location, + query_id=query_results.query_id, + job_id=query_results.job_id, + total_rows=query_results.total_rows, + total_bytes_processed=query_results.total_bytes_processed, + slot_millis=query_results.slot_millis, + destination=job.destination, + created=job.created, + started=job.started, + ended=job.ended, + ) + ) + return query_results except Exception: # Attempt to cancel the job since we can't return the results. try: @@ -573,3 +740,62 @@ def _wait_or_cancel( # Don't eat the original exception if cancel fails. 
pass raise + + +@dataclasses.dataclass(frozen=True) +class QueryFinishedEvent: + """Query finished successfully.""" + + billing_project: Optional[str] + location: Optional[str] + query_id: Optional[str] + job_id: Optional[str] + destination: Optional[table.TableReference] + total_rows: Optional[int] + total_bytes_processed: Optional[int] + slot_millis: Optional[int] + created: Optional[datetime.datetime] + started: Optional[datetime.datetime] + ended: Optional[datetime.datetime] + + +@dataclasses.dataclass(frozen=True) +class QueryReceivedEvent: + """Query received and acknowledged by the BigQuery API.""" + + billing_project: Optional[str] + location: Optional[str] + job_id: Optional[str] + statement_type: Optional[str] + state: Optional[str] + query_plan: Optional[list[google.cloud.bigquery.job.query.QueryPlanEntry]] + created: Optional[datetime.datetime] + started: Optional[datetime.datetime] + ended: Optional[datetime.datetime] + + +@dataclasses.dataclass(frozen=True) +class QuerySentEvent: + """Query sent to BigQuery.""" + + query: str + billing_project: Optional[str] + location: Optional[str] + job_id: Optional[str] + request_id: Optional[str] + + +class QueryRetryEvent(QuerySentEvent): + """Query sent another time because the previous attempt failed.""" + + +class QuerySentEventFactory: + """Creates a QuerySentEvent first, then QueryRetryEvent after that.""" + + def __init__(self): + self._event_constructor = QuerySentEvent + + def __call__(self, **kwargs): + result = self._event_constructor(**kwargs) + self._event_constructor = QueryRetryEvent + return result diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index c21a02569..7bd9f99b6 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -12,7 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Shared helper functions for connecting BigQuery and pandas.""" +"""Shared helper functions for connecting BigQuery and pandas. + +NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package, +instead. See: go/pandas-gbq-and-bigframes-redundancy and +https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pandas_to_bigquery.py +""" import concurrent.futures from datetime import datetime @@ -20,14 +25,18 @@ from itertools import islice import logging import queue +import threading +import time import warnings -from typing import Any, Union +from typing import Any, Union, Optional, Callable, Generator, List from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import _versions_helpers +from google.cloud.bigquery import retry as bq_retry from google.cloud.bigquery import schema + try: import pandas # type: ignore @@ -38,6 +47,16 @@ else: import numpy + +try: + import pandas_gbq.schema.pandas_to_bigquery # type: ignore + + pandas_gbq_import_exception = None +except ImportError as exc: + pandas_gbq = None + pandas_gbq_import_exception = exc + + try: import db_dtypes # type: ignore @@ -75,7 +94,7 @@ def _to_wkb(v): _to_wkb = _to_wkb() try: - from google.cloud.bigquery_storage import ArrowSerializationOptions + from google.cloud.bigquery_storage_v1.types import ArrowSerializationOptions except ImportError: _ARROW_COMPRESSION_SUPPORT = False else: @@ -118,6 +137,21 @@ def __init__(self): # be an atomic operation in the Python language definition (enforced by # the global interpreter lock). self.done = False + # To assist with testing and understanding the behavior of the + # download, use this object as shared state to track how many worker + # threads have started and have gracefully shutdown. 
+ self._started_workers_lock = threading.Lock() + self.started_workers = 0 + self._finished_workers_lock = threading.Lock() + self.finished_workers = 0 + + def start(self): + with self._started_workers_lock: + self.started_workers += 1 + + def finish(self): + with self._finished_workers_lock: + self.finished_workers += 1 BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { @@ -126,6 +160,7 @@ def __init__(self): b"ARROW:extension:metadata": b'{"encoding": "WKT"}', }, "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"}, + "JSON": {b"ARROW:extension:name": b"google:sqlType:json"}, } @@ -200,7 +235,7 @@ def bq_to_arrow_field(bq_field, array_type=None): # local NULL values. Arrow will gladly interpret these NULL values # as non-NULL and give you an arbitrary value. See: # https://github.com/googleapis/python-bigquery/issues/1692 - nullable=True, + nullable=False if bq_field.mode.upper() == "REPEATED" else True, metadata=metadata, ) @@ -428,6 +463,10 @@ def _first_array_valid(series): def dataframe_to_bq_schema(dataframe, bq_schema): """Convert a pandas DataFrame schema to a BigQuery schema. + DEPRECATED: Use + pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(), + instead. See: go/pandas-gbq-and-bigframes-redundancy. + Args: dataframe (pandas.DataFrame): DataFrame for which the client determines the BigQuery schema. @@ -443,6 +482,20 @@ def dataframe_to_bq_schema(dataframe, bq_schema): The automatically determined schema. Returns None if the type of any column cannot be determined. """ + if pandas_gbq is None: + warnings.warn( + "Loading pandas DataFrame into BigQuery will require pandas-gbq " + "package version 0.26.1 or greater in the future. 
" + f"Tried to import pandas-gbq and got: {pandas_gbq_import_exception}", + category=FutureWarning, + ) + else: + return pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + dataframe, + override_bigquery_fields=bq_schema, + index=True, + ) + if bq_schema: bq_schema = schema._to_schema_fields(bq_schema) bq_schema_index = {field.name: field for field in bq_schema} @@ -452,31 +505,37 @@ def dataframe_to_bq_schema(dataframe, bq_schema): bq_schema_unused = set() bq_schema_out = [] - unknown_type_fields = [] - + unknown_type_columns = [] + dataframe_reset_index = dataframe.reset_index() for column, dtype in list_columns_and_indexes(dataframe): - # Use provided type from schema, if present. + # Step 1: use provided type from schema, if present. bq_field = bq_schema_index.get(column) if bq_field: bq_schema_out.append(bq_field) bq_schema_unused.discard(bq_field.name) continue - # Otherwise, try to automatically determine the type based on the + # Step 2: try to automatically determine the type based on the # pandas dtype. bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) if bq_type is None: - sample_data = _first_valid(dataframe.reset_index()[column]) + sample_data = _first_valid(dataframe_reset_index[column]) if ( isinstance(sample_data, _BaseGeometry) and sample_data is not None # Paranoia ): bq_type = "GEOGRAPHY" - bq_field = schema.SchemaField(column, bq_type) - bq_schema_out.append(bq_field) + if bq_type is not None: + bq_schema_out.append(schema.SchemaField(column, bq_type)) + continue - if bq_field.field_type is None: - unknown_type_fields.append(bq_field) + # Step 3: try with pyarrow if available + bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column]) + if bq_field is not None: + bq_schema_out.append(bq_field) + continue + + unknown_type_columns.append(column) # Catch any schema mismatch. The developer explicitly asked to serialize a # column, but it was not found. 
@@ -487,98 +546,70 @@ def dataframe_to_bq_schema(dataframe, bq_schema): ) ) - # If schema detection was not successful for all columns, also try with - # pyarrow, if available. - if unknown_type_fields: - if not pyarrow: - msg = "Could not determine the type of columns: {}".format( - ", ".join(field.name for field in unknown_type_fields) - ) - warnings.warn(msg) - return None # We cannot detect the schema in full. - - # The augment_schema() helper itself will also issue unknown type - # warnings if detection still fails for any of the fields. - bq_schema_out = augment_schema(dataframe, bq_schema_out) + if unknown_type_columns != []: + msg = "Could not determine the type of columns: {}".format( + ", ".join(unknown_type_columns) + ) + warnings.warn(msg) + return None # We cannot detect the schema in full. - return tuple(bq_schema_out) if bq_schema_out else None + return tuple(bq_schema_out) -def augment_schema(dataframe, current_bq_schema): - """Try to deduce the unknown field types and return an improved schema. +def _get_schema_by_pyarrow(name, series): + """Attempt to detect the type of the given series by leveraging PyArrow's + type detection capabilities. - This function requires ``pyarrow`` to run. If all the missing types still - cannot be detected, ``None`` is returned. If all types are already known, - a shallow copy of the given schema is returned. + This function requires the ``pyarrow`` library to be installed and + available. If the series type cannot be determined or ``pyarrow`` is not + available, ``None`` is returned. Args: - dataframe (pandas.DataFrame): - DataFrame for which some of the field types are still unknown. - current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]): - A BigQuery schema for ``dataframe``. The types of some or all of - the fields may be ``None``. + name (str): + the column name of the SchemaField. + series (pandas.Series): + The Series data for which to detect the data type. 
Returns: - Optional[Sequence[google.cloud.bigquery.schema.SchemaField]] + Optional[google.cloud.bigquery.schema.SchemaField]: + A tuple containing the BigQuery-compatible type string (e.g., + "STRING", "INTEGER", "TIMESTAMP", "DATETIME", "NUMERIC", "BIGNUMERIC") + and the mode string ("NULLABLE", "REPEATED"). + Returns ``None`` if the type cannot be determined or ``pyarrow`` + is not imported. """ - # pytype: disable=attribute-error - augmented_schema = [] - unknown_type_fields = [] - for field in current_bq_schema: - if field.field_type is not None: - augmented_schema.append(field) - continue - - arrow_table = pyarrow.array(dataframe.reset_index()[field.name]) - - if pyarrow.types.is_list(arrow_table.type): - # `pyarrow.ListType` - detected_mode = "REPEATED" - detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq( - arrow_table.values.type.id - ) - # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds - # it to such datetimes, causing them to be recognized as TIMESTAMP type. - # We thus additionally check the actual data to see if we need to overrule - # that and choose DATETIME instead. - # Note that this should only be needed for datetime values inside a list, - # since scalar datetime values have a proper Pandas dtype that allows - # distinguishing between timezone-naive and timezone-aware values before - # even requiring the additional schema augment logic in this method. 
- if detected_type == "TIMESTAMP": - valid_item = _first_array_valid(dataframe[field.name]) - if isinstance(valid_item, datetime) and valid_item.tzinfo is None: - detected_type = "DATETIME" - else: - detected_mode = field.mode - detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id) - if detected_type == "NUMERIC" and arrow_table.type.scale > 9: - detected_type = "BIGNUMERIC" - - if detected_type is None: - unknown_type_fields.append(field) - continue + if not pyarrow: + return None - new_field = schema.SchemaField( - name=field.name, - field_type=detected_type, - mode=detected_mode, - description=field.description, - fields=field.fields, - ) - augmented_schema.append(new_field) + arrow_table = pyarrow.array(series) + if pyarrow.types.is_list(arrow_table.type): + # `pyarrow.ListType` + mode = "REPEATED" + type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id) + + # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds + # it to such datetimes, causing them to be recognized as TIMESTAMP type. + # We thus additionally check the actual data to see if we need to overrule + # that and choose DATETIME instead. + # Note that this should only be needed for datetime values inside a list, + # since scalar datetime values have a proper Pandas dtype that allows + # distinguishing between timezone-naive and timezone-aware values before + # even requiring the additional schema augment logic in this method. 
+ if type == "TIMESTAMP": + valid_item = _first_array_valid(series) + if isinstance(valid_item, datetime) and valid_item.tzinfo is None: + type = "DATETIME" + else: + mode = "NULLABLE" # default mode + type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id) + if type == "NUMERIC" and arrow_table.type.scale > 9: + type = "BIGNUMERIC" - if unknown_type_fields: - warnings.warn( - "Pyarrow could not determine the type of columns: {}.".format( - ", ".join(field.name for field in unknown_type_fields) - ) - ) + if type is not None: + return schema.SchemaField(name, type, mode) + else: return None - return augmented_schema - # pytype: enable=attribute-error - def dataframe_to_arrow(dataframe, bq_schema): """Convert pandas dataframe to Arrow table, using BigQuery schema. @@ -710,7 +741,7 @@ def _row_iterator_page_to_arrow(page, column_names, arrow_types): return pyarrow.RecordBatch.from_arrays(arrays, names=column_names) -def download_arrow_row_iterator(pages, bq_schema): +def download_arrow_row_iterator(pages, bq_schema, timeout=None): """Use HTTP JSON RowIterator to construct an iterable of RecordBatches. Args: @@ -721,6 +752,10 @@ def download_arrow_row_iterator(pages, bq_schema): Mapping[str, Any] \ ]]): A decription of the fields in result pages. + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Yields: :class:`pyarrow.RecordBatch` The next page of records as a ``pyarrow`` record batch. 
@@ -729,8 +764,16 @@ def download_arrow_row_iterator(pages, bq_schema): column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema] arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema] - for page in pages: - yield _row_iterator_page_to_arrow(page, column_names, arrow_types) + if timeout is None: + for page in pages: + yield _row_iterator_page_to_arrow(page, column_names, arrow_types) + else: + start_time = time.monotonic() + for page in pages: + if time.monotonic() - start_time > timeout: + raise concurrent.futures.TimeoutError() + + yield _row_iterator_page_to_arrow(page, column_names, arrow_types) def _row_iterator_page_to_dataframe(page, column_names, dtypes): @@ -748,7 +791,7 @@ def _row_iterator_page_to_dataframe(page, column_names, dtypes): return pandas.DataFrame(columns, columns=column_names) -def download_dataframe_row_iterator(pages, bq_schema, dtypes): +def download_dataframe_row_iterator(pages, bq_schema, dtypes, timeout=None): """Use HTTP JSON RowIterator to construct a DataFrame. Args: @@ -762,14 +805,27 @@ def download_dataframe_row_iterator(pages, bq_schema, dtypes): dtypes(Mapping[str, numpy.dtype]): The types of columns in result data to hint construction of the resulting DataFrame. Not all column types have to be specified. + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Yields: :class:`pandas.DataFrame` The next page of records as a ``pandas.DataFrame`` record batch. 
""" bq_schema = schema._to_schema_fields(bq_schema) column_names = [field.name for field in bq_schema] - for page in pages: - yield _row_iterator_page_to_dataframe(page, column_names, dtypes) + + if timeout is None: + for page in pages: + yield _row_iterator_page_to_dataframe(page, column_names, dtypes) + else: + start_time = time.monotonic() + for page in pages: + if time.monotonic() - start_time > timeout: + raise concurrent.futures.TimeoutError() + + yield _row_iterator_page_to_dataframe(page, column_names, dtypes) def _bqstorage_page_to_arrow(page): @@ -785,20 +841,35 @@ def _bqstorage_page_to_dataframe(column_names, dtypes, page): def _download_table_bqstorage_stream( download_state, bqstorage_client, session, stream, worker_queue, page_to_item ): - reader = bqstorage_client.read_rows(stream.name) - - # Avoid deprecation warnings for passing in unnecessary read session. - # https://github.com/googleapis/python-bigquery-storage/issues/229 - if _versions_helpers.BQ_STORAGE_VERSIONS.is_read_session_optional: - rowstream = reader.rows() - else: - rowstream = reader.rows(session) + download_state.start() + try: + reader = bqstorage_client.read_rows(stream.name) - for page in rowstream.pages: - if download_state.done: - return - item = page_to_item(page) - worker_queue.put(item) + # Avoid deprecation warnings for passing in unnecessary read session. + # https://github.com/googleapis/python-bigquery-storage/issues/229 + if _versions_helpers.BQ_STORAGE_VERSIONS.is_read_session_optional: + rowstream = reader.rows() + else: + rowstream = reader.rows(session) + + for page in rowstream.pages: + item = page_to_item(page) + + # Make sure we set a timeout on put() so that we give the worker + # thread opportunities to shutdown gracefully, for example if the + # parent thread shuts down or the parent generator object which + # collects rows from all workers goes out of scope. 
See: + # https://github.com/googleapis/python-bigquery/issues/2032 + while True: + if download_state.done: + return + try: + worker_queue.put(item, timeout=_PROGRESS_INTERVAL) + break + except queue.Full: + continue + finally: + download_state.finish() def _nowait(futures): @@ -816,18 +887,64 @@ def _nowait(futures): def _download_table_bqstorage( - project_id, - table, - bqstorage_client, - preserve_order=False, - selected_fields=None, - page_to_item=None, - max_queue_size=_MAX_QUEUE_SIZE_DEFAULT, -): - """Use (faster, but billable) BQ Storage API to construct DataFrame.""" + project_id: str, + table: Any, + bqstorage_client: Any, + preserve_order: bool = False, + selected_fields: Optional[List[Any]] = None, + page_to_item: Optional[Callable] = None, + max_queue_size: Any = _MAX_QUEUE_SIZE_DEFAULT, + max_stream_count: Optional[int] = None, + download_state: Optional[_DownloadState] = None, + timeout: Optional[float] = None, +) -> Generator[Any, None, None]: + """Downloads a BigQuery table using the BigQuery Storage API. + + This method uses the faster, but potentially more expensive, BigQuery + Storage API to download a table as a Pandas DataFrame. It supports + parallel downloads and optional data transformations. + + Args: + project_id (str): The ID of the Google Cloud project containing + the table. + table (Any): The BigQuery table to download. + bqstorage_client (Any): An + authenticated BigQuery Storage API client. + preserve_order (bool, optional): Whether to preserve the order + of the rows as they are read from BigQuery. If True this limits + the number of streams to one and overrides `max_stream_count`. + Defaults to False. + selected_fields (Optional[List[SchemaField]]): + A list of BigQuery schema fields to select for download. If None, + all fields are downloaded. Defaults to None. 
+ page_to_item (Optional[Callable]): An optional callable + function that takes a page of data from the BigQuery Storage API + max_stream_count (Optional[int]): The maximum number of + concurrent streams to use for downloading data. If `preserve_order` + is True, the requested streams are limited to 1 regardless of the + `max_stream_count` value. If 0 or None, then the number of + requested streams will be unbounded. Defaults to None. + download_state (Optional[_DownloadState]): + A threadsafe state object which can be used to observe the + behavior of the worker threads created by this method. + timeout (Optional[float]): + The number of seconds to wait for the download to complete. + If None, wait indefinitely. + + Yields: + pandas.DataFrame: Pandas DataFrames, one for each chunk of data + downloaded from BigQuery. + + Raises: + ValueError: If attempting to read from a specific partition or snapshot. + concurrent.futures.TimeoutError: + If the download does not complete within the specified timeout. + + Note: + This method requires the `google-cloud-bigquery-storage` library + to be installed. + """ - # Passing a BQ Storage client in implies that the BigQuery Storage library - # is available and can be imported. 
from google.cloud import bigquery_storage if "$" in table.table_id: @@ -837,10 +954,12 @@ def _download_table_bqstorage( if "@" in table.table_id: raise ValueError("Reading from a specific snapshot is not currently supported.") - requested_streams = 1 if preserve_order else 0 + start_time = time.monotonic() + requested_streams = determine_requested_streams(preserve_order, max_stream_count) - requested_session = bigquery_storage.types.ReadSession( - table=table.to_bqstorage(), data_format=bigquery_storage.types.DataFormat.ARROW + requested_session = bigquery_storage.types.stream.ReadSession( + table=table.to_bqstorage(), + data_format=bigquery_storage.types.stream.DataFormat.ARROW, ) if selected_fields is not None: for field in selected_fields: @@ -848,13 +967,20 @@ def _download_table_bqstorage( if _ARROW_COMPRESSION_SUPPORT: requested_session.read_options.arrow_serialization_options.buffer_compression = ( - ArrowSerializationOptions.CompressionCodec.LZ4_FRAME + # CompressionCodec(1) -> LZ4_FRAME + ArrowSerializationOptions.CompressionCodec(1) ) + retry_policy = ( + bq_retry.DEFAULT_RETRY.with_deadline(timeout) if timeout is not None else None + ) + session = bqstorage_client.create_read_session( parent="projects/{}".format(project_id), read_session=requested_session, max_stream_count=requested_streams, + retry=retry_policy, + timeout=timeout, ) _LOGGER.debug( @@ -871,7 +997,8 @@ def _download_table_bqstorage( # Use _DownloadState to notify worker threads when to quit. # See: https://stackoverflow.com/a/29237343/101923 - download_state = _DownloadState() + if download_state is None: + download_state = _DownloadState() # Create a queue to collect frames as they are created in each thread. 
# @@ -884,62 +1011,73 @@ def _download_table_bqstorage( elif max_queue_size is None: max_queue_size = 0 # unbounded - worker_queue = queue.Queue(maxsize=max_queue_size) + worker_queue: queue.Queue[int] = queue.Queue(maxsize=max_queue_size) - with concurrent.futures.ThreadPoolExecutor(max_workers=total_streams) as pool: - try: - # Manually submit jobs and wait for download to complete rather - # than using pool.map because pool.map continues running in the - # background even if there is an exception on the main thread. - # See: https://github.com/googleapis/google-cloud-python/pull/7698 - not_done = [ - pool.submit( - _download_table_bqstorage_stream, - download_state, - bqstorage_client, - session, - stream, - worker_queue, - page_to_item, - ) - for stream in session.streams - ] - - while not_done: - # Don't block on the worker threads. For performance reasons, - # we want to block on the queue's get method, instead. This - # prevents the queue from filling up, because the main thread - # has smaller gaps in time between calls to the queue's get - # method. For a detailed explaination, see: - # https://friendliness.dev/2019/06/18/python-nowait/ - done, not_done = _nowait(not_done) - for future in done: - # Call result() on any finished threads to raise any - # exceptions encountered. - future.result() - - try: - frame = worker_queue.get(timeout=_PROGRESS_INTERVAL) - yield frame - except queue.Empty: # pragma: NO COVER - continue + # Manually manage the pool to control shutdown behavior on timeout. + pool = concurrent.futures.ThreadPoolExecutor(max_workers=max(1, total_streams)) + wait_on_shutdown = True + try: + # Manually submit jobs and wait for download to complete rather + # than using pool.map because pool.map continues running in the + # background even if there is an exception on the main thread. 
+ # See: https://github.com/googleapis/google-cloud-python/pull/7698 + not_done = [ + pool.submit( + _download_table_bqstorage_stream, + download_state, + bqstorage_client, + session, + stream, + worker_queue, + page_to_item, + ) + for stream in session.streams + ] + + while not_done: + # Check for timeout + if timeout is not None: + elapsed = time.monotonic() - start_time + if elapsed > timeout: + wait_on_shutdown = False + raise concurrent.futures.TimeoutError( + f"Download timed out after {timeout} seconds." + ) + + # Don't block on the worker threads. For performance reasons, + # we want to block on the queue's get method, instead. This + # prevents the queue from filling up, because the main thread + # has smaller gaps in time between calls to the queue's get + # method. For a detailed explanation, see: + # https://friendliness.dev/2019/06/18/python-nowait/ + done, not_done = _nowait(not_done) + for future in done: + # Call result() on any finished threads to raise any + # exceptions encountered. + future.result() + + try: + frame = worker_queue.get(timeout=_PROGRESS_INTERVAL) + yield frame + except queue.Empty: # pragma: NO COVER + continue - # Return any remaining values after the workers finished. - while True: # pragma: NO COVER - try: - frame = worker_queue.get_nowait() - yield frame - except queue.Empty: # pragma: NO COVER - break - finally: - # No need for a lock because reading/replacing a variable is - # defined to be an atomic operation in the Python language - # definition (enforced by the global interpreter lock). - download_state.done = True + # Return any remaining values after the workers finished. + while True: # pragma: NO COVER + try: + frame = worker_queue.get_nowait() + yield frame + except queue.Empty: # pragma: NO COVER + break + finally: + # No need for a lock because reading/replacing a variable is + # defined to be an atomic operation in the Python language + # definition (enforced by the global interpreter lock). 
+ download_state.done = True - # Shutdown all background threads, now that they should know to - # exit early. - pool.shutdown(wait=True) + # Shutdown all background threads, now that they should know to + # exit early. + pool.shutdown(wait=wait_on_shutdown) def download_arrow_bqstorage( @@ -949,6 +1087,8 @@ def download_arrow_bqstorage( preserve_order=False, selected_fields=None, max_queue_size=_MAX_QUEUE_SIZE_DEFAULT, + max_stream_count=None, + timeout=None, ): return _download_table_bqstorage( project_id, @@ -958,6 +1098,8 @@ def download_arrow_bqstorage( selected_fields=selected_fields, page_to_item=_bqstorage_page_to_arrow, max_queue_size=max_queue_size, + max_stream_count=max_stream_count, + timeout=timeout, ) @@ -970,6 +1112,8 @@ def download_dataframe_bqstorage( preserve_order=False, selected_fields=None, max_queue_size=_MAX_QUEUE_SIZE_DEFAULT, + max_stream_count=None, + timeout=None, ): page_to_item = functools.partial(_bqstorage_page_to_dataframe, column_names, dtypes) return _download_table_bqstorage( @@ -980,6 +1124,8 @@ def download_dataframe_bqstorage( selected_fields=selected_fields, page_to_item=page_to_item, max_queue_size=max_queue_size, + max_stream_count=max_stream_count, + timeout=timeout, ) @@ -1024,3 +1170,40 @@ def verify_pandas_imports(): raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception if db_dtypes is None: raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception + + +def determine_requested_streams( + preserve_order: bool, + max_stream_count: Union[int, None], +) -> int: + """Determines the value of requested_streams based on the values of + `preserve_order` and `max_stream_count`. + + Args: + preserve_order (bool): Whether to preserve the order of streams. If True, + this limits the number of streams to one. `preserve_order` takes + precedence over `max_stream_count`. + max_stream_count (Union[int, None]]): The maximum number of streams + allowed. 
Must be a non-negative number or None, where None indicates + the value is unset. NOTE: if `preserve_order` is also set, it takes + precedence over `max_stream_count`, thus to ensure that `max_stream_count` + is used, ensure that `preserve_order` is None. + + Returns: + (int) The appropriate value for requested_streams. + """ + + if preserve_order: + # If preserve order is set, it takes precedence. + # Limit the requested streams to 1, to ensure that order + # is preserved) + return 1 + + elif max_stream_count is not None: + # If preserve_order is not set, only then do we consider max_stream_count + if max_stream_count <= -1: + raise ValueError("max_stream_count must be non-negative OR None") + return max_stream_count + + # Default to zero requested streams (unbounded). + return 0 diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py index 3c745a611..03c70bf63 100644 --- a/google/cloud/bigquery/_pyarrow_helpers.py +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -12,17 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Shared helper functions for connecting BigQuery and pyarrow.""" +"""Shared helper functions for connecting BigQuery and pyarrow. -from typing import Any +NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package, +instead. 
See: go/pandas-gbq-and-bigframes-redundancy, +https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py +and +https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py +""" -from packaging import version +from typing import Any try: import pyarrow # type: ignore except ImportError: pyarrow = None +try: + import db_dtypes # type: ignore + + db_dtypes_import_exception = None +except ImportError as exc: + db_dtypes = None + db_dtypes_import_exception = exc + def pyarrow_datetime(): return pyarrow.timestamp("us", tz=None) @@ -64,12 +77,18 @@ def pyarrow_timestamp(): "GEOGRAPHY": pyarrow.string, "INT64": pyarrow.int64, "INTEGER": pyarrow.int64, + # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0), + # but we'd like this to map as closely to the BQ Storage API as + # possible, which uses the string() dtype, as JSON support in Arrow + # predates JSON support in BigQuery by several years. + "JSON": pyarrow.string, "NUMERIC": pyarrow_numeric, "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, } + # DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead. _ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes pyarrow.bool_().id: "BOOL", @@ -94,20 +113,22 @@ def pyarrow_timestamp(): pyarrow.large_string().id: "STRING", # The exact scale and precision don't matter, see below. pyarrow.decimal128(38, scale=9).id: "NUMERIC", + # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType) + # have the same id (31 as of version 19.0.1), so these should not be + # matched by id. 
} - # Adds bignumeric support only if pyarrow version >= 3.0.0 - # Decimal256 support was added to arrow 3.0.0 - # https://arrow.apache.org/blog/2021/01/25/3.0.0-release/ - if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): - _BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric - # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal256 instances. - _ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" + _BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. + _ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" def bq_to_arrow_scalars(bq_scalar: str): """ + DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is + to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893. + Returns: The Arrow scalar type that the input BigQuery scalar type maps to. If it cannot find the BigQuery scalar, return None. @@ -117,6 +138,8 @@ def bq_to_arrow_scalars(bq_scalar: str): def arrow_scalar_ids_to_bq(arrow_scalar: Any): """ + DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead. + Returns: The BigQuery scalar type that the input arrow scalar type maps to. If it cannot find the arrow scalar, return None. 
diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py index 72d4c921d..cfbf70a8e 100644 --- a/google/cloud/bigquery/_versions_helpers.py +++ b/google/cloud/bigquery/_versions_helpers.py @@ -14,6 +14,7 @@ """Shared helper functions for verifying versions of installed modules.""" +import sys from typing import Any import packaging.version @@ -248,3 +249,16 @@ def try_import(self, raise_if_error: bool = False) -> Any: and PYARROW_VERSIONS.try_import() is not None and PYARROW_VERSIONS.installed_version >= _MIN_PYARROW_VERSION_RANGE ) + + +def extract_runtime_version(): + # Retrieve the version information + version_info = sys.version_info + + # Extract the major, minor, and micro components + major = version_info.major + minor = version_info.minor + micro = version_info.micro + + # Display the version number in a clear format + return major, minor, micro diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 1c222f2dd..54c8886cd 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -15,6 +15,7 @@ """Client for interacting with the Google BigQuery API.""" from __future__ import absolute_import +from __future__ import annotations from __future__ import division from collections import abc as collections_abc @@ -31,6 +32,7 @@ import typing from typing import ( Any, + Callable, Dict, IO, Iterable, @@ -44,6 +46,8 @@ import uuid import warnings +import requests + from google import resumable_media # type: ignore from google.resumable_media.requests import MultipartUpload # type: ignore from google.resumable_media.requests import ResumableUpload @@ -65,6 +69,7 @@ DEFAULT_BQSTORAGE_CLIENT_INFO = None # type: ignore +from google.auth.credentials import Credentials from google.cloud.bigquery._http import Connection from google.cloud.bigquery import _job_helpers from google.cloud.bigquery import _pandas_helpers @@ -87,7 +92,8 @@ from google.cloud.bigquery.dataset import 
Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference -from google.cloud.bigquery.enums import AutoRowIDs + +from google.cloud.bigquery.enums import AutoRowIDs, DatasetView, UpdateMode from google.cloud.bigquery.format_options import ParquetOptions from google.cloud.bigquery.job import ( CopyJob, @@ -126,15 +132,14 @@ _versions_helpers.PANDAS_VERSIONS.try_import() ) # mypy check fails because pandas import is outside module, there are type: ignore comments related to this + ResumableTimeoutType = Union[ None, float, Tuple[float, float] ] # for resumable media methods if typing.TYPE_CHECKING: # pragma: NO COVER # os.PathLike is only subscriptable in Python 3.9+, thus shielding with a condition. - PathType = Union[str, bytes, os.PathLike[str], os.PathLike[bytes]] - import requests # required by api-core - + PathType = Union[str, bytes, os.PathLike[str], os.PathLike[bytes], io.IOBase] _DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB _MAX_MULTIPART_SIZE = 5 * 1024 * 1024 _DEFAULT_NUM_RETRIES = 6 @@ -219,6 +224,10 @@ class Client(ClientWithProject): client_options (Optional[Union[google.api_core.client_options.ClientOptions, Dict]]): Client options used to set user options on the client. API Endpoint should be set through client_options. + default_job_creation_mode (Optional[str]): + Sets the default job creation mode used by query methods such as + query_and_wait(). For lightweight queries, JOB_CREATION_OPTIONAL is + generally recommended. 
Raises: google.auth.exceptions.DefaultCredentialsError: @@ -231,15 +240,24 @@ class Client(ClientWithProject): def __init__( self, - project=None, - credentials=None, - _http=None, - location=None, - default_query_job_config=None, - default_load_job_config=None, - client_info=None, - client_options=None, + project: Optional[str] = None, + credentials: Optional[Credentials] = None, + _http: Optional[requests.Session] = None, + location: Optional[str] = None, + default_query_job_config: Optional[QueryJobConfig] = None, + default_load_job_config: Optional[LoadJobConfig] = None, + client_info: Optional[google.api_core.client_info.ClientInfo] = None, + client_options: Optional[ + Union[google.api_core.client_options.ClientOptions, Dict[str, Any]] + ] = None, + default_job_creation_mode: Optional[str] = None, ) -> None: + if client_options is None: + client_options = {} + if isinstance(client_options, dict): + client_options = google.api_core.client_options.from_dict(client_options) + # assert isinstance(client_options, google.api_core.client_options.ClientOptions) + super(Client, self).__init__( project=project, credentials=credentials, @@ -247,14 +265,10 @@ def __init__( _http=_http, ) - kw_args = {"client_info": client_info} + kw_args: Dict[str, Any] = {"client_info": client_info} bq_host = _get_bigquery_host() kw_args["api_endpoint"] = bq_host if bq_host != _DEFAULT_HOST else None client_universe = None - if client_options is None: - client_options = {} - if isinstance(client_options, dict): - client_options = google.api_core.client_options.from_dict(client_options) if client_options.api_endpoint: api_endpoint = client_options.api_endpoint kw_args["api_endpoint"] = api_endpoint @@ -271,6 +285,7 @@ def __init__( self._connection = Connection(self, **kw_args) self._location = location self._default_load_job_config = copy.deepcopy(default_load_job_config) + self.default_job_creation_mode = default_job_creation_mode # Use property setter so validation can run. 
self.default_query_job_config = default_query_job_config @@ -280,6 +295,15 @@ def location(self): """Default location for jobs / datasets / tables.""" return self._location + @property + def default_job_creation_mode(self): + """Default job creation mode used for query execution.""" + return self._default_job_creation_mode + + @default_job_creation_mode.setter + def default_job_creation_mode(self, value: Optional[str]): + self._default_job_creation_mode = value + @property def default_query_job_config(self) -> Optional[QueryJobConfig]: """Default ``QueryJobConfig`` or ``None``. @@ -328,6 +352,15 @@ def get_service_account_email( ) -> str: """Get the email address of the project's BigQuery service account + Example: + + .. code-block:: python + + from google.cloud import bigquery + client = bigquery.Client() + client.get_service_account_email() + # returns an email similar to: my_service_account@my-project.iam.gserviceaccount.com + Note: This is the service account that BigQuery uses to manage tables encrypted by a key in KMS. @@ -345,13 +378,6 @@ def get_service_account_email( str: service account email address - Example: - - >>> from google.cloud import bigquery - >>> client = bigquery.Client() - >>> client.get_service_account_email() - my_service_account@my-project.iam.gserviceaccount.com - """ if project is None: project = self.project @@ -629,9 +655,19 @@ def create_dataset( ) -> Dataset: """API call: create the dataset via a POST request. + See https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/insert + Example: + + .. code-block:: python + + from google.cloud import bigquery + client = bigquery.Client() + dataset = bigquery.Dataset('my_project.my_dataset') + dataset = client.create_dataset(dataset) + Args: dataset (Union[ \ google.cloud.bigquery.dataset.Dataset, \ @@ -658,14 +694,6 @@ def create_dataset( Raises: google.cloud.exceptions.Conflict: If the dataset already exists. 
- - Example: - - >>> from google.cloud import bigquery - >>> client = bigquery.Client() - >>> dataset = bigquery.Dataset('my_project.my_dataset') - >>> dataset = client.create_dataset(dataset) - """ dataset = self._dataset_from_arg(dataset) if isinstance(dataset, DatasetReference): @@ -839,6 +867,7 @@ def get_dataset( dataset_ref: Union[DatasetReference, str], retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, + dataset_view: Optional[DatasetView] = None, ) -> Dataset: """Fetch the dataset referenced by ``dataset_ref`` @@ -856,7 +885,21 @@ def get_dataset( timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. - + dataset_view (Optional[google.cloud.bigquery.enums.DatasetView]): + Specifies the view that determines which dataset information is + returned. By default, dataset metadata (e.g. friendlyName, description, + labels, etc) and ACL information are returned. This argument can + take on the following possible enum values. + + * :attr:`~google.cloud.bigquery.enums.DatasetView.ACL`: + Includes dataset metadata and the ACL. + * :attr:`~google.cloud.bigquery.enums.DatasetView.FULL`: + Includes all dataset metadata, including the ACL and table metadata. + This view is not supported by the `datasets.list` API method. + * :attr:`~google.cloud.bigquery.enums.DatasetView.METADATA`: + Includes basic dataset metadata, but not the ACL. + * :attr:`~google.cloud.bigquery.enums.DatasetView.DATASET_VIEW_UNSPECIFIED`: + The server will decide which view to use. Currently defaults to FULL. Returns: google.cloud.bigquery.dataset.Dataset: A ``Dataset`` instance. 
@@ -866,6 +909,12 @@ def get_dataset( dataset_ref, default_project=self.project ) path = dataset_ref.path + + if dataset_view: + query_params = {"datasetView": dataset_view.value} + else: + query_params = {} + span_attributes = {"path": path} api_response = self._call_api( retry, @@ -874,6 +923,7 @@ def get_dataset( method="GET", path=path, timeout=timeout, + query_params=query_params, ) return Dataset.from_api_repr(api_response) @@ -1173,6 +1223,7 @@ def update_dataset( fields: Sequence[str], retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, + update_mode: Optional[UpdateMode] = None, ) -> Dataset: """Change some fields of a dataset. @@ -1180,6 +1231,19 @@ def update_dataset( must be provided. If a field is listed in ``fields`` and is ``None`` in ``dataset``, it will be deleted. + For example, to update the default expiration times, specify + both properties in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_dataset( + dataset, + [ + "default_partition_expiration_ms", + "default_table_expiration_ms", + ] + ) + If ``dataset.etag`` is not ``None``, the update will only succeed if the dataset on the server has the same ETag. Thus reading a dataset with ``get_dataset``, changing its fields, @@ -1194,24 +1258,25 @@ def update_dataset( The properties of ``dataset`` to change. These are strings corresponding to the properties of :class:`~google.cloud.bigquery.dataset.Dataset`. - - For example, to update the default expiration times, specify - both properties in the ``fields`` argument: - - .. code-block:: python - - bigquery_client.update_dataset( - dataset, - [ - "default_partition_expiration_ms", - "default_table_expiration_ms", - ] - ) retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. 
+ update_mode (Optional[google.cloud.bigquery.enums.UpdateMode]): + Specifies the kind of information to update in a dataset. + By default, dataset metadata (e.g. friendlyName, description, + labels, etc) and ACL information are updated. This argument can + take on the following possible enum values. + + * :attr:`~google.cloud.bigquery.enums.UPDATE_MODE_UNSPECIFIED`: + The default value. Behavior defaults to UPDATE_FULL. + * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_METADATA`: + Includes metadata information for the dataset, such as friendlyName, description, labels, etc. + * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_ACL`: + Includes ACL information for the dataset, which defines dataset access for one or more entities. + * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_FULL`: + Includes both dataset metadata and ACL information. Returns: google.cloud.bigquery.dataset.Dataset: @@ -1225,6 +1290,11 @@ def update_dataset( path = dataset.path span_attributes = {"path": path, "fields": fields} + if update_mode: + query_params = {"updateMode": update_mode.value} + else: + query_params = {} + api_response = self._call_api( retry, span_name="BigQuery.updateDataset", @@ -1234,6 +1304,7 @@ def update_dataset( data=partial, headers=headers, timeout=timeout, + query_params=query_params, ) return Dataset.from_api_repr(api_response) @@ -1250,6 +1321,15 @@ def update_model( must be provided. If a field is listed in ``fields`` and is ``None`` in ``model``, the field value will be deleted. + For example, to update the descriptive properties of the model, + specify them in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_model( + model, ["description", "friendly_name"] + ) + If ``model.etag`` is not ``None``, the update will only succeed if the model on the server has the same ETag. 
Thus reading a model with ``get_model``, changing its fields, and then passing it to @@ -1262,15 +1342,6 @@ def update_model( The properties of ``model`` to change. These are strings corresponding to the properties of :class:`~google.cloud.bigquery.model.Model`. - - For example, to update the descriptive properties of the model, - specify them in the ``fields`` argument: - - .. code-block:: python - - bigquery_client.update_model( - model, ["description", "friendly_name"] - ) retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. timeout (Optional[float]): @@ -1314,6 +1385,15 @@ def update_routine( must be provided. If a field is listed in ``fields`` and is ``None`` in ``routine``, the field value will be deleted. + For example, to update the description property of the routine, + specify it in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_routine( + routine, ["description"] + ) + .. warning:: During beta, partial updates are not supported. You must provide all fields in the resource. @@ -1332,15 +1412,6 @@ def update_routine( fields (Sequence[str]): The fields of ``routine`` to change, spelled as the :class:`~google.cloud.bigquery.routine.Routine` properties. - - For example, to update the description property of the routine, - specify it in the ``fields`` argument: - - .. code-block:: python - - bigquery_client.update_routine( - routine, ["description"] - ) retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. timeout (Optional[float]): @@ -1379,6 +1450,7 @@ def update_table( self, table: Table, fields: Sequence[str], + autodetect_schema: bool = False, retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, ) -> Table: @@ -1388,6 +1460,16 @@ def update_table( must be provided. If a field is listed in ``fields`` and is ``None`` in ``table``, the field value will be deleted. 
+ For example, to update the descriptive properties of the table, + specify them in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_table( + table, + ["description", "friendly_name"] + ) + If ``table.etag`` is not ``None``, the update will only succeed if the table on the server has the same ETag. Thus reading a table with ``get_table``, changing its fields, and then passing it to @@ -1399,16 +1481,10 @@ def update_table( fields (Sequence[str]): The fields of ``table`` to change, spelled as the :class:`~google.cloud.bigquery.table.Table` properties. - - For example, to update the descriptive properties of the table, - specify them in the ``fields`` argument: - - .. code-block:: python - - bigquery_client.update_table( - table, - ["description", "friendly_name"] - ) + autodetect_schema (bool): + Specifies if the schema of the table should be autodetected when + updating the table from the underlying source. Only applicable + for external tables. retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. timeout (Optional[float]): @@ -1428,12 +1504,18 @@ def update_table( path = table.path span_attributes = {"path": path, "fields": fields} + if autodetect_schema: + query_params = {"autodetect_schema": True} + else: + query_params = {} + api_response = self._call_api( retry, span_name="BigQuery.updateTable", span_attributes=span_attributes, method="PATCH", path=path, + query_params=query_params, data=partial, headers=headers, timeout=timeout, @@ -1965,6 +2047,7 @@ def _get_query_results( location: Optional[str] = None, timeout: TimeoutType = DEFAULT_TIMEOUT, page_size: int = 0, + start_index: Optional[int] = None, ) -> _QueryResults: """Get the query results object for a query job. @@ -1983,9 +2066,12 @@ def _get_query_results( before using ``retry``. If set, this connection timeout may be increased to a minimum value. This prevents retries on what would otherwise be a successful response. 
- page_size (int): + page_size (Optional[int]): Maximum number of rows in a single response. See maxResults in the jobs.getQueryResults REST API. + start_index (Optional[int]): + Zero-based index of the starting row. See startIndex in the + jobs.getQueryResults REST API. Returns: google.cloud.bigquery.query._QueryResults: @@ -2015,6 +2101,9 @@ def _get_query_results( if location is not None: extra_params["location"] = location + if start_index is not None: + extra_params["startIndex"] = start_index + path = "/projects/{}/queries/{}".format(project, job_id) # This call is typically made in a polling loop that checks whether the @@ -3378,8 +3467,10 @@ def query( project: Optional[str] = None, retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, - job_retry: retries.Retry = DEFAULT_JOB_RETRY, + job_retry: Optional[retries.Retry] = DEFAULT_JOB_RETRY, api_method: Union[str, enums.QueryApiMethod] = enums.QueryApiMethod.INSERT, + *, + timestamp_precision: Optional[enums.TimestampPrecision] = None, ) -> job.QueryJob: """Run a SQL query. @@ -3430,10 +3521,16 @@ def query( specified here becomes the default ``job_retry`` for ``result()``, where it can also be specified. api_method (Union[str, enums.QueryApiMethod]): - Method with which to start the query job. + Method with which to start the query job. By default, + the jobs.insert API is used for starting a query. See :class:`google.cloud.bigquery.enums.QueryApiMethod` for details on the difference between the query start methods. + timestamp_precision (Optional[enums.TimestampPrecision]): + [Private Preview] If set to `enums.TimestampPrecision.PICOSECOND`, + timestamp columns of picosecond precision will be returned with + full precision. Otherwise, will truncate to microsecond + precision. Only applies when api_method == `enums.QueryApiMethod.QUERY`. Returns: google.cloud.bigquery.job.QueryJob: A new query job instance. 
@@ -3445,23 +3542,23 @@ def query( class, or if both ``job_id`` and non-``None`` non-default ``job_retry`` are provided. """ - job_id_given = job_id is not None - if ( - job_id_given - and job_retry is not None - and job_retry is not DEFAULT_JOB_RETRY - ): - raise TypeError( - "`job_retry` was provided, but the returned job is" - " not retryable, because a custom `job_id` was" - " provided." - ) + _job_helpers.validate_job_retry(job_id, job_retry) + job_id_given = job_id is not None if job_id_given and api_method == enums.QueryApiMethod.QUERY: raise TypeError( "`job_id` was provided, but the 'QUERY' `api_method` was requested." ) + if ( + timestamp_precision == enums.TimestampPrecision.PICOSECOND + and api_method != enums.QueryApiMethod.QUERY + ): + raise ValueError( + "Picosecond Timestamp is only supported when `api_method " + "== enums.QueryApiMethod.QUERY`." + ) + if project is None: project = self.project @@ -3487,6 +3584,7 @@ def query( retry, timeout, job_retry, + timestamp_precision=timestamp_precision, ) elif api_method == enums.QueryApiMethod.INSERT: return _job_helpers.query_jobs_insert( @@ -3520,13 +3618,6 @@ def query_and_wait( ) -> RowIterator: """Run the query, wait for it to finish, and return the results. - While ``jobCreationMode=JOB_CREATION_OPTIONAL`` is in preview in the - ``jobs.query`` REST API, use the default ``jobCreationMode`` unless - the environment variable ``QUERY_PREVIEW_ENABLED=true``. After - ``jobCreationMode`` is GA, this method will always use - ``jobCreationMode=JOB_CREATION_OPTIONAL``. See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query - Args: query (str): SQL query to be executed. Defaults to the standard SQL @@ -3562,8 +3653,13 @@ def query_and_wait( rate-limit-exceeded errors. Passing ``None`` disables job retry. Not all jobs can be retried. page_size (Optional[int]): - The maximum number of rows in each page of results from this - request. Non-positive values are ignored. 
+ The maximum number of rows in each page of results from the + initial jobs.query request. Non-positive values are ignored. + + This parameter only affects the jobs.query and + jobs.getQueryResults API calls. Large results downloaded with + the BigQuery Storage Read API are intentionally unaffected + by this parameter. max_results (Optional[int]): The maximum total number of rows from this request. @@ -3585,6 +3681,39 @@ def query_and_wait( :class:`~google.cloud.bigquery.job.QueryJobConfig` class. """ + return self._query_and_wait_bigframes( + query, + job_config=job_config, + location=location, + project=project, + api_timeout=api_timeout, + wait_timeout=wait_timeout, + retry=retry, + job_retry=job_retry, + page_size=page_size, + max_results=max_results, + ) + + def _query_and_wait_bigframes( + self, + query, + *, + job_config: Optional[QueryJobConfig] = None, + location: Optional[str] = None, + project: Optional[str] = None, + api_timeout: TimeoutType = DEFAULT_TIMEOUT, + wait_timeout: Union[Optional[float], object] = POLLING_DEFAULT_VALUE, + retry: retries.Retry = DEFAULT_RETRY, + job_retry: retries.Retry = DEFAULT_JOB_RETRY, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + callback: Callable = lambda _: None, + ) -> RowIterator: + """See query_and_wait. + + This method has an extra callback parameter, which is used by bigframes + to create better progress bars. + """ if project is None: project = self.project @@ -3610,6 +3739,7 @@ def query_and_wait( job_retry=job_retry, page_size=page_size, max_results=max_results, + callback=callback, ) def insert_rows( @@ -3954,6 +4084,8 @@ def list_rows( page_size: Optional[int] = None, retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, + *, + timestamp_precision: Optional[enums.TimestampPrecision] = None, ) -> RowIterator: """List the rows of the table. @@ -4002,6 +4134,11 @@ def list_rows( before using ``retry``. 
If multiple requests are made under the hood, ``timeout`` applies to each individual request. + timestamp_precision (Optional[enums.TimestampPrecision]): + [Private Preview] If set to `enums.TimestampPrecision.PICOSECOND`, + timestamp columns of picosecond precision will be returned with + full precision. Otherwise, will truncate to microsecond + precision. Returns: google.cloud.bigquery.table.RowIterator: @@ -4035,7 +4172,12 @@ def list_rows( if start_index is not None: params["startIndex"] = start_index - params["formatOptions.useInt64Timestamp"] = True + # Cannot specify both use_int64_timestamp and timestamp_output_format. + if timestamp_precision == enums.TimestampPrecision.PICOSECOND: + params["formatOptions.timestampOutputFormat"] = "ISO8601_STRING" + else: + params["formatOptions.useInt64Timestamp"] = True + row_iterator = RowIterator( client=self, api_request=functools.partial(self._call_api, retry, timeout=timeout), @@ -4071,6 +4213,12 @@ def _list_rows_from_query_results( query_id: Optional[str] = None, first_page_response: Optional[Dict[str, Any]] = None, num_dml_affected_rows: Optional[int] = None, + query: Optional[str] = None, + total_bytes_processed: Optional[int] = None, + slot_millis: Optional[int] = None, + created: Optional[datetime.datetime] = None, + started: Optional[datetime.datetime] = None, + ended: Optional[datetime.datetime] = None, ) -> RowIterator: """List the rows of a completed query. See @@ -4118,6 +4266,18 @@ def _list_rows_from_query_results( num_dml_affected_rows (Optional[int]): If this RowIterator is the result of a DML query, the number of rows that were affected. + query (Optional[str]): + The query text used. + total_bytes_processed (Optional[int]): + total bytes processed from job statistics, if present. + slot_millis (Optional[int]): + Number of slot ms the user is actually billed for. + created (Optional[datetime.datetime]): + Datetime at which the job was created. 
+ started (Optional[datetime.datetime]): + Datetime at which the job was started. + ended (Optional[datetime.datetime]): + Datetime at which the job finished. Returns: google.cloud.bigquery.table.RowIterator: @@ -4155,6 +4315,12 @@ def _list_rows_from_query_results( query_id=query_id, first_page_response=first_page_response, num_dml_affected_rows=num_dml_affected_rows, + query=query, + total_bytes_processed=total_bytes_processed, + slot_millis=slot_millis, + created=created, + started=started, + ended=ended, ) return row_iterator diff --git a/google/cloud/bigquery/dataset.py b/google/cloud/bigquery/dataset.py index c49a52faf..878b77d41 100644 --- a/google/cloud/bigquery/dataset.py +++ b/google/cloud/bigquery/dataset.py @@ -17,8 +17,10 @@ from __future__ import absolute_import import copy +import json import typing +from typing import Optional, List, Dict, Any, Union import google.cloud._helpers # type: ignore @@ -27,8 +29,7 @@ from google.cloud.bigquery.routine import Routine, RoutineReference from google.cloud.bigquery.table import Table, TableReference from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration - -from typing import Optional, List, Dict, Any, Union +from google.cloud.bigquery import external_config def _get_table_reference(self, table_id: str) -> TableReference: @@ -298,12 +299,15 @@ def __init__( role: Optional[str] = None, entity_type: Optional[str] = None, entity_id: Optional[Union[Dict[str, Any], str]] = None, + **kwargs, ): - self._properties = {} + self._properties: Dict[str, Any] = {} if entity_type is not None: self._properties[entity_type] = entity_id self._properties["role"] = role - self._entity_type = entity_type + self._entity_type: Optional[str] = entity_type + for prop, val in kwargs.items(): + setattr(self, prop, val) @property def role(self) -> Optional[str]: @@ -330,6 +334,9 @@ def dataset(self, value): if isinstance(value, str): value = DatasetReference.from_string(value).to_api_repr() + if 
isinstance(value, DatasetReference): + value = value.to_api_repr() + if isinstance(value, (Dataset, DatasetListItem)): value = value.reference.to_api_repr() @@ -437,26 +444,89 @@ def special_group(self) -> Optional[str]: def special_group(self, value): self._properties["specialGroup"] = value + @property + def condition(self) -> Optional["Condition"]: + """Optional[Condition]: The IAM condition associated with this entry.""" + value = typing.cast(Dict[str, Any], self._properties.get("condition")) + return Condition.from_api_repr(value) if value else None + + @condition.setter + def condition(self, value: Union["Condition", dict, None]): + """Set the IAM condition for this entry.""" + if value is None: + self._properties["condition"] = None + elif isinstance(value, Condition): + self._properties["condition"] = value.to_api_repr() + elif isinstance(value, dict): + self._properties["condition"] = value + else: + raise TypeError("condition must be a Condition object, dict, or None") + @property def entity_type(self) -> Optional[str]: """The entity_type of the entry.""" + + # The api_repr for an AccessEntry object is expected to be a dict with + # only a few keys. Two keys that may be present are role and condition. + # Any additional key is going to have one of ~eight different names: + # userByEmail, groupByEmail, domain, dataset, specialGroup, view, + # routine, iamMember + + # if self._entity_type is None, see if it needs setting + # i.e. is there a key: value pair that should be associated with + # entity_type and entity_id? + if self._entity_type is None: + resource = self._properties.copy() + # we are empyting the dict to get to the last `key: value`` pair + # so we don't keep these first entries + _ = resource.pop("role", None) + _ = resource.pop("condition", None) + + try: + # we only need entity_type, because entity_id gets set elsewhere. 
+ entity_type, _ = resource.popitem() + except KeyError: + entity_type = None + + self._entity_type = entity_type + return self._entity_type @property def entity_id(self) -> Optional[Union[Dict[str, Any], str]]: """The entity_id of the entry.""" - return self._properties.get(self._entity_type) if self._entity_type else None + if self.entity_type: + entity_type = self.entity_type + else: + return None + return typing.cast( + Optional[Union[Dict[str, Any], str]], + self._properties.get(entity_type, None), + ) def __eq__(self, other): if not isinstance(other, AccessEntry): return NotImplemented - return self._key() == other._key() + return ( + self.role == other.role + and self.entity_type == other.entity_type + and self._normalize_entity_id(self.entity_id) + == self._normalize_entity_id(other.entity_id) + and self.condition == other.condition + ) + + @staticmethod + def _normalize_entity_id(value): + """Ensure consistent equality for dicts like 'view'.""" + if isinstance(value, dict): + return json.dumps(value, sort_keys=True) + return value def __ne__(self, other): return not self == other def __repr__(self): - return f"" + return f"" def _key(self): """A tuple key that uniquely describes this field. @@ -464,9 +534,18 @@ def _key(self): Returns: Tuple: The contents of this :class:`~google.cloud.bigquery.dataset.AccessEntry`. """ + properties = self._properties.copy() + + # Dicts are not hashable. 
+ # Convert condition to a hashable datatype(s) + condition = properties.get("condition") + if isinstance(condition, dict): + condition_key = tuple(sorted(condition.items())) + properties["condition"] = condition_key + prop_tup = tuple(sorted(properties.items())) - return (self.role, self._entity_type, self.entity_id, prop_tup) + return (self.role, self.entity_type, self.entity_id, prop_tup) def __hash__(self): return hash(self._key()) @@ -491,19 +570,10 @@ def from_api_repr(cls, resource: dict) -> "AccessEntry": Returns: google.cloud.bigquery.dataset.AccessEntry: Access entry parsed from ``resource``. - - Raises: - ValueError: - If the resource has more keys than ``role`` and one additional - key. """ - entry = resource.copy() - role = entry.pop("role", None) - entity_type, entity_id = entry.popitem() - if len(entry) != 0: - raise ValueError("Entry has unexpected keys remaining.", entry) - - return cls(role, entity_type, entity_id) + access_entry = cls() + access_entry._properties = resource.copy() + return access_entry class Dataset(object): @@ -517,6 +587,10 @@ class Dataset(object): A pointer to a dataset. If ``dataset_ref`` is a string, it must include both the project ID and the dataset ID, separated by ``.``. + + Note: + Fields marked as "Output Only" are populated by the server and will only be + available after calling :meth:`google.cloud.bigquery.client.Client.get_dataset`. 
""" _PROPERTY_TO_API_FIELD = { @@ -530,6 +604,9 @@ class Dataset(object): "storage_billing_model": "storageBillingModel", "max_time_travel_hours": "maxTimeTravelHours", "default_rounding_mode": "defaultRoundingMode", + "resource_tags": "resourceTags", + "external_catalog_dataset_options": "externalCatalogDatasetOptions", + "access_policy_version": "accessPolicyVersion", } def __init__(self, dataset_ref) -> None: @@ -632,7 +709,7 @@ def access_entries(self, value): @property def created(self): - """Union[datetime.datetime, None]: Datetime at which the dataset was + """Union[datetime.datetime, None]: Output only. Datetime at which the dataset was created (:data:`None` until set from the server). """ creation_time = self._properties.get("creationTime") @@ -649,8 +726,8 @@ def dataset_id(self): @property def full_dataset_id(self): - """Union[str, None]: ID for the dataset resource (:data:`None` until - set from the server) + """Union[str, None]: Output only. ID for the dataset resource + (:data:`None` until set from the server). In the format ``project_id:dataset_id``. """ @@ -665,14 +742,14 @@ def reference(self): @property def etag(self): - """Union[str, None]: ETag for the dataset resource (:data:`None` until - set from the server). + """Union[str, None]: Output only. ETag for the dataset resource + (:data:`None` until set from the server). """ return self._properties.get("etag") @property def modified(self): - """Union[datetime.datetime, None]: Datetime at which the dataset was + """Union[datetime.datetime, None]: Output only. Datetime at which the dataset was last modified (:data:`None` until set from the server). """ modified_time = self._properties.get("lastModifiedTime") @@ -684,8 +761,8 @@ def modified(self): @property def self_link(self): - """Union[str, None]: URL for the dataset resource (:data:`None` until - set from the server). + """Union[str, None]: Output only. URL for the dataset resource + (:data:`None` until set from the server). 
""" return self._properties.get("selfLink") @@ -801,6 +878,28 @@ def labels(self, value): raise ValueError("Pass a dict") self._properties["labels"] = value + @property + def resource_tags(self): + """Dict[str, str]: Resource tags of the dataset. + + Optional. The tags attached to this dataset. Tag keys are globally + unique. Tag key is expected to be in the namespaced format, for + example "123456789012/environment" where 123456789012 is + the ID of the parent organization or project resource for this tag + key. Tag value is expected to be the short name, for example + "Production". + + Raises: + ValueError: for invalid value types. + """ + return self._properties.setdefault("resourceTags", {}) + + @resource_tags.setter + def resource_tags(self, value): + if not isinstance(value, dict) and value is not None: + raise ValueError("Pass a dict") + self._properties["resourceTags"] = value + @property def default_encryption_configuration(self): """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom @@ -875,6 +974,39 @@ def storage_billing_model(self, value): ) self._properties["storageBillingModel"] = value + @property + def external_catalog_dataset_options(self): + """Options defining open source compatible datasets living in the + BigQuery catalog. 
Contains metadata of open source database, schema + or namespace represented by the current dataset.""" + + prop = _helpers._get_sub_prop( + self._properties, ["externalCatalogDatasetOptions"] + ) + + if prop is not None: + prop = external_config.ExternalCatalogDatasetOptions.from_api_repr(prop) + return prop + + @external_catalog_dataset_options.setter + def external_catalog_dataset_options(self, value): + value = _helpers._isinstance_or_raise( + value, external_config.ExternalCatalogDatasetOptions, none_allowed=True + ) + self._properties[ + self._PROPERTY_TO_API_FIELD["external_catalog_dataset_options"] + ] = (value.to_api_repr() if value is not None else None) + + @property + def access_policy_version(self): + return self._properties.get("accessPolicyVersion") + + @access_policy_version.setter + def access_policy_version(self, value): + if not isinstance(value, int) and value is not None: + raise ValueError("Pass an integer, or None") + self._properties["accessPolicyVersion"] = value + @classmethod def from_string(cls, full_dataset_id: str) -> "Dataset": """Construct a dataset from fully-qualified dataset ID. @@ -1026,3 +1158,130 @@ def reference(self): model = _get_model_reference routine = _get_routine_reference + + +class Condition(object): + """Represents a textual expression in the Common Expression Language (CEL) syntax. + + Typically used for filtering or policy rules, such as in IAM Conditions + or BigQuery row/column access policies. + + See: + https://cloud.google.com/iam/docs/reference/rest/Shared.Types/Expr + https://github.com/google/cel-spec + + Args: + expression (str): + The condition expression string using CEL syntax. This is required. + Example: ``resource.type == "compute.googleapis.com/Instance"`` + title (Optional[str]): + An optional title for the condition, providing a short summary. + Example: ``"Request is for a GCE instance"`` + description (Optional[str]): + An optional description of the condition, providing a detailed explanation. 
+ Example: ``"This condition checks whether the resource is a GCE instance."`` + """ + + def __init__( + self, + expression: str, + title: Optional[str] = None, + description: Optional[str] = None, + ): + self._properties: Dict[str, Any] = {} + # Use setters to initialize properties, which also handle validation + self.expression = expression + self.title = title + self.description = description + + @property + def title(self) -> Optional[str]: + """Optional[str]: The title for the condition.""" + return self._properties.get("title") + + @title.setter + def title(self, value: Optional[str]): + if value is not None and not isinstance(value, str): + raise ValueError("Pass a string for title, or None") + self._properties["title"] = value + + @property + def description(self) -> Optional[str]: + """Optional[str]: The description for the condition.""" + return self._properties.get("description") + + @description.setter + def description(self, value: Optional[str]): + if value is not None and not isinstance(value, str): + raise ValueError("Pass a string for description, or None") + self._properties["description"] = value + + @property + def expression(self) -> str: + """str: The expression string for the condition.""" + + # Cast assumes expression is always set due to __init__ validation + return typing.cast(str, self._properties.get("expression")) + + @expression.setter + def expression(self, value: str): + if not isinstance(value, str): + raise ValueError("Pass a non-empty string for expression") + if not value: + raise ValueError("expression cannot be an empty string") + self._properties["expression"] = value + + def to_api_repr(self) -> Dict[str, Any]: + """Construct the API resource representation of this Condition.""" + return self._properties + + @classmethod + def from_api_repr(cls, resource: Dict[str, Any]) -> "Condition": + """Factory: construct a Condition instance given its API representation.""" + + # Ensure required fields are present in the resource if 
necessary + if "expression" not in resource: + raise ValueError("API representation missing required 'expression' field.") + + return cls( + expression=resource["expression"], + title=resource.get("title"), + description=resource.get("description"), + ) + + def __eq__(self, other: object) -> bool: + """Check for equality based on expression, title, and description.""" + if not isinstance(other, Condition): + return NotImplemented + return self._key() == other._key() + + def _key(self): + """A tuple key that uniquely describes this field. + Used to compute this instance's hashcode and evaluate equality. + Returns: + Tuple: The contents of this :class:`~google.cloud.bigquery.dataset.AccessEntry`. + """ + + properties = self._properties.copy() + + # Dicts are not hashable. + # Convert object to a hashable datatype(s) + prop_tup = tuple(sorted(properties.items())) + return prop_tup + + def __ne__(self, other: object) -> bool: + """Check for inequality.""" + return not self == other + + def __hash__(self) -> int: + """Generate a hash based on expression, title, and description.""" + return hash(self._key()) + + def __repr__(self) -> str: + """Return a string representation of the Condition object.""" + parts = [f"expression={self.expression!r}"] + if self.title is not None: + parts.append(f"title={self.title!r}") + if self.description is not None: + parts.append(f"description={self.description!r}") + return f"Condition({', '.join(parts)})" diff --git a/google/cloud/bigquery/dbapi/cursor.py b/google/cloud/bigquery/dbapi/cursor.py index 014a6825e..bffd7678f 100644 --- a/google/cloud/bigquery/dbapi/cursor.py +++ b/google/cloud/bigquery/dbapi/cursor.py @@ -323,6 +323,8 @@ def _bqstorage_fetch(self, bqstorage_client): read_session=requested_session, # a single stream only, as DB API is not well-suited for multithreading max_stream_count=1, + retry=None, + timeout=None, ) if not read_session.streams: diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py 
index d8cbe9969..dc67f9674 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -80,6 +80,24 @@ class CreateDisposition(object): returned in the job result.""" +class DatasetView(enum.Enum): + """DatasetView specifies which dataset information is returned.""" + + DATASET_VIEW_UNSPECIFIED = "DATASET_VIEW_UNSPECIFIED" + """The default value. Currently maps to the FULL view.""" + + METADATA = "METADATA" + """View metadata information for the dataset, such as friendlyName, + description, labels, etc.""" + + ACL = "ACL" + """View ACL information for the dataset, which defines dataset access + for one or more entities.""" + + FULL = "FULL" + """View both dataset metadata and ACL information.""" + + class DefaultPandasDTypes(enum.Enum): """Default Pandas DataFrem DTypes to convert BigQuery data. These Sentinel values are used instead of None to maintain backward compatibility, @@ -246,6 +264,11 @@ class KeyResultStatementKind: class StandardSqlTypeNames(str, enum.Enum): + """Enum of allowed SQL type names in schema.SchemaField. + + Datatype used in GoogleSQL. + """ + def _generate_next_value_(name, start, count, last_values): return name @@ -267,6 +290,9 @@ def _generate_next_value_(name, start, count, last_values): ARRAY = enum.auto() STRUCT = enum.auto() RANGE = enum.auto() + # NOTE: FOREIGN acts as a wrapper for data types + # not natively understood by BigQuery unless translated + FOREIGN = enum.auto() class EntityTypes(str, enum.Enum): @@ -285,7 +311,10 @@ class EntityTypes(str, enum.Enum): # See also: https://cloud.google.com/bigquery/data-types#legacy_sql_data_types # and https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types class SqlTypeNames(str, enum.Enum): - """Enum of allowed SQL type names in schema.SchemaField.""" + """Enum of allowed SQL type names in schema.SchemaField. + + Datatype used in Legacy SQL. 
+ """ STRING = "STRING" BYTES = "BYTES" @@ -306,6 +335,9 @@ class SqlTypeNames(str, enum.Enum): DATETIME = "DATETIME" INTERVAL = "INTERVAL" # NOTE: not available in legacy types RANGE = "RANGE" # NOTE: not available in legacy types + # NOTE: FOREIGN acts as a wrapper for data types + # not natively understood by BigQuery unless translated + FOREIGN = "FOREIGN" class WriteDisposition(object): @@ -324,6 +356,10 @@ class WriteDisposition(object): WRITE_TRUNCATE = "WRITE_TRUNCATE" """If the table already exists, BigQuery overwrites the table data.""" + WRITE_TRUNCATE_DATA = "WRITE_TRUNCATE_DATA" + """For existing tables, truncate data but preserve existing schema + and constraints.""" + WRITE_EMPTY = "WRITE_EMPTY" """If the table already exists and contains data, a 'duplicate' error is returned in the job result.""" @@ -344,3 +380,118 @@ class DeterminismLevel: NOT_DETERMINISTIC = "NOT_DETERMINISTIC" """The UDF is not deterministic.""" + + +class RoundingMode(str, enum.Enum): + """Rounding mode options that can be used when storing NUMERIC or BIGNUMERIC + values. + + ROUNDING_MODE_UNSPECIFIED: will default to using ROUND_HALF_AWAY_FROM_ZERO. + + ROUND_HALF_AWAY_FROM_ZERO: rounds half values away from zero when applying + precision and scale upon writing of NUMERIC and BIGNUMERIC values. + For Scale: 0 + * 1.1, 1.2, 1.3, 1.4 => 1 + * 1.5, 1.6, 1.7, 1.8, 1.9 => 2 + + ROUND_HALF_EVEN: rounds half values to the nearest even value when applying + precision and scale upon writing of NUMERIC and BIGNUMERIC values. 
+ For Scale: 0 + * 1.1, 1.2, 1.3, 1.4 => 1 + * 1.5 => 2 + * 1.6, 1.7, 1.8, 1.9 => 2 + * 2.5 => 2 + """ + + def _generate_next_value_(name, start, count, last_values): + return name + + ROUNDING_MODE_UNSPECIFIED = enum.auto() + ROUND_HALF_AWAY_FROM_ZERO = enum.auto() + ROUND_HALF_EVEN = enum.auto() + + +class BigLakeFileFormat(object): + FILE_FORMAT_UNSPECIFIED = "FILE_FORMAT_UNSPECIFIED" + """The default unspecified value.""" + + PARQUET = "PARQUET" + """Apache Parquet format.""" + + +class BigLakeTableFormat(object): + TABLE_FORMAT_UNSPECIFIED = "TABLE_FORMAT_UNSPECIFIED" + """The default unspecified value.""" + + ICEBERG = "ICEBERG" + """Apache Iceberg format.""" + + +class UpdateMode(enum.Enum): + """Specifies the kind of information to update in a dataset.""" + + UPDATE_MODE_UNSPECIFIED = "UPDATE_MODE_UNSPECIFIED" + """The default value. Behavior defaults to UPDATE_FULL.""" + + UPDATE_METADATA = "UPDATE_METADATA" + """Includes metadata information for the dataset, such as friendlyName, + description, labels, etc.""" + + UPDATE_ACL = "UPDATE_ACL" + """Includes ACL information for the dataset, which defines dataset access + for one or more entities.""" + + UPDATE_FULL = "UPDATE_FULL" + """Includes both dataset metadata and ACL information.""" + + +class JobCreationMode(object): + """Documented values for Job Creation Mode.""" + + JOB_CREATION_MODE_UNSPECIFIED = "JOB_CREATION_MODE_UNSPECIFIED" + """Job creation mode is unspecified.""" + + JOB_CREATION_REQUIRED = "JOB_CREATION_REQUIRED" + """Job creation is always required.""" + + JOB_CREATION_OPTIONAL = "JOB_CREATION_OPTIONAL" + """Job creation is optional. + + Returning immediate results is prioritized. + BigQuery will automatically determine if a Job needs to be created. + The conditions under which BigQuery can decide to not create a Job are + subject to change. + """ + + +class SourceColumnMatch(str, enum.Enum): + """Uses sensible defaults based on how the schema is provided. 
+ If autodetect is used, then columns are matched by name. Otherwise, columns + are matched by position. This is done to keep the behavior backward-compatible. + """ + + SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED" + """Unspecified column name match option.""" + + POSITION = "POSITION" + """Matches by position. This assumes that the columns are ordered the same + way as the schema.""" + + NAME = "NAME" + """Matches by name. This reads the header row as column names and reorders + columns to match the field names in the schema.""" + + +class TimestampPrecision(enum.Enum): + """Precision (maximum number of total digits in base 10) for seconds of + TIMESTAMP type.""" + + MICROSECOND = None + """ + Default, for TIMESTAMP type with microsecond precision. + """ + + PICOSECOND = 12 + """ + For TIMESTAMP type with picosecond precision. + """ diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py index a891bc232..7e76f93b5 100644 --- a/google/cloud/bigquery/external_config.py +++ b/google/cloud/bigquery/external_config.py @@ -18,17 +18,21 @@ Job.configuration.query.tableDefinitions. 
""" -from __future__ import absolute_import +from __future__ import absolute_import, annotations import base64 import copy +import typing from typing import Any, Dict, FrozenSet, Iterable, Optional, Union from google.cloud.bigquery._helpers import _to_bytes from google.cloud.bigquery._helpers import _bytes_to_json from google.cloud.bigquery._helpers import _int_or_none from google.cloud.bigquery._helpers import _str_or_none +from google.cloud.bigquery import _helpers +from google.cloud.bigquery.enums import SourceColumnMatch from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions +from google.cloud.bigquery import schema from google.cloud.bigquery.schema import SchemaField @@ -471,6 +475,60 @@ def skip_leading_rows(self): def skip_leading_rows(self, value): self._properties["skipLeadingRows"] = str(value) + @property + def source_column_match(self) -> Optional[SourceColumnMatch]: + """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the + strategy used to match loaded columns to the schema. If not set, a sensible + default is chosen based on how the schema is provided. If autodetect is + used, then columns are matched by name. Otherwise, columns are matched by + position. This is done to keep the behavior backward-compatible. + + Acceptable values are: + + SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option. + POSITION: matches by position. This assumes that the columns are ordered + the same way as the schema. + NAME: matches by name. This reads the header row as column names and + reorders columns to match the field names in the schema. 
+ + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.source_column_match + """ + + value = self._properties.get("sourceColumnMatch") + return SourceColumnMatch(value) if value is not None else None + + @source_column_match.setter + def source_column_match(self, value: Union[SourceColumnMatch, str, None]): + if value is not None and not isinstance(value, (SourceColumnMatch, str)): + raise TypeError( + "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None" + ) + if isinstance(value, SourceColumnMatch): + value = value.value + self._properties["sourceColumnMatch"] = value if value else None + + @property + def null_markers(self) -> Optional[Iterable[str]]: + """Optional[Iterable[str]]: A list of strings represented as SQL NULL values in a CSV file. + + .. note:: + null_marker and null_markers can't be set at the same time. + If null_marker is set, null_markers has to be not set. + If null_markers is set, null_marker has to be not set. + If both null_marker and null_markers are set at the same time, a user error would be thrown. + Any strings listed in null_markers, including empty string would be interpreted as SQL NULL. + This applies to all column types. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_markers + """ + return self._properties.get("nullMarkers") + + @null_markers.setter + def null_markers(self, value: Optional[Iterable[str]]): + self._properties["nullMarkers"] = value + def to_api_repr(self) -> dict: """Build an API representation of this object. @@ -579,11 +637,7 @@ def from_api_repr(cls, resource: dict) -> "GoogleSheetsOptions": class HivePartitioningOptions(object): - """[Beta] Options that configure hive partitioning. - - .. note:: - **Experimental**. This feature is experimental and might change or - have limited support. + """Options that configure hive partitioning. 
See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#HivePartitioningOptions @@ -750,13 +804,9 @@ def decimal_target_types(self, value: Optional[Iterable[str]]): @property def hive_partitioning(self): - """Optional[:class:`~.external_config.HivePartitioningOptions`]: [Beta] When set, \ + """Optional[:class:`~.external_config.HivePartitioningOptions`]: When set, \ it configures hive partitioning support. - .. note:: - **Experimental**. This feature is experimental and might change or - have limited support. - See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.hive_partitioning_options """ @@ -833,7 +883,9 @@ def schema(self): See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.schema """ - prop = self._properties.get("schema", {}) + prop: Dict[str, Any] = typing.cast( + Dict[str, Any], self._properties.get("schema", {}) + ) return [SchemaField.from_api_repr(field) for field in prop.get("fields", [])] @schema.setter @@ -844,15 +896,83 @@ def schema(self, value): self._properties["schema"] = prop @property - def connection_id(self): - """Optional[str]: [Experimental] ID of a BigQuery Connection API - resource. + def date_format(self) -> Optional[str]: + """Optional[str]: Format used to parse DATE values. Supports C-style and SQL-style values. - .. WARNING:: + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.date_format + """ + result = self._properties.get("dateFormat") + return typing.cast(str, result) - This feature is experimental. Pre-GA features may have limited - support, and changes to pre-GA features may not be compatible with - other pre-GA versions. + @date_format.setter + def date_format(self, value: Optional[str]): + self._properties["dateFormat"] = value + + @property + def datetime_format(self) -> Optional[str]: + """Optional[str]: Format used to parse DATETIME values. 
Supports C-style + and SQL-style values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.datetime_format + """ + result = self._properties.get("datetimeFormat") + return typing.cast(str, result) + + @datetime_format.setter + def datetime_format(self, value: Optional[str]): + self._properties["datetimeFormat"] = value + + @property + def time_zone(self) -> Optional[str]: + """Optional[str]: Time zone used when parsing timestamp values that do not + have specific time zone information (e.g. 2024-04-20 12:34:56). The expected + format is an IANA timezone string (e.g. America/Los_Angeles). + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.time_zone + """ + + result = self._properties.get("timeZone") + return typing.cast(str, result) + + @time_zone.setter + def time_zone(self, value: Optional[str]): + self._properties["timeZone"] = value + + @property + def time_format(self) -> Optional[str]: + """Optional[str]: Format used to parse TIME values. Supports C-style and SQL-style values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.time_format + """ + result = self._properties.get("timeFormat") + return typing.cast(str, result) + + @time_format.setter + def time_format(self, value: Optional[str]): + self._properties["timeFormat"] = value + + @property + def timestamp_format(self) -> Optional[str]: + """Optional[str]: Format used to parse TIMESTAMP values. Supports C-style and SQL-style values. 
+ + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.timestamp_format + """ + result = self._properties.get("timestampFormat") + return typing.cast(str, result) + + @timestamp_format.setter + def timestamp_format(self, value: Optional[str]): + self._properties["timestampFormat"] = value + + @property + def connection_id(self): + """Optional[str]: ID of a BigQuery Connection API + resource. """ return self._properties.get("connectionId") @@ -1003,3 +1123,182 @@ def from_api_repr(cls, resource: dict) -> "ExternalConfig": config = cls(resource["sourceFormat"]) config._properties = copy.deepcopy(resource) return config + + +class ExternalCatalogDatasetOptions: + """Options defining open source compatible datasets living in the BigQuery catalog. + Contains metadata of open source database, schema or namespace represented + by the current dataset. + + Args: + default_storage_location_uri (Optional[str]): The storage location URI for all + tables in the dataset. Equivalent to hive metastore's database + locationUri. Maximum length of 1024 characters. (str) + parameters (Optional[dict[str, Any]]): A map of key value pairs defining the parameters + and properties of the open source schema. Maximum size of 2Mib. + """ + + def __init__( + self, + default_storage_location_uri: Optional[str] = None, + parameters: Optional[Dict[str, Any]] = None, + ): + self._properties: Dict[str, Any] = {} + self.default_storage_location_uri = default_storage_location_uri + self.parameters = parameters + + @property + def default_storage_location_uri(self) -> Optional[str]: + """Optional. The storage location URI for all tables in the dataset. + Equivalent to hive metastore's database locationUri. 
Maximum length of + 1024 characters.""" + + return self._properties.get("defaultStorageLocationUri") + + @default_storage_location_uri.setter + def default_storage_location_uri(self, value: Optional[str]): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["defaultStorageLocationUri"] = value + + @property + def parameters(self) -> Optional[Dict[str, Any]]: + """Optional. A map of key value pairs defining the parameters and + properties of the open source schema. Maximum size of 2Mib.""" + + return self._properties.get("parameters") + + @parameters.setter + def parameters(self, value: Optional[Dict[str, Any]]): + value = _helpers._isinstance_or_raise(value, dict, none_allowed=True) + self._properties["parameters"] = value + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + + Returns: + Dict[str, Any]: + A dictionary in the format used by the BigQuery API. + """ + return self._properties + + @classmethod + def from_api_repr(cls, api_repr: dict) -> ExternalCatalogDatasetOptions: + """Factory: constructs an instance of the class (cls) + given its API representation. + + Args: + api_repr (Dict[str, Any]): + API representation of the object to be instantiated. + + Returns: + An instance of the class initialized with data from 'resource'. + """ + config = cls() + config._properties = api_repr + return config + + +class ExternalCatalogTableOptions: + """Metadata about open source compatible table. The fields contained in these + options correspond to hive metastore's table level properties. + + Args: + connection_id (Optional[str]): The connection specifying the credentials to be + used to read external storage, such as Azure Blob, Cloud Storage, or + S3. The connection is needed to read the open source table from + BigQuery Engine. The connection_id can have the form `..` or + `projects//locations//connections/`. 
+ parameters (Union[Dict[str, Any], None]): A map of key value pairs defining the parameters + and properties of the open source table. Corresponds with hive meta + store table parameters. Maximum size of 4Mib. + storage_descriptor (Optional[StorageDescriptor]): A storage descriptor containing information + about the physical storage of this table. + """ + + def __init__( + self, + connection_id: Optional[str] = None, + parameters: Union[Dict[str, Any], None] = None, + storage_descriptor: Optional[schema.StorageDescriptor] = None, + ): + self._properties: Dict[str, Any] = {} + self.connection_id = connection_id + self.parameters = parameters + self.storage_descriptor = storage_descriptor + + @property + def connection_id(self) -> Optional[str]: + """Optional. The connection specifying the credentials to be + used to read external storage, such as Azure Blob, Cloud Storage, or + S3. The connection is needed to read the open source table from + BigQuery Engine. The connection_id can have the form `..` or + `projects//locations//connections/`. + """ + + return self._properties.get("connectionId") + + @connection_id.setter + def connection_id(self, value: Optional[str]): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["connectionId"] = value + + @property + def parameters(self) -> Union[Dict[str, Any], None]: + """Optional. A map of key value pairs defining the parameters and + properties of the open source table. Corresponds with hive meta + store table parameters. Maximum size of 4Mib. + """ + + return self._properties.get("parameters") + + @parameters.setter + def parameters(self, value: Union[Dict[str, Any], None]): + value = _helpers._isinstance_or_raise(value, dict, none_allowed=True) + self._properties["parameters"] = value + + @property + def storage_descriptor(self) -> Any: + """Optional. 
A storage descriptor containing information about the + physical storage of this table.""" + + prop = _helpers._get_sub_prop(self._properties, ["storageDescriptor"]) + + if prop is not None: + return schema.StorageDescriptor.from_api_repr(prop) + return None + + @storage_descriptor.setter + def storage_descriptor(self, value: Union[schema.StorageDescriptor, dict, None]): + value = _helpers._isinstance_or_raise( + value, (schema.StorageDescriptor, dict), none_allowed=True + ) + if isinstance(value, schema.StorageDescriptor): + self._properties["storageDescriptor"] = value.to_api_repr() + else: + self._properties["storageDescriptor"] = value + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + + Returns: + Dict[str, Any]: + A dictionary in the format used by the BigQuery API. + """ + + return self._properties + + @classmethod + def from_api_repr(cls, api_repr: dict) -> ExternalCatalogTableOptions: + """Factory: constructs an instance of the class (cls) + given its API representation. + + Args: + api_repr (Dict[str, Any]): + API representation of the object to be instantiated. + + Returns: + An instance of the class initialized with data from 'api_repr'. + """ + config = cls() + config._properties = api_repr + return config diff --git a/google/cloud/bigquery/format_options.py b/google/cloud/bigquery/format_options.py index ad5591b1c..e26b7a74f 100644 --- a/google/cloud/bigquery/format_options.py +++ b/google/cloud/bigquery/format_options.py @@ -13,7 +13,7 @@ # limitations under the License. 
import copy -from typing import Dict, Optional +from typing import Dict, Optional, Union class AvroOptions: @@ -106,7 +106,7 @@ def enable_list_inference(self, value: bool) -> None: self._properties["enableListInference"] = value @property - def map_target_type(self) -> str: + def map_target_type(self) -> Optional[Union[bool, str]]: """Indicates whether to simplify the representation of parquet maps to only show keys and values.""" return self._properties.get("mapTargetType") diff --git a/google/cloud/bigquery/job/__init__.py b/google/cloud/bigquery/job/__init__.py index f51311b0b..4cda65965 100644 --- a/google/cloud/bigquery/job/__init__.py +++ b/google/cloud/bigquery/job/__init__.py @@ -39,6 +39,7 @@ from google.cloud.bigquery.job.query import QueryPlanEntryStep from google.cloud.bigquery.job.query import ScriptOptions from google.cloud.bigquery.job.query import TimelineEntry +from google.cloud.bigquery.job.query import IncrementalResultStats from google.cloud.bigquery.enums import Compression from google.cloud.bigquery.enums import CreateDisposition from google.cloud.bigquery.enums import DestinationFormat @@ -84,4 +85,5 @@ "SourceFormat", "TransactionInfo", "WriteDisposition", + "IncrementalResultStats", ] diff --git a/google/cloud/bigquery/job/base.py b/google/cloud/bigquery/job/base.py index f165fd036..7576fc9aa 100644 --- a/google/cloud/bigquery/job/base.py +++ b/google/cloud/bigquery/job/base.py @@ -218,8 +218,62 @@ def job_timeout_ms(self, value): err.__traceback__ ) - """ Docs indicate a string is expected by the API """ - self._properties["jobTimeoutMs"] = str(value) + if value is not None: + # docs indicate a string is expected by the API + self._properties["jobTimeoutMs"] = str(value) + else: + self._properties.pop("jobTimeoutMs", None) + + @property + def max_slots(self) -> Optional[int]: + """The maximum rate of slot consumption to allow for this job. 
+ + If set, the number of slots used to execute the job will be throttled + to try and keep its slot consumption below the requested rate. + This feature is not generally available. + """ + + max_slots = self._properties.get("maxSlots") + if max_slots is not None: + if isinstance(max_slots, str): + return int(max_slots) + if isinstance(max_slots, int): + return max_slots + return None + + @max_slots.setter + def max_slots(self, value): + try: + value = _int_or_none(value) + except ValueError as err: + raise ValueError("Pass an int for max slots, e.g. 100").with_traceback( + err.__traceback__ + ) + + if value is not None: + self._properties["maxSlots"] = str(value) + else: + self._properties.pop("maxSlots", None) + + @property + def reservation(self): + """str: Optional. The reservation that job would use. + + User can specify a reservation to execute the job. If reservation is + not set, reservation is determined based on the rules defined by the + reservation assignments. The expected format is + projects/{project}/locations/{location}/reservations/{reservation}. + + Raises: + ValueError: If ``value`` type is not None or of string type. 
+ """ + return self._properties.setdefault("reservation", None) + + @reservation.setter + def reservation(self, value): + if value and not isinstance(value, str): + raise ValueError("Reservation must be None or a string.") + self._properties["reservation"] = value @property def labels(self): @@ -412,7 +466,7 @@ def __init__(self, job_id, client): @property def configuration(self) -> _JobConfig: """Job-type specific configurtion.""" - configuration = self._CONFIG_CLASS() + configuration: _JobConfig = self._CONFIG_CLASS() # pytype: disable=not-callable configuration._properties = self._properties.setdefault("configuration", {}) return configuration @@ -483,6 +537,18 @@ def location(self): """str: Location where the job runs.""" return _helpers._get_sub_prop(self._properties, ["jobReference", "location"]) + @property + def reservation_id(self): + """str: Name of the primary reservation assigned to this job. + + Note that this could be different than reservations reported in + the reservation field if parent reservations were used to execute + this job. + """ + return _helpers._get_sub_prop( + self._properties, ["statistics", "reservation_id"] + ) + def _require_client(self, client): """Check client or verify over-ride. @@ -627,7 +693,12 @@ def transaction_info(self) -> Optional[TransactionInfo]: @property def error_result(self): - """Error information about the job as a whole. + """Output only. Final error result of the job. + + If present, indicates that the job has completed and was unsuccessful. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatus.FIELDS.error_result Returns: Optional[Mapping]: the error information (None until set from the server). @@ -638,7 +709,13 @@ def error_result(self): @property def errors(self): - """Information about individual errors generated by the job. + """Output only. The first errors encountered during the running of the job. 
+ + The final message includes the number of errors that caused the process to stop. + Errors here do not necessarily mean that the job has not completed or was unsuccessful. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatus.FIELDS.errors Returns: Optional[List[Mapping]]: @@ -650,7 +727,12 @@ def errors(self): @property def state(self): - """Status of the job. + """Output only. Running state of the job. + + Valid states include 'PENDING', 'RUNNING', and 'DONE'. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatus.FIELDS.state Returns: Optional[str]: @@ -962,8 +1044,7 @@ def result( # type: ignore # (incompatible with supertype) if self.state is None: self._begin(retry=retry, timeout=timeout) - kwargs = {} if retry is DEFAULT_RETRY else {"retry": retry} - return super(_AsyncJob, self).result(timeout=timeout, **kwargs) + return super(_AsyncJob, self).result(timeout=timeout, retry=retry) def cancelled(self): """Check if the job has been cancelled. diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py index e56ce16f0..9c74f7124 100644 --- a/google/cloud/bigquery/job/load.py +++ b/google/cloud/bigquery/job/load.py @@ -15,9 +15,10 @@ """Classes for load jobs.""" import typing -from typing import FrozenSet, List, Iterable, Optional +from typing import FrozenSet, List, Iterable, Optional, Union from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration +from google.cloud.bigquery.enums import SourceColumnMatch from google.cloud.bigquery.external_config import HivePartitioningOptions from google.cloud.bigquery.format_options import ParquetOptions from google.cloud.bigquery import _helpers @@ -386,6 +387,27 @@ def null_marker(self): def null_marker(self, value): self._set_sub_prop("nullMarker", value) + @property + def null_markers(self) -> Optional[List[str]]: + """Optional[List[str]]: A list of strings represented as SQL NULL values in a CSV file. + + .. 
note:: + null_marker and null_markers can't be set at the same time. + If null_marker is set, null_markers has to be not set. + If null_markers is set, null_marker has to be not set. + If both null_marker and null_markers are set at the same time, a user error would be thrown. + Any strings listed in null_markers, including empty string would be interpreted as SQL NULL. + This applies to all column types. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_markers + """ + return self._get_sub_prop("nullMarkers") + + @null_markers.setter + def null_markers(self, value: Optional[List[str]]): + self._set_sub_prop("nullMarkers", value) + @property def preserve_ascii_control_characters(self): """Optional[bool]: Preserves the embedded ASCII control characters when sourceFormat is set to CSV. @@ -548,6 +570,105 @@ def source_format(self): def source_format(self, value): self._set_sub_prop("sourceFormat", value) + @property + def source_column_match(self) -> Optional[SourceColumnMatch]: + """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the + strategy used to match loaded columns to the schema. If not set, a sensible + default is chosen based on how the schema is provided. If autodetect is + used, then columns are matched by name. Otherwise, columns are matched by + position. This is done to keep the behavior backward-compatible. + + Acceptable values are: + + SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option. + POSITION: matches by position. This assumes that the columns are ordered + the same way as the schema. + NAME: matches by name. This reads the header row as column names and + reorders columns to match the field names in the schema. 
+ + See: + + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match + """ + value = self._get_sub_prop("sourceColumnMatch") + return SourceColumnMatch(value) if value is not None else None + + @source_column_match.setter + def source_column_match(self, value: Union[SourceColumnMatch, str, None]): + if value is not None and not isinstance(value, (SourceColumnMatch, str)): + raise TypeError( + "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None" + ) + if isinstance(value, SourceColumnMatch): + value = value.value + self._set_sub_prop("sourceColumnMatch", value if value else None) + + @property + def date_format(self) -> Optional[str]: + """Optional[str]: Date format used for parsing DATE values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.date_format + """ + return self._get_sub_prop("dateFormat") + + @date_format.setter + def date_format(self, value: Optional[str]): + self._set_sub_prop("dateFormat", value) + + @property + def datetime_format(self) -> Optional[str]: + """Optional[str]: Date format used for parsing DATETIME values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.datetime_format + """ + return self._get_sub_prop("datetimeFormat") + + @datetime_format.setter + def datetime_format(self, value: Optional[str]): + self._set_sub_prop("datetimeFormat", value) + + @property + def time_zone(self) -> Optional[str]: + """Optional[str]: Default time zone that will apply when parsing timestamp + values that have no specific time zone. 
+ + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_zone + """ + return self._get_sub_prop("timeZone") + + @time_zone.setter + def time_zone(self, value: Optional[str]): + self._set_sub_prop("timeZone", value) + + @property + def time_format(self) -> Optional[str]: + """Optional[str]: Date format used for parsing TIME values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_format + """ + return self._get_sub_prop("timeFormat") + + @time_format.setter + def time_format(self, value: Optional[str]): + self._set_sub_prop("timeFormat", value) + + @property + def timestamp_format(self) -> Optional[str]: + """Optional[str]: Date format used for parsing TIMESTAMP values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.timestamp_format + """ + return self._get_sub_prop("timestampFormat") + + @timestamp_format.setter + def timestamp_format(self, value: Optional[str]): + self._set_sub_prop("timestampFormat", value) + @property def time_partitioning(self): """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based @@ -638,6 +759,36 @@ def column_name_character_map(self, value: Optional[str]): value = ColumnNameCharacterMap.COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED self._set_sub_prop("columnNameCharacterMap", value) + @property + def timestamp_target_precision(self) -> Optional[List[int]]: + """Optional[list[int]]: [Private Preview] Precisions (maximum number of + total digits in base 10) for seconds of TIMESTAMP types that are + allowed to the destination table for autodetection mode. + + Available for the formats: CSV. + + For the CSV Format, Possible values include: + None, [], or [6]: timestamp(6) for all auto detected TIMESTAMP + columns. + [6, 12]: timestamp(6) for all auto detected TIMESTAMP columns that + have less than 6 digits of subseconds. 
timestamp(12) for all auto + detected TIMESTAMP columns that have more than 6 digits of + subseconds. + [12]: timestamp(12) for all auto detected TIMESTAMP columns. + + The order of the elements in this array is ignored. Inputs that have + higher precision than the highest target precision in this array will + be truncated. + """ + return self._get_sub_prop("timestampTargetPrecision") + + @timestamp_target_precision.setter + def timestamp_target_precision(self, value: Optional[List[int]]): + if value is not None: + self._set_sub_prop("timestampTargetPrecision", value) + else: + self._del_sub_prop("timestampTargetPrecision") + class LoadJob(_AsyncJob): """Asynchronous job for loading data into a table. @@ -788,6 +939,13 @@ def null_marker(self): """ return self.configuration.null_marker + @property + def null_markers(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.null_markers`. + """ + return self.configuration.null_markers + @property def quote_character(self): """See @@ -889,6 +1047,48 @@ def clustering_fields(self): """ return self.configuration.clustering_fields + @property + def source_column_match(self) -> Optional[SourceColumnMatch]: + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`. + """ + return self.configuration.source_column_match + + @property + def date_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.date_format`. + """ + return self.configuration.date_format + + @property + def datetime_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.datetime_format`. + """ + return self.configuration.datetime_format + + @property + def time_zone(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.time_zone`. + """ + return self.configuration.time_zone + + @property + def time_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.time_format`. 
+ """ + return self.configuration.time_format + + @property + def timestamp_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.timestamp_format`. + """ + return self.configuration.timestamp_format + @property def schema_update_options(self): """See diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 4ea5687e0..e82deb1ef 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -197,6 +197,66 @@ def from_api_repr(cls, stats: Dict[str, str]) -> "DmlStats": return cls(*args) +class IncrementalResultStats: + """IncrementalResultStats provides information about incremental query execution.""" + + def __init__(self): + self._properties = {} + + @classmethod + def from_api_repr(cls, resource) -> "IncrementalResultStats": + """Factory: construct instance from the JSON repr. + + Args: + resource(Dict[str: object]): + IncrementalResultStats representation returned from API. + + Returns: + google.cloud.bigquery.job.IncrementalResultStats: + stats parsed from ``resource``. + """ + entry = cls() + entry._properties = resource + return entry + + @property + def disabled_reason(self): + """Optional[string]: Reason why incremental results were not + written by the query. + """ + return _helpers._str_or_none(self._properties.get("disabledReason")) + + @property + def result_set_last_replace_time(self): + """Optional[datetime]: The time at which the result table's contents + were completely replaced. May be absent if no results have been written + or the query has completed.""" + from google.cloud._helpers import _rfc3339_nanos_to_datetime + + value = self._properties.get("resultSetLastReplaceTime") + if value: + try: + return _rfc3339_nanos_to_datetime(value) + except ValueError: + pass + return None + + @property + def result_set_last_modify_time(self): + """Optional[datetime]: The time at which the result table's contents + were modified. 
May be absent if no results have been written or the + query has completed.""" + from google.cloud._helpers import _rfc3339_nanos_to_datetime + + value = self._properties.get("resultSetLastModifyTime") + if value: + try: + return _rfc3339_nanos_to_datetime(value) + except ValueError: + pass + return None + + class IndexUnusedReason(typing.NamedTuple): """Reason about why no search index was used in the search query (or sub-query). @@ -476,6 +536,11 @@ def destination(self): ID, each separated by ``.``. For example: ``your-project.your_dataset.your_table``. + .. note:: + + Only table ID is passed to the backend, so any configuration + in `~google.cloud.bigquery.table.Table` is discarded. + See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.destination_table """ @@ -669,6 +734,21 @@ def write_disposition(self): def write_disposition(self, value): self._set_sub_prop("writeDisposition", value) + @property + def write_incremental_results(self) -> Optional[bool]: + """This is only supported for a SELECT query using a temporary table. + + If set, the query is allowed to write results incrementally to the temporary result + table. This may incur a performance penalty. This option cannot be used with Legacy SQL. + + This feature is not generally available. 
+ """ + return self._get_sub_prop("writeIncrementalResults") + + @write_incremental_results.setter + def write_incremental_results(self, value): + self._set_sub_prop("writeIncrementalResults", value) + @property def table_definitions(self): """Dict[str, google.cloud.bigquery.external_config.ExternalConfig]: @@ -1319,6 +1399,13 @@ def bi_engine_stats(self) -> Optional[BiEngineStats]: else: return BiEngineStats.from_api_repr(stats) + @property + def incremental_result_stats(self) -> Optional[IncrementalResultStats]: + stats = self._job_statistics().get("incrementalResultStats") + if stats is None: + return None + return IncrementalResultStats.from_api_repr(stats) + def _blocking_poll(self, timeout=None, **kwargs): self._done_timeout = timeout self._transport_timeout = timeout @@ -1389,6 +1476,7 @@ def _reload_query_results( retry: "retries.Retry" = DEFAULT_RETRY, timeout: Optional[float] = None, page_size: int = 0, + start_index: Optional[int] = None, ): """Refresh the cached query results unless already cached and complete. @@ -1401,6 +1489,9 @@ def _reload_query_results( page_size (int): Maximum number of rows in a single response. See maxResults in the jobs.getQueryResults REST API. + start_index (Optional[int]): + Zero-based index of the starting row. See startIndex in the + jobs.getQueryResults REST API. """ # Optimization: avoid a call to jobs.getQueryResults if it's already # been fetched, e.g. from jobs.query first page of results. @@ -1448,6 +1539,7 @@ def _reload_query_results( location=self.location, timeout=transport_timeout, page_size=page_size, + start_index=start_index, ) def result( # type: ignore # (incompatible with supertype) @@ -1509,7 +1601,7 @@ def result( # type: ignore # (incompatible with supertype) a DDL query, an ``_EmptyRowIterator`` instance is returned. Raises: - google.cloud.exceptions.GoogleAPICallError: + google.api_core.exceptions.GoogleAPICallError: If the job failed and retries aren't successful. 
concurrent.futures.TimeoutError: If the job did not complete in the given timeout. @@ -1525,6 +1617,8 @@ def result( # type: ignore # (incompatible with supertype) return _EmptyRowIterator( project=self.project, location=self.location, + schema=self.schema, + total_bytes_processed=self.total_bytes_processed, # Intentionally omit job_id and query_id since this doesn't # actually correspond to a finished query job. ) @@ -1550,6 +1644,9 @@ def result( # type: ignore # (incompatible with supertype) if page_size is not None: reload_query_results_kwargs["page_size"] = page_size + if start_index is not None: + reload_query_results_kwargs["start_index"] = start_index + try: retry_do_query = getattr(self, "_retry_do_query", None) if retry_do_query is not None: @@ -1709,7 +1806,11 @@ def is_job_done(): project=self.project, job_id=self.job_id, query_id=self.query_id, + schema=self.schema, num_dml_affected_rows=self._query_results.num_dml_affected_rows, + query=self.query, + total_bytes_processed=self.total_bytes_processed, + slot_millis=self.slot_millis, ) # We know that there's at least 1 row, so only treat the response from @@ -1736,6 +1837,12 @@ def is_job_done(): query_id=self.query_id, first_page_response=first_page_response, num_dml_affected_rows=self._query_results.num_dml_affected_rows, + query=self.query, + total_bytes_processed=self.total_bytes_processed, + slot_millis=self.slot_millis, + created=self.created, + started=self.started, + ended=self.ended, **list_rows_kwargs, ) rows._preserve_order = _contains_order_by(self.query) @@ -1750,6 +1857,7 @@ def to_arrow( bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, create_bqstorage_client: bool = True, max_results: Optional[int] = None, + timeout: Optional[float] = None, ) -> "pyarrow.Table": """[Beta] Create a class:`pyarrow.Table` by loading all pages of a table or query. @@ -1797,6 +1905,10 @@ def to_arrow( .. 
versionadded:: 2.21.0 + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Returns: pyarrow.Table A :class:`pyarrow.Table` populated with row data and column @@ -1814,6 +1926,7 @@ def to_arrow( progress_bar_type=progress_bar_type, bqstorage_client=bqstorage_client, create_bqstorage_client=create_bqstorage_client, + timeout=timeout, ) # If changing the signature of this method, make sure to apply the same @@ -1842,6 +1955,7 @@ def to_dataframe( range_timestamp_dtype: Union[ Any, None ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, + timeout: Optional[float] = None, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob @@ -2034,6 +2148,10 @@ def to_dataframe( .. versionadded:: 3.21.0 + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data @@ -2067,6 +2185,7 @@ def to_dataframe( range_date_dtype=range_date_dtype, range_datetime_dtype=range_datetime_dtype, range_timestamp_dtype=range_timestamp_dtype, + timeout=timeout, ) # If changing the signature of this method, make sure to apply the same @@ -2080,6 +2199,11 @@ def to_geodataframe( create_bqstorage_client: bool = True, max_results: Optional[int] = None, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, + timeout: Optional[float] = None, ) -> "geopandas.GeoDataFrame": """Return a GeoPandas GeoDataFrame from a QueryJob @@ -2130,6 +2254,37 @@ def to_geodataframe( identifies which one to use to construct a GeoPandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. 
+ bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. 
Returns: geopandas.GeoDataFrame: @@ -2153,6 +2308,11 @@ def to_geodataframe( progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, + timeout=timeout, ) def __iter__(self): diff --git a/google/cloud/bigquery/magics/magics.py b/google/cloud/bigquery/magics/magics.py index b153d959a..1f892b595 100644 --- a/google/cloud/bigquery/magics/magics.py +++ b/google/cloud/bigquery/magics/magics.py @@ -55,8 +55,7 @@ except ImportError: bigquery_magics = None - -IPYTHON_USER_AGENT = "ipython-{}".format(IPython.__version__) +IPYTHON_USER_AGENT = "ipython-{}".format(IPython.__version__) # type: ignore class Context(object): diff --git a/google/cloud/bigquery/model.py b/google/cloud/bigquery/model.py index 45a88ab22..16581be5a 100644 --- a/google/cloud/bigquery/model.py +++ b/google/cloud/bigquery/model.py @@ -58,7 +58,7 @@ def __init__(self, model_ref: Union["ModelReference", str, None]): # semantics. The BigQuery API makes a distinction between an unset # value, a null value, and a default value (0 or ""), but the protocol # buffer classes do not. 
- self._properties = {} + self._properties: Dict[str, Any] = {} if isinstance(model_ref, str): model_ref = ModelReference.from_string(model_ref) diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index f1090a7dc..170ed2976 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -18,11 +18,11 @@ import copy import datetime import decimal -from typing import Any, Optional, Dict, Union +from typing import Any, cast, Optional, Dict, Union from google.cloud.bigquery.table import _parse_schema_resource +from google.cloud.bigquery import _helpers from google.cloud.bigquery._helpers import _rows_from_json -from google.cloud.bigquery._helpers import _QUERY_PARAMS_FROM_JSON from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM from google.cloud.bigquery._helpers import _SUPPORTED_RANGE_ELEMENTS @@ -571,6 +571,9 @@ def from_api_repr(cls, resource: dict) -> "ScalarQueryParameter": Returns: google.cloud.bigquery.query.ScalarQueryParameter: Instance """ + # Import here to avoid circular imports. + from google.cloud.bigquery import schema + name = resource.get("name") type_ = resource["parameterType"]["type"] @@ -578,7 +581,9 @@ def from_api_repr(cls, resource: dict) -> "ScalarQueryParameter": # from the back-end - the latter omits it for None values. value = resource.get("parameterValue", {}).get("value") if value is not None: - converted = _QUERY_PARAMS_FROM_JSON[type_](value, None) + converted = _helpers.SCALAR_QUERY_PARAM_PARSER.to_py( + value, schema.SchemaField(cast(str, name), type_) + ) else: converted = None @@ -693,13 +698,20 @@ def _from_api_repr_struct(cls, resource): @classmethod def _from_api_repr_scalar(cls, resource): + """Converts REST resource into a list of scalar values.""" + # Import here to avoid circular imports. 
+ from google.cloud.bigquery import schema + name = resource.get("name") array_type = resource["parameterType"]["arrayType"]["type"] parameter_value = resource.get("parameterValue", {}) array_values = parameter_value.get("arrayValues", ()) values = [value["value"] for value in array_values] converted = [ - _QUERY_PARAMS_FROM_JSON[array_type](value, None) for value in values + _helpers.SCALAR_QUERY_PARAM_PARSER.to_py( + value, schema.SchemaField(name, array_type) + ) + for value in values ] return cls(name, array_type, converted) @@ -850,6 +862,9 @@ def from_api_repr(cls, resource: dict) -> "StructQueryParameter": Returns: google.cloud.bigquery.query.StructQueryParameter: Instance """ + # Import here to avoid circular imports. + from google.cloud.bigquery import schema + name = resource.get("name") instance = cls(name) type_resources = {} @@ -877,7 +892,9 @@ def from_api_repr(cls, resource: dict) -> "StructQueryParameter": converted = ArrayQueryParameter.from_api_repr(struct_resource) else: value = value["value"] - converted = _QUERY_PARAMS_FROM_JSON[type_](value, None) + converted = _helpers.SCALAR_QUERY_PARAM_PARSER.to_py( + value, schema.SchemaField(cast(str, name), type_) + ) instance.struct_values[key] = converted return instance @@ -1211,11 +1228,18 @@ def location(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.job_reference + or https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.location Returns: str: Job ID of the query job. """ - return self._properties.get("jobReference", {}).get("location") + location = self._properties.get("jobReference", {}).get("location") + + # Sometimes there's no job, but we still want to get the location + # information. Prefer the value from job for backwards compatibility. 
+ if not location: + location = self._properties.get("location") + return location @property def query_id(self) -> Optional[str]: @@ -1265,6 +1289,20 @@ def total_bytes_processed(self): if total_bytes_processed is not None: return int(total_bytes_processed) + @property + def slot_millis(self): + """Total number of slot ms the user is actually billed for. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.total_slot_ms + + Returns: + Optional[int]: Count generated on the server (None until set by the server). + """ + slot_millis = self._properties.get("totalSlotMs") + if slot_millis is not None: + return int(slot_millis) + @property def num_dml_affected_rows(self): """Total number of rows affected by a DML query. @@ -1279,6 +1317,56 @@ def num_dml_affected_rows(self): if num_dml_affected_rows is not None: return int(num_dml_affected_rows) + @property + def created(self): + """Creation time of this query. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.creation_time + + Returns: + Optional[datetime.datetime]: + the creation time (None until set from the server). + """ + millis = self._properties.get("creationTime") + if millis is not None: + return _helpers._datetime_from_microseconds(int(millis) * 1000.0) + + @property + def started(self): + """Start time of this query. + + This field will be present when the query transitions from the + PENDING state to either RUNNING or DONE. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.start_time + + Returns: + Optional[datetime.datetime]: + the start time (None until set from the server). + """ + millis = self._properties.get("startTime") + if millis is not None: + return _helpers._datetime_from_microseconds(int(millis) * 1000.0) + + @property + def ended(self): + """End time of this query. + + This field will be present whenever a query is in the DONE state. 
+ + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.end_time + + Returns: + Optional[datetime.datetime]: + the end time (None until set from the server). + """ + millis = self._properties.get("endTime") + if millis is not None: + return _helpers._datetime_from_microseconds(int(millis) * 1000.0) + @property def rows(self): """Query results. @@ -1312,7 +1400,7 @@ def _set_properties(self, api_response): api_response (Dict): Response returned from an API call """ self._properties.clear() - self._properties.update(copy.deepcopy(api_response)) + self._properties.update(api_response) def _query_param_from_api_repr(resource): diff --git a/google/cloud/bigquery/retry.py b/google/cloud/bigquery/retry.py index 10958980d..6fd458df5 100644 --- a/google/cloud/bigquery/retry.py +++ b/google/cloud/bigquery/retry.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging + from google.api_core import exceptions from google.api_core import retry import google.api_core.future.polling from google.auth import exceptions as auth_exceptions # type: ignore import requests.exceptions +_LOGGER = logging.getLogger(__name__) _RETRYABLE_REASONS = frozenset( ["rateLimitExceeded", "backendError", "internalError", "badGateway"] @@ -61,14 +64,17 @@ def _should_retry(exc): """Predicate for determining when to retry. - We retry if and only if the 'reason' is 'backendError' - or 'rateLimitExceeded'. + We retry if and only if the 'reason' is in _RETRYABLE_REASONS or is + in _UNSTRUCTURED_RETRYABLE_TYPES. """ - if not hasattr(exc, "errors") or len(exc.errors) == 0: - # Check for unstructured error returns, e.g. from GFE + try: + reason = exc.errors[0]["reason"] + except (AttributeError, IndexError, TypeError, KeyError): + # Fallback for when errors attribute is missing, empty, or not a dict + # or doesn't contain "reason" (e.g. gRPC exceptions). 
+ _LOGGER.debug("Inspecting unstructured error for retry: %r", exc) return isinstance(exc, _UNSTRUCTURED_RETRYABLE_TYPES) - reason = exc.errors[0]["reason"] return reason in _RETRYABLE_REASONS @@ -82,6 +88,32 @@ def _should_retry(exc): pass ``retry=bigquery.DEFAULT_RETRY.with_deadline(30)``. """ + +def _should_retry_get_job_conflict(exc): + """Predicate for determining when to retry a jobs.get call after a conflict error. + + Sometimes we get a 404 after a Conflict. In this case, we + have pretty high confidence that by retrying the 404, we'll + (hopefully) eventually recover the job. + https://github.com/googleapis/python-bigquery/issues/2134 + + Note: we may be able to extend this to user-specified predicates + after https://github.com/googleapis/python-api-core/issues/796 + to tweak existing Retry object predicates. + """ + return isinstance(exc, exceptions.NotFound) or _should_retry(exc) + + +# Pick a deadline smaller than our other deadlines since we want to timeout +# before those expire. +_DEFAULT_GET_JOB_CONFLICT_DEADLINE = _DEFAULT_RETRY_DEADLINE / 3.0 +_DEFAULT_GET_JOB_CONFLICT_RETRY = retry.Retry( + predicate=_should_retry_get_job_conflict, + deadline=_DEFAULT_GET_JOB_CONFLICT_DEADLINE, +) +"""Private, may be removed in future.""" + + # Note: Take care when updating DEFAULT_TIMEOUT to anything but None. We # briefly had a default timeout, but even setting it at more than twice the # theoretical server-side default timeout of 2 minutes was not enough for @@ -95,9 +127,8 @@ def _should_retry(exc): """ job_retry_reasons = ( - "rateLimitExceeded", - "backendError", - "internalError", + "jobBackendError", + "jobInternalError", "jobRateLimitExceeded", ) @@ -142,6 +173,34 @@ def _job_should_retry(exc): The default job retry object. """ + +def _query_job_insert_should_retry(exc): + # Per https://github.com/googleapis/python-bigquery/issues/2134, sometimes + # we get a 404 error. 
In this case, if we get this far, assume that the job + # doesn't actually exist and try again. We can't add 404 to the default + # job_retry because that happens for errors like "this table does not + # exist", which probably won't resolve with a retry. + if isinstance(exc, exceptions.RetryError): + exc = exc.cause + + if isinstance(exc, exceptions.NotFound): + message = exc.message + # Don't try to retry table/dataset not found, just job not found. + # The URL contains jobs, so use whitespace to disambiguate. + return message is not None and " job" in message.lower() + + return _job_should_retry(exc) + + +_DEFAULT_QUERY_JOB_INSERT_RETRY = retry.Retry( + predicate=_query_job_insert_should_retry, + # jobs.insert doesn't wait for the job to complete, so we don't need the + # long _DEFAULT_JOB_DEADLINE for this part. + deadline=_DEFAULT_RETRY_DEADLINE, +) +"""Private, may be removed in future.""" + + DEFAULT_GET_JOB_TIMEOUT = 128 """ Default timeout for Client.get_job(). diff --git a/google/cloud/bigquery/routine/__init__.py b/google/cloud/bigquery/routine/__init__.py index e576b0d49..025103957 100644 --- a/google/cloud/bigquery/routine/__init__.py +++ b/google/cloud/bigquery/routine/__init__.py @@ -21,6 +21,7 @@ from google.cloud.bigquery.routine.routine import RoutineReference from google.cloud.bigquery.routine.routine import RoutineType from google.cloud.bigquery.routine.routine import RemoteFunctionOptions +from google.cloud.bigquery.routine.routine import ExternalRuntimeOptions __all__ = ( @@ -30,4 +31,5 @@ "RoutineReference", "RoutineType", "RemoteFunctionOptions", + "ExternalRuntimeOptions", ) diff --git a/google/cloud/bigquery/routine/routine.py b/google/cloud/bigquery/routine/routine.py index 83cb6362d..c5aa8750e 100644 --- a/google/cloud/bigquery/routine/routine.py +++ b/google/cloud/bigquery/routine/routine.py @@ -15,8 +15,8 @@ # limitations under the License. 
"""Define resources for the BigQuery Routines API.""" - -from typing import Any, Dict, Optional +import typing +from typing import Any, Dict, Optional, Union import google.cloud._helpers # type: ignore from google.cloud.bigquery import _helpers @@ -69,6 +69,7 @@ class Routine(object): "determinism_level": "determinismLevel", "remote_function_options": "remoteFunctionOptions", "data_governance_type": "dataGovernanceType", + "external_runtime_options": "externalRuntimeOptions", } def __init__(self, routine_ref, **kwargs) -> None: @@ -216,7 +217,7 @@ def return_type(self, value: StandardSqlDataType): self._properties[self._PROPERTY_TO_API_FIELD["return_type"]] = resource @property - def return_table_type(self) -> Optional[StandardSqlTableType]: + def return_table_type(self) -> Union[StandardSqlTableType, Any, None]: """The return type of a Table Valued Function (TVF) routine. .. versionadded:: 2.22.0 @@ -349,6 +350,37 @@ def data_governance_type(self, value): ) self._properties[self._PROPERTY_TO_API_FIELD["data_governance_type"]] = value + @property + def external_runtime_options(self): + """Optional[google.cloud.bigquery.routine.ExternalRuntimeOptions]: + Configures the external runtime options for a routine. + + Raises: + ValueError: + If the value is not + :class:`~google.cloud.bigquery.routine.ExternalRuntimeOptions` or + :data:`None`. 
+ """ + prop = self._properties.get( + self._PROPERTY_TO_API_FIELD["external_runtime_options"] + ) + if prop is not None: + return ExternalRuntimeOptions.from_api_repr(prop) + + @external_runtime_options.setter + def external_runtime_options(self, value): + api_repr = value + if isinstance(value, ExternalRuntimeOptions): + api_repr = value.to_api_repr() + elif value is not None: + raise ValueError( + "value must be google.cloud.bigquery.routine.ExternalRuntimeOptions " + "or None" + ) + self._properties[ + self._PROPERTY_TO_API_FIELD["external_runtime_options"] + ] = api_repr + @classmethod def from_api_repr(cls, resource: dict) -> "Routine": """Factory: construct a routine given its API representation. @@ -518,17 +550,17 @@ def __init__(self): @property def project(self): """str: ID of the project containing the routine.""" - return self._properties["projectId"] # pytype: disable=key-error + return self._properties.get("projectId", "") @property def dataset_id(self): """str: ID of dataset containing the routine.""" - return self._properties["datasetId"] # pytype: disable=key-error + return self._properties.get("datasetId", "") @property def routine_id(self): """str: The routine ID.""" - return self._properties["routineId"] # pytype: disable=key-error + return self._properties.get("routineId", "") @property def path(self): @@ -736,3 +768,154 @@ def __repr__(self): for property_name in sorted(self._PROPERTY_TO_API_FIELD) ] return "RemoteFunctionOptions({})".format(", ".join(all_properties)) + + +class ExternalRuntimeOptions(object): + """Options for the runtime of the external system. + + Args: + container_memory (str): + Optional. Amount of memory provisioned for a Python UDF container + instance. Format: {number}{unit} where unit is one of "M", "G", "Mi" + and "Gi" (e.g. 1G, 512Mi). If not specified, the default value is + 512Mi. For more information, see `Configure container limits for + Python UDFs `_ + container_cpu (int): + Optional. 
Amount of CPU provisioned for a Python UDF container + instance. For more information, see `Configure container limits + for Python UDFs `_ + runtime_connection (str): + Optional. Fully qualified name of the connection whose service account + will be used to execute the code in the container. Format: + "projects/{projectId}/locations/{locationId}/connections/{connectionId}" + max_batching_rows (int): + Optional. Maximum number of rows in each batch sent to the external + runtime. If absent or if 0, BigQuery dynamically decides the number of + rows in a batch. + runtime_version (str): + Optional. Language runtime version. Example: python-3.11. + """ + + _PROPERTY_TO_API_FIELD = { + "container_memory": "containerMemory", + "container_cpu": "containerCpu", + "runtime_connection": "runtimeConnection", + "max_batching_rows": "maxBatchingRows", + "runtime_version": "runtimeVersion", + } + + def __init__( + self, + container_memory: Optional[str] = None, + container_cpu: Optional[int] = None, + runtime_connection: Optional[str] = None, + max_batching_rows: Optional[int] = None, + runtime_version: Optional[str] = None, + _properties: Optional[Dict] = None, + ) -> None: + if _properties is None: + _properties = {} + self._properties = _properties + + if container_memory is not None: + self.container_memory = container_memory + if container_cpu is not None: + self.container_cpu = container_cpu + if runtime_connection is not None: + self.runtime_connection = runtime_connection + if max_batching_rows is not None: + self.max_batching_rows = max_batching_rows + if runtime_version is not None: + self.runtime_version = runtime_version + + @property + def container_memory(self) -> Optional[str]: + """Optional. 
Amount of memory provisioned for a Python UDF container instance.""" + return _helpers._str_or_none(self._properties.get("containerMemory")) + + @container_memory.setter + def container_memory(self, value: Optional[str]): + if value is not None and not isinstance(value, str): + raise ValueError("container_memory must be a string or None.") + self._properties["containerMemory"] = value + + @property + def container_cpu(self) -> Optional[int]: + """Optional. Amount of CPU provisioned for a Python UDF container instance.""" + return _helpers._int_or_none(self._properties.get("containerCpu")) + + @container_cpu.setter + def container_cpu(self, value: Optional[int]): + if value is not None and not isinstance(value, int): + raise ValueError("container_cpu must be an integer or None.") + self._properties["containerCpu"] = value + + @property + def runtime_connection(self) -> Optional[str]: + """Optional. Fully qualified name of the connection.""" + return _helpers._str_or_none(self._properties.get("runtimeConnection")) + + @runtime_connection.setter + def runtime_connection(self, value: Optional[str]): + if value is not None and not isinstance(value, str): + raise ValueError("runtime_connection must be a string or None.") + self._properties["runtimeConnection"] = value + + @property + def max_batching_rows(self) -> Optional[int]: + """Optional. Maximum number of rows in each batch sent to the external runtime.""" + return typing.cast( + int, _helpers._int_or_none(self._properties.get("maxBatchingRows")) + ) + + @max_batching_rows.setter + def max_batching_rows(self, value: Optional[int]): + if value is not None and not isinstance(value, int): + raise ValueError("max_batching_rows must be an integer or None.") + self._properties["maxBatchingRows"] = _helpers._str_or_none(value) + + @property + def runtime_version(self) -> Optional[str]: + """Optional. 
Language runtime version.""" + return _helpers._str_or_none(self._properties.get("runtimeVersion")) + + @runtime_version.setter + def runtime_version(self, value: Optional[str]): + if value is not None and not isinstance(value, str): + raise ValueError("runtime_version must be a string or None.") + self._properties["runtimeVersion"] = value + + @classmethod + def from_api_repr(cls, resource: dict) -> "ExternalRuntimeOptions": + """Factory: construct external runtime options given its API representation. + Args: + resource (Dict[str, object]): Resource, as returned from the API. + Returns: + google.cloud.bigquery.routine.ExternalRuntimeOptions: + Python object, as parsed from ``resource``. + """ + ref = cls() + ref._properties = resource + return ref + + def to_api_repr(self) -> dict: + """Construct the API resource representation of this ExternalRuntimeOptions. + Returns: + Dict[str, object]: External runtime options represented as an API resource. + """ + return self._properties + + def __eq__(self, other): + if not isinstance(other, ExternalRuntimeOptions): + return NotImplemented + return self._properties == other._properties + + def __ne__(self, other): + return not self == other + + def __repr__(self): + all_properties = [ + "{}={}".format(property_name, repr(getattr(self, property_name))) + for property_name in sorted(self._PROPERTY_TO_API_FIELD) + ] + return "ExternalRuntimeOptions({})".format(", ".join(all_properties)) diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index f5b03cbef..1809df21f 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -14,19 +14,22 @@ """Schemas for BigQuery tables / queries.""" -import collections +from __future__ import annotations import enum -from typing import Any, Dict, Iterable, Optional, Union, cast +import typing +from typing import Any, cast, Dict, Iterable, Optional, Union, Sequence +from google.cloud.bigquery import _helpers from google.cloud.bigquery 
import standard_sql +from google.cloud.bigquery import enums from google.cloud.bigquery.enums import StandardSqlTypeNames _STRUCT_TYPES = ("RECORD", "STRUCT") # SQL types reference: -# https://cloud.google.com/bigquery/data-types#legacy_sql_data_types -# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types +# LEGACY SQL: https://cloud.google.com/bigquery/data-types#legacy_sql_data_types +# GoogleSQL: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types LEGACY_TO_STANDARD_TYPES = { "STRING": StandardSqlTypeNames.STRING, "BYTES": StandardSqlTypeNames.BYTES, @@ -45,6 +48,7 @@ "DATE": StandardSqlTypeNames.DATE, "TIME": StandardSqlTypeNames.TIME, "DATETIME": StandardSqlTypeNames.DATETIME, + "FOREIGN": StandardSqlTypeNames.FOREIGN, # no direct conversion from ARRAY, the latter is represented by mode="REPEATED" } """String names of the legacy SQL types to integer codes of Standard SQL standard_sql.""" @@ -163,6 +167,43 @@ class SchemaField(object): the type is RANGE, this field is required. Possible values for the field element type of a RANGE include `DATE`, `DATETIME` and `TIMESTAMP`. + + rounding_mode: Union[enums.RoundingMode, str, None] + Specifies the rounding mode to be used when storing values of + NUMERIC and BIGNUMERIC type. + + Unspecified will default to using ROUND_HALF_AWAY_FROM_ZERO. + ROUND_HALF_AWAY_FROM_ZERO rounds half values away from zero + when applying precision and scale upon writing of NUMERIC and BIGNUMERIC + values. + + For Scale: 0 + 1.1, 1.2, 1.3, 1.4 => 1 + 1.5, 1.6, 1.7, 1.8, 1.9 => 2 + + ROUND_HALF_EVEN rounds half values to the nearest even value + when applying precision and scale upon writing of NUMERIC and BIGNUMERIC + values. + + For Scale: 0 + 1.1, 1.2, 1.3, 1.4 => 1 + 1.5 => 2 + 1.6, 1.7, 1.8, 1.9 => 2 + 2.5 => 2 + + foreign_type_definition: Optional[str] + Definition of the foreign data type. + + Only valid for top-level schema fields (not nested fields). 
+ If the type is FOREIGN, this field is required. + + timestamp_precision: Optional[enums.TimestampPrecision] + Precision (maximum number of total digits in base 10) for seconds + of TIMESTAMP type. + + Defaults to `enums.TimestampPrecision.MICROSECOND` (`None`) for + microsecond precision. Use `enums.TimestampPrecision.PICOSECOND` + (`12`) for picosecond precision. """ def __init__( @@ -178,11 +219,15 @@ def __init__( scale: Union[int, _DefaultSentinel] = _DEFAULT_VALUE, max_length: Union[int, _DefaultSentinel] = _DEFAULT_VALUE, range_element_type: Union[FieldElementType, str, None] = None, + rounding_mode: Union[enums.RoundingMode, str, None] = None, + foreign_type_definition: Optional[str] = None, + timestamp_precision: Optional[enums.TimestampPrecision] = None, ): self._properties: Dict[str, Any] = { "name": name, "type": field_type, } + self._properties["name"] = name if mode is not None: self._properties["mode"] = mode.upper() if description is not _DEFAULT_VALUE: @@ -197,80 +242,77 @@ def __init__( self._properties["maxLength"] = max_length if policy_tags is not _DEFAULT_VALUE: self._properties["policyTags"] = ( - policy_tags.to_api_repr() if policy_tags is not None else None + policy_tags.to_api_repr() + if isinstance(policy_tags, PolicyTagList) + else None + ) + if isinstance(timestamp_precision, enums.TimestampPrecision): + self._properties["timestampPrecision"] = timestamp_precision.value + elif timestamp_precision is not None: + raise ValueError( + "timestamp_precision must be class enums.TimestampPrecision " + f"or None, got {type(timestamp_precision)} instead." 
) if isinstance(range_element_type, str): self._properties["rangeElementType"] = {"type": range_element_type} if isinstance(range_element_type, FieldElementType): self._properties["rangeElementType"] = range_element_type.to_api_repr() + if rounding_mode is not None: + self._properties["roundingMode"] = rounding_mode + if foreign_type_definition is not None: + self._properties["foreignTypeDefinition"] = foreign_type_definition - self._fields = tuple(fields) - - @staticmethod - def __get_int(api_repr, name): - v = api_repr.get(name, _DEFAULT_VALUE) - if v is not _DEFAULT_VALUE: - v = int(v) - return v + if fields: # Don't set the property if it's not set. + self._properties["fields"] = [field.to_api_repr() for field in fields] @classmethod def from_api_repr(cls, api_repr: dict) -> "SchemaField": """Return a ``SchemaField`` object deserialized from a dictionary. Args: - api_repr (Mapping[str, str]): The serialized representation - of the SchemaField, such as what is output by - :meth:`to_api_repr`. + api_repr (dict): The serialized representation of the SchemaField, + such as what is output by :meth:`to_api_repr`. Returns: google.cloud.bigquery.schema.SchemaField: The ``SchemaField`` object. """ - field_type = api_repr["type"].upper() + placeholder = cls("this_will_be_replaced", "PLACEHOLDER") - # Handle optional properties with default values - mode = api_repr.get("mode", "NULLABLE") - description = api_repr.get("description", _DEFAULT_VALUE) - fields = api_repr.get("fields", ()) - policy_tags = api_repr.get("policyTags", _DEFAULT_VALUE) + # The API would return a string despite we send an integer. To ensure + # success of resending received schema, we convert string to integer + # to ensure consistency. 
+ try: + api_repr["timestampPrecision"] = int(api_repr["timestampPrecision"]) + except (TypeError, KeyError): + pass - default_value_expression = api_repr.get("defaultValueExpression", None) + # Note: we don't make a copy of api_repr because this can cause + # unnecessary slowdowns, especially on deeply nested STRUCT / RECORD + # fields. See https://github.com/googleapis/python-bigquery/issues/6 + placeholder._properties = api_repr - if policy_tags is not None and policy_tags is not _DEFAULT_VALUE: - policy_tags = PolicyTagList.from_api_repr(policy_tags) + # Add the field `mode` with default value if it does not exist. Fixes + # an incompatibility issue with pandas-gbq: + # https://github.com/googleapis/python-bigquery-pandas/issues/854 + if "mode" not in placeholder._properties: + placeholder._properties["mode"] = "NULLABLE" - if api_repr.get("rangeElementType"): - range_element_type = cast(dict, api_repr.get("rangeElementType")) - element_type = range_element_type.get("type") - else: - element_type = None - - return cls( - field_type=field_type, - fields=[cls.from_api_repr(f) for f in fields], - mode=mode.upper(), - default_value_expression=default_value_expression, - description=description, - name=api_repr["name"], - policy_tags=policy_tags, - precision=cls.__get_int(api_repr, "precision"), - scale=cls.__get_int(api_repr, "scale"), - max_length=cls.__get_int(api_repr, "maxLength"), - range_element_type=element_type, - ) + return placeholder @property def name(self): """str: The name of the field.""" - return self._properties["name"] + return self._properties.get("name", "") @property - def field_type(self): + def field_type(self) -> str: """str: The type of the field. 
See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type """ - return self._properties["type"] + type_ = self._properties.get("type") + return cast(str, type_).upper() @property def mode(self): @@ -279,7 +321,7 @@ def mode(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.mode """ - return self._properties.get("mode") + return cast(str, self._properties.get("mode", "NULLABLE")).upper() @property def is_nullable(self): @@ -299,17 +341,17 @@ def description(self): @property def precision(self): """Optional[int]: Precision (number of digits) for the NUMERIC field.""" - return self._properties.get("precision") + return _helpers._int_or_none(self._properties.get("precision")) @property def scale(self): """Optional[int]: Scale (digits after decimal) for the NUMERIC field.""" - return self._properties.get("scale") + return _helpers._int_or_none(self._properties.get("scale")) @property def max_length(self): """Optional[int]: Maximum length for the STRING or BYTES field.""" - return self._properties.get("maxLength") + return _helpers._int_or_none(self._properties.get("maxLength")) @property def range_element_type(self): @@ -323,13 +365,29 @@ def range_element_type(self): ret = self._properties.get("rangeElementType") return FieldElementType.from_api_repr(ret) + @property + def rounding_mode(self): + """Enum that specifies the rounding mode to be used when storing values of + NUMERIC and BIGNUMERIC type. + """ + return self._properties.get("roundingMode") + + @property + def foreign_type_definition(self): + """Definition of the foreign data type. + + Only valid for top-level schema fields (not nested fields). + If the type is FOREIGN, this field is required. + """ + return self._properties.get("foreignTypeDefinition") + @property def fields(self): """Optional[tuple]: Subfields contained in this field. Must be empty unset if ``field_type`` is not 'RECORD'. 
""" - return self._fields + return tuple(_to_schema_fields(self._properties.get("fields", []))) @property def policy_tags(self): @@ -339,21 +397,26 @@ def policy_tags(self): resource = self._properties.get("policyTags") return PolicyTagList.from_api_repr(resource) if resource is not None else None + @property + def timestamp_precision(self) -> enums.TimestampPrecision: + """Precision (maximum number of total digits in base 10) for seconds of + TIMESTAMP type. + + Returns: + enums.TimestampPrecision: value of TimestampPrecision. + """ + return enums.TimestampPrecision(self._properties.get("timestampPrecision")) + def to_api_repr(self) -> dict: """Return a dictionary representing this schema field. Returns: Dict: A dictionary representing the SchemaField in a serialized form. """ - answer = self._properties.copy() - - # If this is a RECORD type, then sub-fields are also included, - # add this to the serialized representation. - if self.field_type.upper() in _STRUCT_TYPES: - answer["fields"] = [f.to_api_repr() for f in self.fields] - - # Done; return the serialized dictionary. - return answer + # Note: we don't make a copy of _properties because this can cause + # unnecessary slowdowns, especially on deeply nested STRUCT / RECORD + # fields. See https://github.com/googleapis/python-bigquery/issues/6 + return self._properties def _key(self): """A tuple key that uniquely describes this field. @@ -363,25 +426,23 @@ def _key(self): Returns: Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`. """ - field_type = self.field_type.upper() if self.field_type is not None else None - - # Type can temporarily be set to None if the code needs a SchemaField instance, - # but has not determined the exact type of the field yet. 
- if field_type is not None: - if field_type == "STRING" or field_type == "BYTES": - if self.max_length is not None: - field_type = f"{field_type}({self.max_length})" - elif field_type.endswith("NUMERIC"): - if self.precision is not None: - if self.scale is not None: - field_type = f"{field_type}({self.precision}, {self.scale})" - else: - field_type = f"{field_type}({self.precision})" + field_type = self.field_type + if field_type == "STRING" or field_type == "BYTES": + if self.max_length is not None: + field_type = f"{field_type}({self.max_length})" + elif field_type.endswith("NUMERIC"): + if self.precision is not None: + if self.scale is not None: + field_type = f"{field_type}({self.precision}, {self.scale})" + else: + field_type = f"{field_type}({self.precision})" policy_tags = ( None if self.policy_tags is None else tuple(sorted(self.policy_tags.names)) ) + timestamp_precision = self._properties.get("timestampPrecision") + return ( self.name, field_type, @@ -389,8 +450,9 @@ def _key(self): self.mode.upper(), # pytype: disable=attribute-error self.default_value_expression, self.description, - self._fields, + self.fields, policy_tags, + timestamp_precision, ) def to_standard_sql(self) -> standard_sql.StandardSqlField: @@ -441,10 +503,9 @@ def __hash__(self): return hash(self._key()) def __repr__(self): - key = self._key() - policy_tags = key[-1] + *initial_tags, policy_tags, timestamp_precision_tag = self._key() policy_tags_inst = None if policy_tags is None else PolicyTagList(policy_tags) - adjusted_key = key[:-1] + (policy_tags_inst,) + adjusted_key = (*initial_tags, policy_tags_inst, timestamp_precision_tag) return f"{self.__class__.__name__}{adjusted_key}" @@ -458,6 +519,8 @@ def _parse_schema_resource(info): Optional[Sequence[google.cloud.bigquery.schema.SchemaField`]: A list of parsed fields, or ``None`` if no "fields" key found. 
""" + if isinstance(info, list): + return [SchemaField.from_api_repr(f) for f in info] return [SchemaField.from_api_repr(f) for f in info.get("fields", ())] @@ -470,40 +533,48 @@ def _build_schema_resource(fields): Returns: Sequence[Dict]: Mappings describing the schema of the supplied fields. """ - return [field.to_api_repr() for field in fields] + if isinstance(fields, Sequence): + # Input is a Sequence (e.g. a list): Process and return a list of SchemaFields + return [field.to_api_repr() for field in fields] + + else: + raise TypeError("Schema must be a Sequence (e.g. a list) or None.") def _to_schema_fields(schema): - """Coerce `schema` to a list of schema field instances. + """Coerces schema to a list of SchemaField instances while + preserving the original structure as much as possible. Args: - schema(Sequence[Union[ \ - :class:`~google.cloud.bigquery.schema.SchemaField`, \ - Mapping[str, Any] \ - ]]): - Table schema to convert. If some items are passed as mappings, - their content must be compatible with - :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`. + schema (Sequence[Union[ \ + :class:`~google.cloud.bigquery.schema.SchemaField`, \ + Mapping[str, Any] \ + ] + ] + ):: + Table schema to convert. Can be a list of SchemaField + objects or mappings. Returns: - Sequence[:class:`~google.cloud.bigquery.schema.SchemaField`] + A list of SchemaField objects. Raises: - Exception: If ``schema`` is not a sequence, or if any item in the - sequence is not a :class:`~google.cloud.bigquery.schema.SchemaField` - instance or a compatible mapping representation of the field. + TypeError: If schema is not a Sequence. """ - for field in schema: - if not isinstance(field, (SchemaField, collections.abc.Mapping)): - raise ValueError( - "Schema items must either be fields or compatible " - "mapping representations." + + if isinstance(schema, Sequence): + # Input is a Sequence (e.g. 
a list): Process and return a list of SchemaFields + return [ + ( + field + if isinstance(field, SchemaField) + else SchemaField.from_api_repr(field) ) + for field in schema + ] - return [ - field if isinstance(field, SchemaField) else SchemaField.from_api_repr(field) - for field in schema - ] + else: + raise TypeError("Schema must be a Sequence (e.g. a list) or None.") class PolicyTagList(object): @@ -588,3 +659,267 @@ def to_api_repr(self) -> dict: """ answer = {"names": list(self.names)} return answer + + +class ForeignTypeInfo: + """Metadata about the foreign data type definition such as the system in which the + type is defined. + + Args: + type_system (str): Required. Specifies the system which defines the + foreign data type. + + TypeSystem enum currently includes: + * "TYPE_SYSTEM_UNSPECIFIED" + * "HIVE" + """ + + def __init__(self, type_system: Optional[str] = None): + self._properties: Dict[str, Any] = {} + self.type_system = type_system + + @property + def type_system(self) -> Optional[str]: + """Required. Specifies the system which defines the foreign data + type.""" + + return self._properties.get("typeSystem") + + @type_system.setter + def type_system(self, value: Optional[str]): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["typeSystem"] = value + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + + Returns: + Dict[str, Any]: + A dictionary in the format used by the BigQuery API. + """ + + return self._properties + + @classmethod + def from_api_repr(cls, api_repr: Dict[str, Any]) -> "ForeignTypeInfo": + """Factory: constructs an instance of the class (cls) + given its API representation. + + Args: + api_repr (Dict[str, Any]): + API representation of the object to be instantiated. + + Returns: + An instance of the class initialized with data from 'api_repr'. 
+ """ + + config = cls() + config._properties = api_repr + return config + + +class SerDeInfo: + """Serializer and deserializer information. + + Args: + serialization_library (str): Required. Specifies a fully-qualified class + name of the serialization library that is responsible for the + translation of data between table representation and the underlying + low-level input and output format structures. The maximum length is + 256 characters. + name (Optional[str]): Name of the SerDe. The maximum length is 256 + characters. + parameters: (Optional[dict[str, str]]): Key-value pairs that define the initialization + parameters for the serialization library. Maximum size 10 Kib. + """ + + def __init__( + self, + serialization_library: str, + name: Optional[str] = None, + parameters: Optional[dict[str, str]] = None, + ): + self._properties: Dict[str, Any] = {} + self.serialization_library = serialization_library + self.name = name + self.parameters = parameters + + @property + def serialization_library(self) -> str: + """Required. Specifies a fully-qualified class name of the serialization + library that is responsible for the translation of data between table + representation and the underlying low-level input and output format + structures. The maximum length is 256 characters.""" + + return typing.cast(str, self._properties.get("serializationLibrary")) + + @serialization_library.setter + def serialization_library(self, value: str): + value = _helpers._isinstance_or_raise(value, str, none_allowed=False) + self._properties["serializationLibrary"] = value + + @property + def name(self) -> Optional[str]: + """Optional. Name of the SerDe. The maximum length is 256 characters.""" + + return self._properties.get("name") + + @name.setter + def name(self, value: Optional[str] = None): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["name"] = value + + @property + def parameters(self) -> Optional[dict[str, str]]: + """Optional. 
Key-value pairs that define the initialization parameters + for the serialization library. Maximum size 10 Kib.""" + + return self._properties.get("parameters") + + @parameters.setter + def parameters(self, value: Optional[dict[str, str]] = None): + value = _helpers._isinstance_or_raise(value, dict, none_allowed=True) + self._properties["parameters"] = value + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + + Returns: + Dict[str, Any]: + A dictionary in the format used by the BigQuery API. + """ + return self._properties + + @classmethod + def from_api_repr(cls, api_repr: dict) -> SerDeInfo: + """Factory: constructs an instance of the class (cls) + given its API representation. + + Args: + api_repr (Dict[str, Any]): + API representation of the object to be instantiated. + + Returns: + An instance of the class initialized with data from 'api_repr'. + """ + config = cls("PLACEHOLDER") + config._properties = api_repr + return config + + +class StorageDescriptor: + """Contains information about how a table's data is stored and accessed by open + source query engines. + + Args: + input_format (Optional[str]): Specifies the fully qualified class name of + the InputFormat (e.g. + "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"). The maximum + length is 128 characters. + location_uri (Optional[str]): The physical location of the table (e.g. + 'gs://spark-dataproc-data/pangea-data/case_sensitive/' or + 'gs://spark-dataproc-data/pangea-data/'). The maximum length is + 2056 bytes. + output_format (Optional[str]): Specifies the fully qualified class name + of the OutputFormat (e.g. + "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"). The maximum + length is 128 characters. + serde_info (Union[SerDeInfo, dict, None]): Serializer and deserializer information. 
+ """ + + def __init__( + self, + input_format: Optional[str] = None, + location_uri: Optional[str] = None, + output_format: Optional[str] = None, + serde_info: Union[SerDeInfo, dict, None] = None, + ): + self._properties: Dict[str, Any] = {} + self.input_format = input_format + self.location_uri = location_uri + self.output_format = output_format + # Using typing.cast() because mypy cannot wrap it's head around the fact that: + # the setter can accept Union[SerDeInfo, dict, None] + # but the getter will only ever return Optional[SerDeInfo]. + self.serde_info = typing.cast(Optional[SerDeInfo], serde_info) + + @property + def input_format(self) -> Optional[str]: + """Optional. Specifies the fully qualified class name of the InputFormat + (e.g. "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"). The maximum + length is 128 characters.""" + + return self._properties.get("inputFormat") + + @input_format.setter + def input_format(self, value: Optional[str]): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["inputFormat"] = value + + @property + def location_uri(self) -> Optional[str]: + """Optional. The physical location of the table (e.g. 'gs://spark- + dataproc-data/pangea-data/case_sensitive/' or 'gs://spark-dataproc- + data/pangea-data/'). The maximum length is 2056 bytes.""" + + return self._properties.get("locationUri") + + @location_uri.setter + def location_uri(self, value: Optional[str]): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["locationUri"] = value + + @property + def output_format(self) -> Optional[str]: + """Optional. Specifies the fully qualified class name of the + OutputFormat (e.g. "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"). 
+ The maximum length is 128 characters.""" + + return self._properties.get("outputFormat") + + @output_format.setter + def output_format(self, value: Optional[str]): + value = _helpers._isinstance_or_raise(value, str, none_allowed=True) + self._properties["outputFormat"] = value + + @property + def serde_info(self) -> Optional[SerDeInfo]: + """Optional. Serializer and deserializer information.""" + + prop = _helpers._get_sub_prop(self._properties, ["serDeInfo"]) + if prop is not None: + return typing.cast(SerDeInfo, SerDeInfo.from_api_repr(prop)) + return None + + @serde_info.setter + def serde_info(self, value: Union[SerDeInfo, dict, None]): + value = _helpers._isinstance_or_raise( + value, (SerDeInfo, dict), none_allowed=True + ) + + if isinstance(value, SerDeInfo): + self._properties["serDeInfo"] = value.to_api_repr() + else: + self._properties["serDeInfo"] = value + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + Returns: + Dict[str, Any]: + A dictionary in the format used by the BigQuery API. + """ + return self._properties + + @classmethod + def from_api_repr(cls, resource: dict) -> StorageDescriptor: + """Factory: constructs an instance of the class (cls) + given its API representation. + Args: + resource (Dict[str, Any]): + API representation of the object to be instantiated. + Returns: + An instance of the class initialized with data from 'resource'. 
+ """ + config = cls() + config._properties = resource + return config diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index faf827be4..88b673a8b 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -21,7 +21,8 @@ import functools import operator import typing -from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union, Sequence + import warnings try: @@ -43,7 +44,7 @@ import geopandas # type: ignore except ImportError: geopandas = None -else: +finally: _COORDINATE_REFERENCE_SYSTEM = "EPSG:4326" try: @@ -66,9 +67,11 @@ from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration from google.cloud.bigquery.enums import DefaultPandasDTypes from google.cloud.bigquery.external_config import ExternalConfig +from google.cloud.bigquery import schema as _schema from google.cloud.bigquery.schema import _build_schema_resource from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields +from google.cloud.bigquery import external_config if typing.TYPE_CHECKING: # pragma: NO COVER # Unconditionally import optional dependencies again to tell pytype that @@ -134,7 +137,9 @@ def _reference_getter(table): return TableReference(dataset_ref, table.table_id) -def _view_use_legacy_sql_getter(table): +def _view_use_legacy_sql_getter( + table: Union["Table", "TableListItem"] +) -> Optional[bool]: """bool: Specifies whether to execute the view with Legacy or Standard SQL. This boolean specifies whether to execute the view with Legacy SQL @@ -145,15 +150,17 @@ def _view_use_legacy_sql_getter(table): Raises: ValueError: For invalid value types. """ - view = table._properties.get("view") + + view: Optional[Dict[str, Any]] = table._properties.get("view") if view is not None: # The server-side default for useLegacySql is True. 
- return view.get("useLegacySql", True) + return view.get("useLegacySql", True) if view is not None else True # In some cases, such as in a table list no view object is present, but the # resource still represents a view. Use the type as a fallback. if table.table_type == "VIEW": # The server-side default for useLegacySql is True. return True + return None # explicit return statement to appease mypy class _TableBase: @@ -372,8 +379,9 @@ class Table(_TableBase): :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`. """ - _PROPERTY_TO_API_FIELD = { + _PROPERTY_TO_API_FIELD: Dict[str, Any] = { **_TableBase._PROPERTY_TO_API_FIELD, + "biglake_configuration": "biglakeConfiguration", "clustering_fields": "clustering", "created": "creationTime", "description": "description", @@ -390,13 +398,14 @@ class Table(_TableBase): "mview_last_refresh_time": ["materializedView", "lastRefreshTime"], "mview_query": "materializedView", "mview_refresh_interval": "materializedView", + "mview_allow_non_incremental_definition": "materializedView", "num_bytes": "numBytes", "num_rows": "numRows", "partition_expiration": "timePartitioning", "partitioning_type": "timePartitioning", "range_partitioning": "rangePartitioning", "time_partitioning": "timePartitioning", - "schema": "schema", + "schema": ["schema", "fields"], "snapshot_definition": "snapshotDefinition", "clone_definition": "cloneDefinition", "streaming_buffer": "streamingBuffer", @@ -406,17 +415,47 @@ class Table(_TableBase): "view_query": "view", "require_partition_filter": "requirePartitionFilter", "table_constraints": "tableConstraints", + "max_staleness": "maxStaleness", + "resource_tags": "resourceTags", + "external_catalog_table_options": "externalCatalogTableOptions", + "foreign_type_info": ["schema", "foreignTypeInfo"], } def __init__(self, table_ref, schema=None) -> None: table_ref = _table_arg_to_table_ref(table_ref) - self._properties = {"tableReference": table_ref.to_api_repr(), "labels": {}} + self._properties: 
Dict[str, Any] = { + "tableReference": table_ref.to_api_repr(), + "labels": {}, + } # Let the @property do validation. if schema is not None: self.schema = schema reference = property(_reference_getter) + @property + def biglake_configuration(self): + """google.cloud.bigquery.table.BigLakeConfiguration: Configuration + for managed tables for Apache Iceberg. + + See https://cloud.google.com/bigquery/docs/iceberg-tables for more information. + """ + prop = self._properties.get( + self._PROPERTY_TO_API_FIELD["biglake_configuration"] + ) + if prop is not None: + prop = BigLakeConfiguration.from_api_repr(prop) + return prop + + @biglake_configuration.setter + def biglake_configuration(self, value): + api_repr = value + if value is not None: + api_repr = value.to_api_repr() + self._properties[ + self._PROPERTY_TO_API_FIELD["biglake_configuration"] + ] = api_repr + @property def require_partition_filter(self): """bool: If set to true, queries over the partitioned table require a @@ -446,8 +485,20 @@ def schema(self): If ``schema`` is not a sequence, or if any item in the sequence is not a :class:`~google.cloud.bigquery.schema.SchemaField` instance or a compatible mapping representation of the field. + + .. Note:: + If you are referencing a schema for an external catalog table such + as a Hive table, it will also be necessary to populate the foreign_type_info + attribute. This is not necessary if defining the schema for a BigQuery table. 
+ + For details, see: + https://cloud.google.com/bigquery/docs/external-tables + https://cloud.google.com/bigquery/docs/datasets-intro#external_datasets + """ - prop = self._properties.get(self._PROPERTY_TO_API_FIELD["schema"]) + prop = _helpers._get_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["schema"] + ) if not prop: return [] else: @@ -458,10 +509,21 @@ def schema(self, value): api_field = self._PROPERTY_TO_API_FIELD["schema"] if value is None: - self._properties[api_field] = None - else: + _helpers._set_sub_prop( + self._properties, + api_field, + None, + ) + elif isinstance(value, Sequence): value = _to_schema_fields(value) - self._properties[api_field] = {"fields": _build_schema_resource(value)} + value = _build_schema_resource(value) + _helpers._set_sub_prop( + self._properties, + api_field, + value, + ) + else: + raise TypeError("Schema must be a Sequence (e.g. a list) or None.") @property def labels(self): @@ -928,6 +990,28 @@ def mview_refresh_interval(self, value): refresh_interval_ms, ) + @property + def mview_allow_non_incremental_definition(self): + """Optional[bool]: This option declares the intention to construct a + materialized view that isn't refreshed incrementally. + The default value is :data:`False`. 
+ """ + api_field = self._PROPERTY_TO_API_FIELD[ + "mview_allow_non_incremental_definition" + ] + return _helpers._get_sub_prop( + self._properties, [api_field, "allowNonIncrementalDefinition"] + ) + + @mview_allow_non_incremental_definition.setter + def mview_allow_non_incremental_definition(self, value): + api_field = self._PROPERTY_TO_API_FIELD[ + "mview_allow_non_incremental_definition" + ] + _helpers._set_sub_prop( + self._properties, [api_field, "allowNonIncrementalDefinition"], value + ) + @property def streaming_buffer(self): """google.cloud.bigquery.StreamingBuffer: Information about a table's @@ -999,6 +1083,101 @@ def table_constraints(self) -> Optional["TableConstraints"]: table_constraints = TableConstraints.from_api_repr(table_constraints) return table_constraints + @table_constraints.setter + def table_constraints(self, value): + """Tables Primary Key and Foreign Key information.""" + api_repr = value + if not isinstance(value, TableConstraints) and value is not None: + raise ValueError( + "value must be google.cloud.bigquery.table.TableConstraints or None" + ) + api_repr = value.to_api_repr() if value else None + self._properties[self._PROPERTY_TO_API_FIELD["table_constraints"]] = api_repr + + @property + def resource_tags(self): + """Dict[str, str]: Resource tags for the table. + + See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.resource_tags + """ + return self._properties.setdefault( + self._PROPERTY_TO_API_FIELD["resource_tags"], {} + ) + + @resource_tags.setter + def resource_tags(self, value): + if not isinstance(value, dict) and value is not None: + raise ValueError("resource_tags must be a dict or None") + self._properties[self._PROPERTY_TO_API_FIELD["resource_tags"]] = value + + @property + def external_catalog_table_options( + self, + ) -> Optional[external_config.ExternalCatalogTableOptions]: + """Options defining open source compatible datasets living in the + BigQuery catalog. 
Contains metadata of open source database, schema + or namespace represented by the current dataset.""" + + prop = self._properties.get( + self._PROPERTY_TO_API_FIELD["external_catalog_table_options"] + ) + if prop is not None: + return external_config.ExternalCatalogTableOptions.from_api_repr(prop) + return None + + @external_catalog_table_options.setter + def external_catalog_table_options( + self, value: Union[external_config.ExternalCatalogTableOptions, dict, None] + ): + value = _helpers._isinstance_or_raise( + value, + (external_config.ExternalCatalogTableOptions, dict), + none_allowed=True, + ) + if isinstance(value, external_config.ExternalCatalogTableOptions): + self._properties[ + self._PROPERTY_TO_API_FIELD["external_catalog_table_options"] + ] = value.to_api_repr() + else: + self._properties[ + self._PROPERTY_TO_API_FIELD["external_catalog_table_options"] + ] = value + + @property + def foreign_type_info(self) -> Optional[_schema.ForeignTypeInfo]: + """Optional. Specifies metadata of the foreign data type definition in + field schema (TableFieldSchema.foreign_type_definition). + Returns: + Optional[schema.ForeignTypeInfo]: + Foreign type information, or :data:`None` if not set. + .. Note:: + foreign_type_info is only required if you are referencing an + external catalog such as a Hive table. 
+ For details, see: + https://cloud.google.com/bigquery/docs/external-tables + https://cloud.google.com/bigquery/docs/datasets-intro#external_datasets + """ + + prop = _helpers._get_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["foreign_type_info"] + ) + if prop is not None: + return _schema.ForeignTypeInfo.from_api_repr(prop) + return None + + @foreign_type_info.setter + def foreign_type_info(self, value: Union[_schema.ForeignTypeInfo, dict, None]): + value = _helpers._isinstance_or_raise( + value, + (_schema.ForeignTypeInfo, dict), + none_allowed=True, + ) + if isinstance(value, _schema.ForeignTypeInfo): + value = value.to_api_repr() + _helpers._set_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["foreign_type_info"], value + ) + @classmethod def from_string(cls, full_table_id: str) -> "Table": """Construct a table from fully-qualified table ID. @@ -1092,6 +1271,40 @@ def __repr__(self): def __str__(self): return f"{self.project}.{self.dataset_id}.{self.table_id}" + @property + def max_staleness(self): + """Union[str, None]: The maximum staleness of data that could be returned when the table is queried. + + Staleness encoded as a string encoding of sql IntervalValue type. + This property is optional and defaults to None. + + According to the BigQuery API documentation, maxStaleness specifies the maximum time + interval for which stale data can be returned when querying the table. + It helps control data freshness in scenarios like metadata-cached external tables. + + Returns: + Optional[str]: A string representing the maximum staleness interval + (e.g., '1h', '30m', '15s' for hours, minutes, seconds respectively). + """ + return self._properties.get(self._PROPERTY_TO_API_FIELD["max_staleness"]) + + @max_staleness.setter + def max_staleness(self, value): + """Set the maximum staleness for the table. + + Args: + value (Optional[str]): A string representing the maximum staleness interval. + Must be a valid time interval string. 
+ Examples include '1h' (1 hour), '30m' (30 minutes), '15s' (15 seconds). + + Raises: + ValueError: If the value is not None and not a string. + """ + if value is not None and not isinstance(value, str): + raise ValueError("max_staleness must be a string or None") + + self._properties[self._PROPERTY_TO_API_FIELD["max_staleness"]] = value + class TableListItem(_TableBase): """A read-only table resource from a list operation. @@ -1572,6 +1785,18 @@ class RowIterator(HTTPIterator): first_page_response (Optional[dict]): API response for the first page of results. These are returned when the first page is requested. + query (Optional[str]): + The query text used. + total_bytes_processed (Optional[int]): + If representing query results, the total bytes processed by the associated query. + slot_millis (Optional[int]): + If representing query results, the number of slot ms billed for the associated query. + created (Optional[datetime.datetime]): + If representing query results, the creation time of the associated query. + started (Optional[datetime.datetime]): + If representing query results, the start time of the associated query. + ended (Optional[datetime.datetime]): + If representing query results, the end time of the associated query. 
""" def __init__( @@ -1593,6 +1818,12 @@ def __init__( query_id: Optional[str] = None, project: Optional[str] = None, num_dml_affected_rows: Optional[int] = None, + query: Optional[str] = None, + total_bytes_processed: Optional[int] = None, + slot_millis: Optional[int] = None, + created: Optional[datetime.datetime] = None, + started: Optional[datetime.datetime] = None, + ended: Optional[datetime.datetime] = None, ): super(RowIterator, self).__init__( client, @@ -1606,7 +1837,7 @@ def __init__( page_start=_rows_page_start, next_token="pageToken", ) - schema = _to_schema_fields(schema) + schema = _to_schema_fields(schema) if schema else () self._field_to_index = _helpers._field_to_index_mapping(schema) self._page_size = page_size self._preserve_order = False @@ -1620,6 +1851,12 @@ def __init__( self._query_id = query_id self._project = project self._num_dml_affected_rows = num_dml_affected_rows + self._query = query + self._total_bytes_processed = total_bytes_processed + self._slot_millis = slot_millis + self._job_created = created + self._job_started = started + self._job_ended = ended @property def _billing_project(self) -> Optional[str]: @@ -1667,6 +1904,36 @@ def query_id(self) -> Optional[str]: """ return self._query_id + @property + def query(self) -> Optional[str]: + """The query text used.""" + return self._query + + @property + def total_bytes_processed(self) -> Optional[int]: + """total bytes processed from job statistics, if present.""" + return self._total_bytes_processed + + @property + def slot_millis(self) -> Optional[int]: + """Number of slot ms the user is actually billed for.""" + return self._slot_millis + + @property + def created(self) -> Optional[datetime.datetime]: + """If representing query results, the creation time of the associated query.""" + return self._job_created + + @property + def started(self) -> Optional[datetime.datetime]: + """If representing query results, the start time of the associated query.""" + return self._job_started + + 
@property + def ended(self) -> Optional[datetime.datetime]: + """If representing query results, the end time of the associated query.""" + return self._job_ended + def _is_almost_completely_cached(self): """Check if all results are completely cached. @@ -1756,12 +2023,19 @@ def _get_next_page_response(self): return response params = self._get_query_params() + + # If the user has provided page_size and start_index, we need to pass + # start_index for the first page, but for all subsequent pages, we + # should not pass start_index. We make a shallow copy of params and do + # not alter the original, so if the user iterates the results again, + # start_index is preserved. + params_copy = copy.copy(params) if self._page_size is not None: if self.page_number and "startIndex" in params: - del params["startIndex"] + del params_copy["startIndex"] return self.api_request( - method=self._HTTP_METHOD, path=self.path, query_params=params + method=self._HTTP_METHOD, path=self.path, query_params=params_copy ) @property @@ -1812,6 +2086,8 @@ def to_arrow_iterable( self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore + max_stream_count: Optional[int] = None, + timeout: Optional[float] = None, ) -> Iterator["pyarrow.RecordBatch"]: """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream. @@ -1836,6 +2112,26 @@ def to_arrow_iterable( created by the server. If ``max_queue_size`` is :data:`None`, the queue size is infinite. + max_stream_count (Optional[int]): + The maximum number of parallel download streams when + using BigQuery Storage API. Ignored if + BigQuery Storage API is not used. + + This setting also has no effect if the query result + is deterministically ordered with ORDER BY, + in which case, the number of download stream is always 1. 
+ + If set to 0 or None (the default), the number of download + streams is determined by BigQuery the server. However, this behaviour + can require a lot of memory to store temporary download result, + especially with very large queries. In that case, + setting this parameter value to a value > 0 can help + reduce system resource consumption. + + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Returns: pyarrow.RecordBatch: A generator of :class:`~pyarrow.RecordBatch`. @@ -1852,9 +2148,14 @@ def to_arrow_iterable( preserve_order=self._preserve_order, selected_fields=self._selected_fields, max_queue_size=max_queue_size, + max_stream_count=max_stream_count, + timeout=timeout, ) tabledata_list_download = functools.partial( - _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema + _pandas_helpers.download_arrow_row_iterator, + iter(self.pages), + self.schema, + timeout=timeout, ) return self._to_page_iterable( bqstorage_download, @@ -1869,6 +2170,7 @@ def to_arrow( progress_bar_type: Optional[str] = None, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, create_bqstorage_client: bool = True, + timeout: Optional[float] = None, ) -> "pyarrow.Table": """[Beta] Create a class:`pyarrow.Table` by loading all pages of a table or query. @@ -1910,6 +2212,9 @@ def to_arrow( This argument does nothing if ``bqstorage_client`` is supplied. .. versionadded:: 1.24.0 + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. 
Returns: pyarrow.Table @@ -1944,7 +2249,7 @@ def to_arrow( record_batches = [] for record_batch in self.to_arrow_iterable( - bqstorage_client=bqstorage_client + bqstorage_client=bqstorage_client, timeout=timeout ): record_batches.append(record_batch) @@ -1978,6 +2283,8 @@ def to_dataframe_iterable( bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Optional[Dict[str, Any]] = None, max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore + max_stream_count: Optional[int] = None, + timeout: Optional[float] = None, ) -> "pandas.DataFrame": """Create an iterable of pandas DataFrames, to process the table as a stream. @@ -2008,6 +2315,26 @@ def to_dataframe_iterable( .. versionadded:: 2.14.0 + max_stream_count (Optional[int]): + The maximum number of parallel download streams when + using BigQuery Storage API. Ignored if + BigQuery Storage API is not used. + + This setting also has no effect if the query result + is deterministically ordered with ORDER BY, + in which case, the number of download stream is always 1. + + If set to 0 or None (the default), the number of download + streams is determined by BigQuery the server. However, this behaviour + can require a lot of memory to store temporary download result, + especially with very large queries. In that case, + setting this parameter value to a value > 0 can help + reduce system resource consumption. + + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Returns: pandas.DataFrame: A generator of :class:`~pandas.DataFrame`. 
@@ -2034,12 +2361,15 @@ def to_dataframe_iterable( preserve_order=self._preserve_order, selected_fields=self._selected_fields, max_queue_size=max_queue_size, + max_stream_count=max_stream_count, + timeout=timeout, ) tabledata_list_download = functools.partial( _pandas_helpers.download_dataframe_row_iterator, iter(self.pages), self.schema, dtypes, + timeout=timeout, ) return self._to_page_iterable( bqstorage_download, @@ -2071,6 +2401,7 @@ def to_dataframe( range_timestamp_dtype: Union[ Any, None ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, + timeout: Optional[float] = None, ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. @@ -2267,6 +2598,10 @@ def to_dataframe( .. versionadded:: 3.21.0 + timeout (Optional[float]): + The number of seconds to wait for the underlying download to complete. + If ``None``, wait indefinitely. + Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column @@ -2380,6 +2715,7 @@ def to_dataframe( progress_bar_type=progress_bar_type, bqstorage_client=bqstorage_client, create_bqstorage_client=create_bqstorage_client, + timeout=timeout, ) # Default date dtype is `db_dtypes.DateDtype()` that could cause out of bounds error, @@ -2406,31 +2742,25 @@ def to_dataframe( if pyarrow.types.is_timestamp(col.type) ) - if len(record_batch) > 0: - df = record_batch.to_pandas( + df = record_batch.to_pandas( + date_as_object=date_as_object, + timestamp_as_object=timestamp_as_object, + integer_object_nulls=True, + types_mapper=_pandas_helpers.default_types_mapper( date_as_object=date_as_object, - timestamp_as_object=timestamp_as_object, - integer_object_nulls=True, - types_mapper=_pandas_helpers.default_types_mapper( - date_as_object=date_as_object, - bool_dtype=bool_dtype, - int_dtype=int_dtype, - float_dtype=float_dtype, - string_dtype=string_dtype, - date_dtype=date_dtype, - datetime_dtype=datetime_dtype, - time_dtype=time_dtype, - timestamp_dtype=timestamp_dtype, - 
range_date_dtype=range_date_dtype, - range_datetime_dtype=range_datetime_dtype, - range_timestamp_dtype=range_timestamp_dtype, - ), - ) - else: - # Avoid "ValueError: need at least one array to concatenate" on - # older versions of pandas when converting empty RecordBatch to - # DataFrame. See: https://github.com/pandas-dev/pandas/issues/41241 - df = pandas.DataFrame([], columns=record_batch.schema.names) + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, + date_dtype=date_dtype, + datetime_dtype=datetime_dtype, + time_dtype=time_dtype, + timestamp_dtype=timestamp_dtype, + range_date_dtype=range_date_dtype, + range_datetime_dtype=range_datetime_dtype, + range_timestamp_dtype=range_timestamp_dtype, + ), + ) for column in dtypes: df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False) @@ -2460,6 +2790,11 @@ def to_geodataframe( progress_bar_type: Optional[str] = None, create_bqstorage_client: bool = True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, + timeout: Optional[float] = None, ) -> "geopandas.GeoDataFrame": """Create a GeoPandas GeoDataFrame by loading all pages of a query. @@ -2511,6 +2846,34 @@ def to_geodataframe( identifies which one to use to construct a geopandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. 
BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2562,6 +2925,11 @@ def to_geodataframe( progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, + timeout=timeout, ) return geopandas.GeoDataFrame( @@ -2577,10 +2945,6 @@ class _EmptyRowIterator(RowIterator): statements. 
""" - schema = () - pages = () - total_rows = 0 - def __init__( self, client=None, api_request=None, path=None, schema=(), *args, **kwargs ): @@ -2592,12 +2956,14 @@ def __init__( *args, **kwargs, ) + self._total_rows = 0 def to_arrow( self, progress_bar_type=None, bqstorage_client=None, create_bqstorage_client=True, + timeout: Optional[float] = None, ) -> "pyarrow.Table": """[Beta] Create an empty class:`pyarrow.Table`. @@ -2605,6 +2971,7 @@ def to_arrow( progress_bar_type (str): Ignored. Added for compatibility with RowIterator. bqstorage_client (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. + timeout (Optional[float]): Ignored. Added for compatibility with RowIterator. Returns: pyarrow.Table: An empty :class:`pyarrow.Table`. @@ -2631,6 +2998,7 @@ def to_dataframe( range_date_dtype=None, range_datetime_dtype=None, range_timestamp_dtype=None, + timeout: Optional[float] = None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2651,6 +3019,7 @@ def to_dataframe( range_date_dtype (Any): Ignored. Added for compatibility with RowIterator. range_datetime_dtype (Any): Ignored. Added for compatibility with RowIterator. range_timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator. + timeout (Optional[float]): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. @@ -2665,6 +3034,11 @@ def to_geodataframe( progress_bar_type=None, create_bqstorage_client=True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, + timeout: Optional[float] = None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2674,6 +3048,11 @@ def to_geodataframe( progress_bar_type (Any): Ignored. 
Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. geography_column (str): Ignored. Added for compatibility with RowIterator. + bool_dtype (Any): Ignored. Added for compatibility with RowIterator. + int_dtype (Any): Ignored. Added for compatibility with RowIterator. + float_dtype (Any): Ignored. Added for compatibility with RowIterator. + string_dtype (Any): Ignored. Added for compatibility with RowIterator. + timeout (Optional[float]): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. @@ -2690,6 +3069,8 @@ def to_dataframe_iterable( bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Optional[Dict[str, Any]] = None, max_queue_size: Optional[int] = None, + max_stream_count: Optional[int] = None, + timeout: Optional[float] = None, ) -> Iterator["pandas.DataFrame"]: """Create an iterable of pandas DataFrames, to process the table as a stream. @@ -2705,6 +3086,12 @@ def to_dataframe_iterable( max_queue_size: Ignored. Added for compatibility with RowIterator. + max_stream_count: + Ignored. Added for compatibility with RowIterator. + + timeout (Optional[float]): + Ignored. Added for compatibility with RowIterator. + Returns: An iterator yielding a single empty :class:`~pandas.DataFrame`. @@ -2719,6 +3106,8 @@ def to_arrow_iterable( self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, max_queue_size: Optional[int] = None, + max_stream_count: Optional[int] = None, + timeout: Optional[float] = None, ) -> Iterator["pyarrow.RecordBatch"]: """Create an iterable of pandas DataFrames, to process the table as a stream. @@ -2731,6 +3120,12 @@ def to_arrow_iterable( max_queue_size: Ignored. Added for compatibility with RowIterator. + max_stream_count: + Ignored. Added for compatibility with RowIterator. + + timeout (Optional[float]): + Ignored. Added for compatibility with RowIterator. 
+ Returns: An iterator yielding a single empty :class:`~pyarrow.RecordBatch`. """ @@ -3181,6 +3576,20 @@ def from_api_repr(cls, api_repr: Dict[str, Any]) -> "ForeignKey": ], ) + def to_api_repr(self) -> Dict[str, Any]: + """Return a dictionary representing this object.""" + return { + "name": self.name, + "referencedTable": self.referenced_table.to_api_repr(), + "columnReferences": [ + { + "referencingColumn": column_reference.referencing_column, + "referencedColumn": column_reference.referenced_column, + } + for column_reference in self.column_references + ], + } + class TableConstraints: """The TableConstraints defines the primary key and foreign key. @@ -3202,6 +3611,13 @@ def __init__( self.primary_key = primary_key self.foreign_keys = foreign_keys + def __eq__(self, other): + if not isinstance(other, TableConstraints) and other is not None: + raise TypeError("The value provided is not a BigQuery TableConstraints.") + return self.primary_key == ( + other.primary_key if other.primary_key else None + ) and self.foreign_keys == (other.foreign_keys if other.foreign_keys else None) + @classmethod def from_api_repr(cls, resource: Dict[str, Any]) -> "TableConstraints": """Create an instance from API representation.""" @@ -3217,6 +3633,143 @@ def from_api_repr(cls, resource: Dict[str, Any]) -> "TableConstraints": ] return cls(primary_key, foreign_keys) + def to_api_repr(self) -> Dict[str, Any]: + """Return a dictionary representing this object.""" + resource: Dict[str, Any] = {} + if self.primary_key: + resource["primaryKey"] = {"columns": self.primary_key.columns} + if self.foreign_keys: + resource["foreignKeys"] = [ + foreign_key.to_api_repr() for foreign_key in self.foreign_keys + ] + return resource + + +class BigLakeConfiguration(object): + """Configuration for managed tables for Apache Iceberg, formerly + known as BigLake. 
+ + Args: + connection_id (Optional[str]): + The connection specifying the credentials to be used to read and write to external + storage, such as Cloud Storage. The connection_id can have the form + ``{project}.{location}.{connection_id}`` or + ``projects/{project}/locations/{location}/connections/{connection_id}``. + storage_uri (Optional[str]): + The fully qualified location prefix of the external folder where table data is + stored. The '*' wildcard character is not allowed. The URI should be in the + format ``gs://bucket/path_to_table/``. + file_format (Optional[str]): + The file format the table data is stored in. See BigLakeFileFormat for available + values. + table_format (Optional[str]): + The table format the metadata only snapshots are stored in. See BigLakeTableFormat + for available values. + _properties (Optional[dict]): + Private. Used to construct object from API resource. + """ + + def __init__( + self, + connection_id: Optional[str] = None, + storage_uri: Optional[str] = None, + file_format: Optional[str] = None, + table_format: Optional[str] = None, + _properties: Optional[dict] = None, + ) -> None: + if _properties is None: + _properties = {} + self._properties = _properties + if connection_id is not None: + self.connection_id = connection_id + if storage_uri is not None: + self.storage_uri = storage_uri + if file_format is not None: + self.file_format = file_format + if table_format is not None: + self.table_format = table_format + + @property + def connection_id(self) -> Optional[str]: + """str: The connection specifying the credentials to be used to read and write to external + storage, such as Cloud Storage.""" + return self._properties.get("connectionId") + + @connection_id.setter + def connection_id(self, value: Optional[str]): + self._properties["connectionId"] = value + + @property + def storage_uri(self) -> Optional[str]: + """str: The fully qualified location prefix of the external folder where table data is + stored.""" + return 
self._properties.get("storageUri") + + @storage_uri.setter + def storage_uri(self, value: Optional[str]): + self._properties["storageUri"] = value + + @property + def file_format(self) -> Optional[str]: + """str: The file format the table data is stored in. See BigLakeFileFormat for available + values.""" + return self._properties.get("fileFormat") + + @file_format.setter + def file_format(self, value: Optional[str]): + self._properties["fileFormat"] = value + + @property + def table_format(self) -> Optional[str]: + """str: The table format the metadata only snapshots are stored in. See BigLakeTableFormat + for available values.""" + return self._properties.get("tableFormat") + + @table_format.setter + def table_format(self, value: Optional[str]): + self._properties["tableFormat"] = value + + def _key(self): + return tuple(sorted(self._properties.items())) + + def __eq__(self, other): + if not isinstance(other, BigLakeConfiguration): + return NotImplemented + return self._key() == other._key() + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self._key()) + + def __repr__(self): + key_vals = ["{}={}".format(key, val) for key, val in self._key()] + return "BigLakeConfiguration({})".format(",".join(key_vals)) + + @classmethod + def from_api_repr(cls, resource: Dict[str, Any]) -> "BigLakeConfiguration": + """Factory: construct a BigLakeConfiguration given its API representation. + + Args: + resource: + BigLakeConfiguration representation returned from the API + + Returns: + BigLakeConfiguration parsed from ``resource``. + """ + ref = cls() + ref._properties = resource + return ref + + def to_api_repr(self) -> Dict[str, Any]: + """Construct the API resource representation of this BigLakeConfiguration. + + Returns: + BigLakeConfiguration represented as an API resource. + """ + return copy.deepcopy(self._properties) + def _item_to_row(iterator, resource): """Convert a JSON row to the native object. 
@@ -3251,7 +3804,9 @@ def _row_iterator_page_columns(schema, response): def get_column_data(field_index, field): for row in rows: - yield _helpers._field_from_json(row["f"][field_index]["v"], field) + yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py( + row["f"][field_index]["v"], field + ) for field_index, field in enumerate(schema): columns.append(get_column_data(field_index, field)) diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index fed077e26..2519009bf 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "3.25.0" +__version__ = "3.40.1" diff --git a/noxfile.py b/noxfile.py index a2df2e094..194e7ce8f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -24,7 +24,7 @@ MYPY_VERSION = "mypy==1.6.1" -PYTYPE_VERSION = "pytype==2021.4.9" +PYTYPE_VERSION = "pytype==2024.9.13" BLACK_VERSION = "black==23.7.0" BLACK_PATHS = ( "benchmark", @@ -37,9 +37,9 @@ "setup.py", ) -DEFAULT_PYTHON_VERSION = "3.8" -SYSTEM_TEST_PYTHON_VERSIONS = ["3.8", "3.11", "3.12"] -UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.12"] +DEFAULT_PYTHON_VERSION = "3.9" +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.11", "3.12", "3.13"] +UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() @@ -95,27 +95,43 @@ def default(session, install_extras=True): # Install all test dependencies, then install local packages in-place. 
session.install( - "pytest", + # TODO(https://github.com/pytest-dev/pytest-xdist/issues/1273): Remove once this bug is fixed + "pytest<9", "google-cloud-testutils", "pytest-cov", + "pytest-xdist", "freezegun", "-c", constraints_path, ) - - if install_extras and session.python in ["3.11", "3.12"]: - install_target = ".[bqstorage,ipywidgets,pandas,tqdm,opentelemetry]" - elif install_extras: + # We have logic in the magics.py file that checks for whether 'bigquery_magics' + # is imported OR not. If yes, we use a context object from that library. + # If no, we use our own context object from magics.py. In order to exercise + # that logic (and the associated tests) we avoid installing the [ipython] extra + # which has a downstream effect of then avoiding installing bigquery_magics. + if install_extras and session.python == UNIT_TEST_PYTHON_VERSIONS[0]: + install_target = ".[bqstorage,pandas,ipywidgets,geopandas,matplotlib,tqdm,opentelemetry,bigquery_v2]" + elif install_extras: # run against all other UNIT_TEST_PYTHON_VERSIONS install_target = ".[all]" else: install_target = "." session.install("-e", install_target, "-c", constraints_path) + + # Test with some broken "extras" in case the user didn't install the extra + # directly. For example, pandas-gbq is recommended for pandas features, but + # we want to test that we fallback to the previous behavior. For context, + # see internal document go/pandas-gbq-and-bigframes-redundancy. + if session.python == UNIT_TEST_PYTHON_VERSIONS[0]: + session.run("python", "-m", "pip", "uninstall", "pandas-gbq", "-y") + session.run("python", "-m", "pip", "freeze") # Run py.test against the unit tests. session.run( "py.test", + "-n=8", "--quiet", + "-W default::PendingDeprecationWarning", "--cov=google/cloud/bigquery", "--cov=tests/unit", "--cov-append", @@ -148,8 +164,7 @@ def unit_noextras(session): # so that it continues to be an optional dependency. 
# https://github.com/googleapis/python-bigquery/issues/1877 if session.python == UNIT_TEST_PYTHON_VERSIONS[0]: - session.install("pyarrow==1.0.0") - + session.install("pyarrow==4.0.0", "numpy==1.20.2") default(session, install_extras=False) @@ -169,6 +184,7 @@ def mypy(session): "types-requests", "types-setuptools", ) + session.run("python", "-m", "pip", "freeze") session.run("mypy", "-p", "google", "--show-traceback") @@ -183,6 +199,7 @@ def pytype(session): session.install("attrs==20.3.0") session.install("-e", ".[all]") session.install(PYTYPE_VERSION) + session.run("python", "-m", "pip", "freeze") # See https://github.com/google/pytype/issues/464 session.run("pytype", "-P", ".", "google/cloud/bigquery") @@ -207,7 +224,12 @@ def system(session): # Install all test dependencies, then install local packages in place. session.install( - "pytest", "psutil", "google-cloud-testutils", "-c", constraints_path + "pytest", + "psutil", + "pytest-xdist", + "google-cloud-testutils", + "-c", + constraints_path, ) if os.environ.get("GOOGLE_API_USE_CLIENT_CERTIFICATE", "") == "true": # mTLS test requires pyopenssl and latest google-cloud-storage @@ -218,19 +240,31 @@ def system(session): # Data Catalog needed for the column ACL test with a real Policy Tag. session.install("google-cloud-datacatalog", "-c", constraints_path) + # Resource Manager needed for test with a real Resource Tag. + session.install("google-cloud-resource-manager", "-c", constraints_path) + if session.python in ["3.11", "3.12"]: extras = "[bqstorage,ipywidgets,pandas,tqdm,opentelemetry]" else: extras = "[all]" session.install("-e", f".{extras}", "-c", constraints_path) + # Test with some broken "extras" in case the user didn't install the extra + # directly. For example, pandas-gbq is recommended for pandas features, but + # we want to test that we fallback to the previous behavior. For context, + # see internal document go/pandas-gbq-and-bigframes-redundancy. 
+ if session.python == SYSTEM_TEST_PYTHON_VERSIONS[0]: + session.run("python", "-m", "pip", "uninstall", "pandas-gbq", "-y") + # print versions of all dependencies session.run("python", "-m", "pip", "freeze") # Run py.test against the system tests. session.run( "py.test", + "-n=auto", "--quiet", + "-W default::PendingDeprecationWarning", os.path.join("tests", "system"), *session.posargs, ) @@ -261,7 +295,7 @@ def mypy_samples(session): "types-setuptools", ) - session.install("typing-extensions") # for TypedDict in pre-3.8 Python versions + session.run("python", "-m", "pip", "freeze") session.run( "mypy", @@ -282,23 +316,32 @@ def snippets(session): ) # Install all test dependencies, then install local packages in place. - session.install("pytest", "google-cloud-testutils", "-c", constraints_path) + session.install( + "pytest", "pytest-xdist", "google-cloud-testutils", "-c", constraints_path + ) session.install("google-cloud-storage", "-c", constraints_path) session.install("grpcio", "-c", constraints_path) if session.python in ["3.11", "3.12"]: - extras = "[bqstorage,ipywidgets,pandas,tqdm,opentelemetry]" + extras = ( + "[bqstorage,pandas,ipywidgets,geopandas,tqdm,opentelemetry,bigquery_v2]" + ) else: extras = "[all]" session.install("-e", f".{extras}", "-c", constraints_path) + session.run("python", "-m", "pip", "freeze") # Run py.test against the snippets tests. # Skip tests in samples/snippets, as those are run in a different session # using the nox config from that directory. 
- session.run("py.test", os.path.join("docs", "snippets.py"), *session.posargs) + session.run( + "py.test", "-n=auto", os.path.join("docs", "snippets.py"), *session.posargs + ) session.run( "py.test", + "-n=auto", "samples", + "-W default::PendingDeprecationWarning", "--ignore=samples/desktopapp", "--ignore=samples/magics", "--ignore=samples/geography", @@ -318,6 +361,7 @@ def cover(session): """ session.install("coverage", "pytest-cov") + session.run("python", "-m", "pip", "freeze") session.run("coverage", "report", "--show-missing", "--fail-under=100") session.run("coverage", "erase") @@ -329,11 +373,46 @@ def prerelease_deps(session): https://github.com/googleapis/python-bigquery/issues/95 """ + # Because we test minimum dependency versions on the minimum Python + # version, the first version we test with in the unit tests sessions has a + # constraints file containing all dependencies and extras. + with open( + CURRENT_DIRECTORY + / "testing" + / f"constraints-{UNIT_TEST_PYTHON_VERSIONS[0]}.txt", + encoding="utf-8", + ) as constraints_file: + constraints_text = constraints_file.read() + + # Ignore leading whitespace and comment lines. + deps = [ + match.group(1) + for match in re.finditer( + r"^\s*(\S+)(?===\S+)", constraints_text, flags=re.MULTILINE + ) + ] + + session.install(*deps) + + session.install( + "--pre", + "--upgrade", + "freezegun", + "google-cloud-datacatalog", + "google-cloud-resource-manager", + "google-cloud-storage", + "google-cloud-testutils", + "psutil", + "pytest", + "pytest-xdist", + "pytest-cov", + ) + # PyArrow prerelease packages are published to an alternative PyPI host. 
- # https://arrow.apache.org/docs/python/install.html#installing-nightly-packages + # https://arrow.apache.org/docs/developers/python.html#installing-nightly-packages session.install( "--extra-index-url", - "https://pypi.fury.io/arrow-nightlies/", + "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple", "--prefer-binary", "--pre", "--upgrade", @@ -353,57 +432,43 @@ def prerelease_deps(session): session.install( "--pre", "--upgrade", + "--no-deps", "google-api-core", "google-cloud-bigquery-storage", "google-cloud-core", "google-resumable-media", - # Exclude version 1.49.0rc1 which has a known issue. See https://github.com/grpc/grpc/pull/30642 - "grpcio!=1.49.0rc1", - ) - session.install( - "freezegun", - "google-cloud-datacatalog", - "google-cloud-storage", - "google-cloud-testutils", - "psutil", - "pytest", - "pytest-cov", + "db-dtypes", + "grpcio", + "protobuf", ) - # Because we test minimum dependency versions on the minimum Python - # version, the first version we test with in the unit tests sessions has a - # constraints file containing all dependencies and extras. - with open( - CURRENT_DIRECTORY - / "testing" - / f"constraints-{UNIT_TEST_PYTHON_VERSIONS[0]}.txt", - encoding="utf-8", - ) as constraints_file: - constraints_text = constraints_file.read() - - # Ignore leading whitespace and comment lines. - deps = [ - match.group(1) - for match in re.finditer( - r"^\s*(\S+)(?===\S+)", constraints_text, flags=re.MULTILINE - ) - ] - - # We use --no-deps to ensure that pre-release versions aren't overwritten - # by the version ranges in setup.py. - session.install(*deps) - session.install("--no-deps", "-e", ".[all]") + # Ensure that this library is installed from source + session.install("-e", ".", "--no-deps") # Print out prerelease package versions. 
- session.run("python", "-c", "import grpc; print(grpc.__version__)") - session.run("python", "-c", "import pandas; print(pandas.__version__)") - session.run("python", "-c", "import pyarrow; print(pyarrow.__version__)") session.run("python", "-m", "pip", "freeze") # Run all tests, except a few samples tests which require extra dependencies. - session.run("py.test", "tests/unit") - session.run("py.test", "tests/system") - session.run("py.test", "samples/tests") + session.run( + "py.test", + "-n=auto", + "tests/unit", + "-W default::PendingDeprecationWarning", + ) + + session.run( + "py.test", + "-n=auto", + "tests/system", + "-W default::PendingDeprecationWarning", + ) + + session.run( + "py.test", + "-n=auto", + "samples/tests", + "-W default::PendingDeprecationWarning", + ) @nox.session(python=DEFAULT_PYTHON_VERSION) @@ -417,6 +482,7 @@ def lint(session): session.install("flake8", BLACK_VERSION) session.install("-e", ".") + session.run("python", "-m", "pip", "freeze") session.run("flake8", os.path.join("google", "cloud", "bigquery")) session.run("flake8", "tests") session.run("flake8", os.path.join("docs", "samples")) @@ -431,6 +497,7 @@ def lint_setup_py(session): """Verify that setup.py is valid (including RST check).""" session.install("docutils", "Pygments") + session.run("python", "-m", "pip", "freeze") session.run("python", "setup.py", "check", "--restructuredtext", "--strict") @@ -442,10 +509,11 @@ def blacken(session): """ session.install(BLACK_VERSION) + session.run("python", "-m", "pip", "freeze") session.run("black", *BLACK_PATHS) -@nox.session(python="3.9") +@nox.session(python="3.10") @_calculate_duration def docs(session): """Build the docs.""" @@ -468,6 +536,7 @@ def docs(session): session.install("-e", ".[all]") shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + session.run("python", "-m", "pip", "freeze") session.run( "sphinx-build", "-W", # warnings as errors @@ -504,6 +573,7 @@ def docfx(session): ) 
shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + session.run("python", "-m", "pip", "freeze") session.run( "sphinx-build", "-T", # show full traceback on exception diff --git a/owlbot.py b/owlbot.py deleted file mode 100644 index 07805d11a..000000000 --- a/owlbot.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""This script is used to synthesize generated parts of this library.""" -from pathlib import Path -import textwrap - -import synthtool as s -from synthtool import gcp -from synthtool.languages import python - -REPO_ROOT = Path(__file__).parent.absolute() - -default_version = "v2" - -for library in s.get_staging_dirs(default_version): - # Avoid breaking change due to change in field renames. 
- # https://github.com/googleapis/python-bigquery/issues/319 - s.replace( - library / f"google/cloud/bigquery_{library.name}/types/standard_sql.py", - r"type_ ", - "type ", - ) - # Patch docs issue - s.replace( - library / f"google/cloud/bigquery_{library.name}/types/model.py", - r"""\"predicted_\"""", - """`predicted_`""", - ) - s.move(library / f"google/cloud/bigquery_{library.name}/types") -s.remove_staging_dirs() - -common = gcp.CommonTemplates() - -# ---------------------------------------------------------------------------- -# Add templated files -# ---------------------------------------------------------------------------- -templated_files = common.py_library( - cov_level=100, - samples=True, - microgenerator=True, - split_system_tests=True, - intersphinx_dependencies={ - "dateutil": "https://dateutil.readthedocs.io/en/latest/", - "geopandas": "https://geopandas.org/", - "pandas": "https://pandas.pydata.org/pandas-docs/stable/", - }, -) - -# BigQuery has a custom multiprocessing note -s.move( - templated_files, - excludes=[ - "noxfile.py", - "docs/multiprocessing.rst", - "docs/index.rst", - ".coveragerc", - ".github/CODEOWNERS", - # Include custom SNIPPETS_TESTS job for performance. 
- # https://github.com/googleapis/python-bigquery/issues/191 - ".kokoro/presubmit/presubmit.cfg", - ".kokoro/continuous/prerelease-deps.cfg", - ".github/workflows", # exclude gh actions as credentials are needed for tests - "README.rst", - ], -) - -python.configure_previous_major_version_branches() -# ---------------------------------------------------------------------------- -# Samples templates -# ---------------------------------------------------------------------------- - -python.py_samples() - -s.replace( - "docs/conf.py", - r'\{"members": True\}', - '{"members": True, "inherited-members": True}', -) -s.replace( - "docs/conf.py", - r"exclude_patterns = \[", - '\\g<0>\n "google/cloud/bigquery_v2/**", # Legacy proto-based types.', -) - -# ---------------------------------------------------------------------------- -# pytype-related changes -# ---------------------------------------------------------------------------- - -# Add .pytype to .gitignore -s.replace(".gitignore", r"\.pytest_cache", "\\g<0>\n.pytype") - -s.shell.run(["nox", "-s", "blacken"], hide_output=False) -for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"): - s.shell.run(["nox", "-s", "blacken"], cwd=noxfile.parent, hide_output=False) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..a0e356b34 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,112 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "google-cloud-bigquery" +authors = [{ name = "Google LLC", email = "googleapis-packages@google.com" }] +license = "Apache-2.0" +license-files = ["LICENSE"] +requires-python = ">=3.9" +description = "Google BigQuery API client library" +readme = "README.rst" +classifiers = [ + # Should be one of: + # "Development Status :: 3 - Alpha" + # "Development Status :: 4 - Beta" + # "Development Status :: 5 - Production/Stable" + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Operating System :: OS Independent", + "Topic :: Internet", +] +dependencies = [ + "google-api-core[grpc] >= 2.11.1, < 3.0.0", + "google-auth >= 2.14.1, < 3.0.0", + "google-cloud-core >= 2.4.1, < 3.0.0", + "google-resumable-media >= 2.0.0, < 3.0.0", + "packaging >= 24.2.0", + "python-dateutil >= 2.8.2, < 3.0.0", + "requests >= 2.21.0, < 3.0.0", +] +dynamic = ["version"] + +[project.urls] +Repository = "https://github.com/googleapis/python-bigquery" + +[project.optional-dependencies] +# bqstorage had a period where it was a required dependency, and has been +# moved back to optional due to bloat. See +# https://github.com/googleapis/python-bigquery/issues/1196 for more background. +bqstorage = [ + "google-cloud-bigquery-storage >= 2.18.0, < 3.0.0", + # Due to an issue in pip's dependency resolver, the `grpc` extra is not + # installed, even though `google-cloud-bigquery-storage` specifies it + # as `google-api-core[grpc]`. We thus need to explicitly specify it here. 
+ # See: https://github.com/googleapis/python-bigquery/issues/83 The + # grpc.Channel.close() method isn't added until 1.32.0. + # https://github.com/grpc/grpc/pull/15254 + "grpcio >= 1.47.0, < 2.0.0", + "grpcio >= 1.49.1, < 2.0.0; python_version >= '3.11'", + "grpcio >= 1.75.1, < 2.0.0; python_version >= '3.14'", + "pyarrow >= 4.0.0", +] +pandas = [ + "pandas >= 1.3.0", + "pandas-gbq >= 0.26.1", + "grpcio >= 1.47.0, < 2.0.0", + "grpcio >= 1.49.1, < 2.0.0; python_version >= '3.11'", + "grpcio >= 1.75.1, < 2.0.0; python_version >= '3.14'", + "pyarrow >= 3.0.0", + "db-dtypes >= 1.0.4, < 2.0.0", +] +ipywidgets = ["ipywidgets >= 7.7.1", "ipykernel >= 6.2.0"] +geopandas = ["geopandas >= 0.9.0, < 2.0.0", "Shapely >= 1.8.4, < 3.0.0"] +ipython = ["ipython >= 7.23.1", "bigquery-magics >= 0.6.0"] +matplotlib = [ + "matplotlib >= 3.7.1, <= 3.9.2; python_version == '3.9'", + "matplotlib >= 3.10.3; python_version >= '3.10'", +] +tqdm = ["tqdm >= 4.23.4, < 5.0.0"] +opentelemetry = [ + "opentelemetry-api >= 1.1.0", + "opentelemetry-sdk >= 1.1.0", + "opentelemetry-instrumentation >= 0.20b0", +] +bigquery_v2 = [ + "proto-plus >= 1.22.3, < 2.0.0", + "protobuf >= 3.20.2, < 7.0.0, != 4.21.0, != 4.21.1, != 4.21.2, != 4.21.3, != 4.21.4, != 4.21.5", # For the legacy proto-based types. +] +all = [ + "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,matplotlib,tqdm,opentelemetry,bigquery_v2]", +] + +[tool.setuptools.dynamic] +version = { attr = "google.cloud.bigquery.version.__version__" } + +[tool.setuptools.packages.find] +# Only include packages under the 'google' namespace. Do not include tests, +# benchmarks, etc. 
+include = ["google*"] diff --git a/renovate.json b/renovate.json index 39b2a0ec9..3ea143d4c 100644 --- a/renovate.json +++ b/renovate.json @@ -5,8 +5,15 @@ ":preserveSemverRanges", ":disableDependencyDashboard" ], - "ignorePaths": [".pre-commit-config.yaml", ".kokoro/requirements.txt", "setup.py"], + "ignorePaths": [".pre-commit-config.yaml", ".kokoro/requirements.txt", "setup.py", ".github/workflows/unittest.yml", ".github/workflows/docs.yml"], "pip_requirements": { "fileMatch": ["requirements-test.txt", "samples/[\\S/]*constraints.txt", "samples/[\\S/]*constraints-test.txt"] - } + }, + "packageRules": [ + { + "matchFileNames": ["pyproject.toml"], + "matchStrings": ["matplotlib (.*); python_version == '3.9'"], + "allowedVersions": ">= 3.7.1, <= 3.9.2" + } + ] } diff --git a/samples/client_query_shortmode.py b/samples/client_query_job_optional.py similarity index 69% rename from samples/client_query_shortmode.py rename to samples/client_query_job_optional.py index 50446dc48..6321aea35 100644 --- a/samples/client_query_shortmode.py +++ b/samples/client_query_job_optional.py @@ -13,16 +13,18 @@ # limitations under the License. -def client_query_shortmode() -> None: - # [START bigquery_query_shortquery] - # This example demonstrates issuing a query that may be run in short query mode. - # - # To enable the short query mode preview feature, the QUERY_PREVIEW_ENABLED - # environmental variable should be set to `TRUE`. +def client_query_job_optional() -> None: + # [START bigquery_query_job_optional] + # This example demonstrates executing a query without requiring an associated + # job. from google.cloud import bigquery + from google.cloud.bigquery.enums import JobCreationMode - # Construct a BigQuery client object. - client = bigquery.Client() + # Construct a BigQuery client object, specifying that the library should + # avoid creating jobs when possible. 
+ client = bigquery.Client( + default_job_creation_mode=JobCreationMode.JOB_CREATION_OPTIONAL + ) query = """ SELECT @@ -44,10 +46,12 @@ def client_query_shortmode() -> None: if rows.job_id is not None: print("Query was run with job state. Job ID: {}".format(rows.job_id)) else: - print("Query was run in short mode. Query ID: {}".format(rows.query_id)) + print( + "Query was run without creating a job. Query ID: {}".format(rows.query_id) + ) print("The query data:") for row in rows: # Row values can be accessed by field name or index. print("name={}, gender={}, total={}".format(row[0], row[1], row["total"])) - # [END bigquery_query_shortquery] + # [END bigquery_query_job_optional] diff --git a/samples/client_query_w_array_params.py b/samples/client_query_w_array_params.py index 25592a94a..e9d759f61 100644 --- a/samples/client_query_w_array_params.py +++ b/samples/client_query_w_array_params.py @@ -35,8 +35,8 @@ def client_query_w_array_params() -> None: bigquery.ArrayQueryParameter("states", "STRING", ["WA", "WI", "WV", "WY"]), ] ) - query_job = client.query(query, job_config=job_config) # Make an API request. + rows = client.query_and_wait(query, job_config=job_config) # Make an API request. - for row in query_job: + for row in rows: print("{}: \t{}".format(row.name, row.count)) # [END bigquery_query_params_arrays] diff --git a/samples/desktopapp/noxfile.py b/samples/desktopapp/noxfile.py index 3b7135946..db2333e5a 100644 --- a/samples/desktopapp/noxfile.py +++ b/samples/desktopapp/noxfile.py @@ -29,7 +29,7 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -BLACK_VERSION = "black==22.3.0" +BLACK_VERSION = "black==23.7.0" ISORT_VERSION = "isort==5.10.1" # Copy `noxfile_config.py` to your directory and modify it instead. @@ -89,7 +89,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. 
-ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/desktopapp/requirements-test.txt b/samples/desktopapp/requirements-test.txt index 68f9039cc..31b836790 100644 --- a/samples/desktopapp/requirements-test.txt +++ b/samples/desktopapp/requirements-test.txt @@ -1,4 +1,4 @@ -google-cloud-testutils==1.4.0 -pytest===7.4.4; python_version == '3.7' -pytest==8.3.2; python_version >= '3.8' -mock==5.1.0 +google-cloud-testutils==1.6.4 +pytest==8.4.2 +mock==5.2.0 +pytest-xdist==3.8.0 diff --git a/samples/desktopapp/requirements.txt b/samples/desktopapp/requirements.txt index dafb60b2a..56696f868 100644 --- a/samples/desktopapp/requirements.txt +++ b/samples/desktopapp/requirements.txt @@ -1,2 +1,2 @@ -google-cloud-bigquery==3.25.0 -google-auth-oauthlib==1.2.1 +google-cloud-bigquery==3.38.0 +google-auth-oauthlib==1.2.2 diff --git a/samples/desktopapp/user_credentials_test.py b/samples/desktopapp/user_credentials_test.py index 252b843c4..d14798d9b 100644 --- a/samples/desktopapp/user_credentials_test.py +++ b/samples/desktopapp/user_credentials_test.py @@ -13,7 +13,6 @@ # limitations under the License. 
import os -import sys from typing import Iterator, Union from unittest import mock @@ -24,13 +23,7 @@ PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] - -if sys.version_info >= (3, 8): - # Python 3.8+ has an AsyncMock attribute in unittest.mock, but 3.7 does not - MockType = Union[mock.MagicMock, mock.AsyncMock] -else: - # Other definitions and imports - MockType = Union[mock.MagicMock] +MockType = Union[mock.MagicMock, mock.AsyncMock] @pytest.fixture diff --git a/samples/geography/noxfile.py b/samples/geography/noxfile.py index 3b7135946..db2333e5a 100644 --- a/samples/geography/noxfile.py +++ b/samples/geography/noxfile.py @@ -29,7 +29,7 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -BLACK_VERSION = "black==22.3.0" +BLACK_VERSION = "black==23.7.0" ISORT_VERSION = "isort==5.10.1" # Copy `noxfile_config.py` to your directory and modify it instead. @@ -89,7 +89,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. 
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/geography/requirements-test.txt b/samples/geography/requirements-test.txt index 335236a14..6fb9ba310 100644 --- a/samples/geography/requirements-test.txt +++ b/samples/geography/requirements-test.txt @@ -1,3 +1,3 @@ -pytest===7.4.4; python_version == '3.7' -pytest==8.3.2; python_version >= '3.8' -mock==5.1.0 +pytest==8.4.2 +mock==5.2.0 +pytest-xdist==3.8.0 diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 1a1cf4b04..5f4d686b3 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,56 +1,44 @@ -attrs==24.1.0 -certifi==2024.7.4 -cffi===1.15.1; python_version == '3.7' -cffi==1.16.0; python_version >= '3.8' -charset-normalizer==3.3.2 -click==8.1.7 -click-plugins==1.1.1 +attrs==25.4.0 +certifi==2025.10.5 +cffi==2.0.0 +charset-normalizer==3.4.3 +click===8.1.8; python_version == '3.9' +click==8.3.0; python_version >= '3.10' +click-plugins==1.1.1.2 cligj==0.7.2 -dataclasses==0.8; python_version < '3.7' -db-dtypes==1.2.0 -Fiona==1.9.6 -geojson==3.1.0 -geopandas===0.10.2; python_version == '3.7' -geopandas===0.13.2; python_version == '3.8' -geopandas==1.0.1; python_version >= '3.9' -google-api-core==2.19.1 -google-auth==2.32.0 -google-cloud-bigquery==3.25.0 -google-cloud-bigquery-storage==2.25.0 -google-cloud-core==2.4.1 -google-crc32c==1.5.0 -google-resumable-media==2.7.1 -googleapis-common-protos==1.63.2 -grpcio===1.62.2; python_version == '3.7' -grpcio==1.65.4; python_version >= '3.8' -idna==3.7 +db-dtypes==1.4.3 +Fiona==1.10.1 +geojson==3.2.0 +geopandas===1.0.1; python_version <= '3.9' +geopandas==1.1.2; python_version >= '3.10' +google-api-core==2.25.2 +google-auth==2.41.1 +google-cloud-bigquery==3.38.0 +google-cloud-bigquery-storage==2.33.1 +google-cloud-core==2.4.3 +google-crc32c==1.7.1 +google-resumable-media==2.7.2 +googleapis-common-protos==1.70.0 +grpcio==1.75.1 +idna==3.10 munch==4.0.0 -mypy-extensions==1.0.0 
-packaging===24.0; python_version == '3.7' -packaging==24.1; python_version >= '3.8' -pandas===1.3.5; python_version == '3.7' -pandas===2.0.3; python_version == '3.8' -pandas==2.2.2; python_version >= '3.9' -proto-plus==1.24.0 -pyarrow==12.0.1; python_version == '3.7' -pyarrow==17.0.0; python_version >= '3.8' -pyasn1===0.5.1; python_version == '3.7' -pyasn1==0.6.0; python_version >= '3.8' -pyasn1-modules===0.3.0; python_version == '3.7' -pyasn1-modules==0.4.0; python_version >= '3.8' -pycparser===2.21; python_version == '3.7' -pycparser==2.22; python_version >= '3.8' -pyparsing==3.1.2 +mypy-extensions==1.1.0 +packaging==25.0 +pandas==2.3.3 +proto-plus==1.26.1 +pyarrow==21.0.0 +pyasn1==0.6.2 +pyasn1-modules==0.4.2 +pycparser==2.23 +pyparsing==3.2.5 python-dateutil==2.9.0.post0 -pytz==2024.1 -PyYAML==6.0.1 -requests==2.31.0; python_version == '3.7' -requests==2.32.3; python_version >= '3.8' -rsa==4.9 -Shapely==2.0.5 -six==1.16.0 -typing-extensions===4.7.1; python_version == '3.7' -typing-extensions==4.12.2; python_version >= '3.8' +pytz==2025.2 +PyYAML==6.0.3 +requests==2.32.5 +rsa==4.9.1 +Shapely===2.0.7; python_version == '3.9' +Shapely==2.1.2; python_version >= '3.10' +six==1.17.0 +typing-extensions==4.15.0 typing-inspect==0.9.0 -urllib3===1.26.18; python_version == '3.7' -urllib3==2.2.2; python_version >= '3.8' +urllib3==2.6.3 diff --git a/samples/magics/conftest.py b/samples/magics/conftest.py index 55ea30f90..0943c535a 100644 --- a/samples/magics/conftest.py +++ b/samples/magics/conftest.py @@ -18,7 +18,7 @@ import pytest if typing.TYPE_CHECKING: - from IPython.core.interactiveshell import TerminalInteractiveShell + from IPython.terminal.interactiveshell import TerminalInteractiveShell interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") tools = pytest.importorskip("IPython.testing.tools") @@ -40,5 +40,7 @@ def ipython_interactive( for the duration of the test scope. 
""" - with ipython.builtin_trap: + + trap = typing.cast(typing.ContextManager, ipython.builtin_trap) + with trap: yield ipython diff --git a/samples/magics/noxfile.py b/samples/magics/noxfile.py index 3b7135946..db2333e5a 100644 --- a/samples/magics/noxfile.py +++ b/samples/magics/noxfile.py @@ -29,7 +29,7 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -BLACK_VERSION = "black==22.3.0" +BLACK_VERSION = "black==23.7.0" ISORT_VERSION = "isort==5.10.1" # Copy `noxfile_config.py` to your directory and modify it instead. @@ -89,7 +89,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/magics/requirements-test.txt b/samples/magics/requirements-test.txt index 68f9039cc..31b836790 100644 --- a/samples/magics/requirements-test.txt +++ b/samples/magics/requirements-test.txt @@ -1,4 +1,4 @@ -google-cloud-testutils==1.4.0 -pytest===7.4.4; python_version == '3.7' -pytest==8.3.2; python_version >= '3.8' -mock==5.1.0 +google-cloud-testutils==1.6.4 +pytest==8.4.2 +mock==5.2.0 +pytest-xdist==3.8.0 diff --git a/samples/magics/requirements.txt b/samples/magics/requirements.txt index a1044c231..331e910e2 100644 --- a/samples/magics/requirements.txt +++ b/samples/magics/requirements.txt @@ -1,10 +1,6 @@ -bigquery_magics==0.1.0 -db-dtypes==1.2.0 -google.cloud.bigquery==3.25.0 -google-cloud-bigquery-storage==2.25.0 -ipython===7.31.1; python_version == '3.7' -ipython===8.0.1; python_version == '3.8' -ipython===8.18.1; python_version >= '3.9' -pandas===1.3.5; python_version == '3.7' -pandas===2.0.3; python_version == '3.8' -pandas==2.2.2; python_version >= '3.9' +bigquery_magics==0.10.3 +db-dtypes==1.4.3 
+google.cloud.bigquery==3.38.0 +google-cloud-bigquery-storage==2.33.1 +ipython===8.18.1 +pandas==2.3.3 diff --git a/samples/notebooks/jupyter_tutorial_test.py b/samples/notebooks/jupyter_tutorial_test.py index 2c2cf9390..1861a822f 100644 --- a/samples/notebooks/jupyter_tutorial_test.py +++ b/samples/notebooks/jupyter_tutorial_test.py @@ -45,7 +45,9 @@ def ipython_interactive( for the duration of the test scope. """ - with ipython.builtin_trap: + + trap = typing.cast(typing.ContextManager, ipython.builtin_trap) + with trap: yield ipython diff --git a/samples/notebooks/noxfile.py b/samples/notebooks/noxfile.py index 3b7135946..db2333e5a 100644 --- a/samples/notebooks/noxfile.py +++ b/samples/notebooks/noxfile.py @@ -29,7 +29,7 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -BLACK_VERSION = "black==22.3.0" +BLACK_VERSION = "black==23.7.0" ISORT_VERSION = "isort==5.10.1" # Copy `noxfile_config.py` to your directory and modify it instead. @@ -89,7 +89,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. 
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/notebooks/requirements-test.txt b/samples/notebooks/requirements-test.txt index 68f9039cc..31b836790 100644 --- a/samples/notebooks/requirements-test.txt +++ b/samples/notebooks/requirements-test.txt @@ -1,4 +1,4 @@ -google-cloud-testutils==1.4.0 -pytest===7.4.4; python_version == '3.7' -pytest==8.3.2; python_version >= '3.8' -mock==5.1.0 +google-cloud-testutils==1.6.4 +pytest==8.4.2 +mock==5.2.0 +pytest-xdist==3.8.0 diff --git a/samples/notebooks/requirements.txt b/samples/notebooks/requirements.txt index 81fa3782c..ef509734a 100644 --- a/samples/notebooks/requirements.txt +++ b/samples/notebooks/requirements.txt @@ -1,13 +1,9 @@ -bigquery-magics==0.1.0 -db-dtypes==1.2.0 -google-cloud-bigquery==3.25.0 -google-cloud-bigquery-storage==2.25.0 -ipython===7.31.1; python_version == '3.7' -ipython===8.0.1; python_version == '3.8' -ipython===8.18.1; python_version >= '3.9' -matplotlib===3.5.3; python_version == '3.7' -matplotlib===3.7.4; python_version == '3.8' -matplotlib==3.9.1; python_version >= '3.9' -pandas===1.3.5; python_version == '3.7' -pandas===2.0.3; python_version == '3.8' -pandas==2.2.2; python_version >= '3.9' +bigquery-magics==0.10.3 +db-dtypes==1.4.3 +google-cloud-bigquery==3.38.0 +google-cloud-bigquery-storage==2.33.1 +ipython===8.18.1; python_version == '3.9' +ipython==9.6.0; python_version >= '3.10' +matplotlib===3.9.2; python_version == '3.9' +matplotlib==3.10.6; python_version >= '3.10' +pandas==2.3.3 diff --git a/samples/snippets/label_job.py b/samples/snippets/label_job.py new file mode 100644 index 000000000..cfd06d189 --- /dev/null +++ b/samples/snippets/label_job.py @@ -0,0 +1,36 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def label_job() -> None: + # [START bigquery_label_job] + from google.cloud import bigquery + + client = bigquery.Client() + + sql = """ + SELECT corpus + FROM `bigquery-public-data.samples.shakespeare` + GROUP BY corpus; + """ + labels = {"color": "green"} + + config = bigquery.QueryJobConfig() + config.labels = labels + location = "us" + job = client.query(sql, location=location, job_config=config) + job_id = job.job_id + + print(f"Added {job.labels} to {job_id}.") + # [END bigquery_label_job] diff --git a/.github/.OwlBot.yaml b/samples/snippets/label_job_test.py similarity index 58% rename from .github/.OwlBot.yaml rename to samples/snippets/label_job_test.py index 8b142686c..0780db61a 100644 --- a/.github/.OwlBot.yaml +++ b/samples/snippets/label_job_test.py @@ -1,10 +1,10 @@ -# Copyright 2021 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,11 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-docker: - image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest +import typing -deep-remove-regex: - - /owl-bot-staging +import label_job # type: ignore -begin-after-commit-hash: f2de93abafa306b2ebadf1d10d947db8bcf2bf15 +if typing.TYPE_CHECKING: + import pytest + + +def test_label_job( + capsys: "pytest.CaptureFixture[str]", +) -> None: + label_job.label_job() + + out, _ = capsys.readouterr() + assert "color" in out + assert "green" in out diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 3b7135946..db2333e5a 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -29,7 +29,7 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -BLACK_VERSION = "black==22.3.0" +BLACK_VERSION = "black==23.7.0" ISORT_VERSION = "isort==5.10.1" # Copy `noxfile_config.py` to your directory and modify it instead. @@ -89,7 +89,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. 
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index 083b20271..901f1df1a 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,5 +1,5 @@ # samples/snippets should be runnable with no "extras" -google-cloud-testutils==1.4.0 -pytest===7.4.4; python_version == '3.7' -pytest==8.3.2; python_version >= '3.8' -mock==5.1.0 +google-cloud-testutils==1.6.4 +pytest==8.4.2 +mock==5.2.0 +pytest-xdist==3.8.0 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 9e181d963..441385536 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -google-cloud-bigquery==3.25.0 +google-cloud-bigquery==3.38.0 diff --git a/samples/snippets/view.py b/samples/snippets/view.py index 94f406890..30e719c79 100644 --- a/samples/snippets/view.py +++ b/samples/snippets/view.py @@ -147,7 +147,7 @@ def grant_access( # Make an API request to get the view dataset ACLs. view_dataset = client.get_dataset(view_dataset_id) - analyst_group_email = "data_analysts@example.com" + analyst_group_email = "example-analyst-group@google.com" # [END bigquery_grant_view_access] # To facilitate testing, we replace values with alternatives # provided by the testing harness. 
diff --git a/samples/snippets/view_test.py b/samples/snippets/view_test.py index dfa1cdeee..d46595695 100644 --- a/samples/snippets/view_test.py +++ b/samples/snippets/view_test.py @@ -114,7 +114,6 @@ def test_view( project_id, dataset_id, table_id = view_id.split(".") overrides: view.OverridesDict = { - "analyst_group_email": "cloud-dpes-bigquery@google.com", "view_dataset_id": view_dataset_id, "source_dataset_id": source_dataset_id, "view_reference": { @@ -127,5 +126,5 @@ def test_view( assert len(view_dataset.access_entries) != 0 assert len(source_dataset.access_entries) != 0 out, _ = capsys.readouterr() - assert "cloud-dpes-bigquery@google.com" in out + assert "example-analyst-group@google.com" in out assert table_id in out diff --git a/samples/tests/test_client_query_shortmode.py b/samples/tests/test_client_query_job_optional.py similarity index 85% rename from samples/tests/test_client_query_shortmode.py rename to samples/tests/test_client_query_job_optional.py index 41132f24c..0e0b2cf19 100644 --- a/samples/tests/test_client_query_shortmode.py +++ b/samples/tests/test_client_query_job_optional.py @@ -1,4 +1,4 @@ -# Copyright 2024 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,13 +14,13 @@ import typing -from .. import client_query_shortmode +from .. 
import client_query_job_optional if typing.TYPE_CHECKING: import pytest def test_client_query_shortmode(capsys: "pytest.CaptureFixture[str]") -> None: - client_query_shortmode.client_query_shortmode() + client_query_job_optional.client_query_job_optional() out, err = capsys.readouterr() assert "Query was run" in out diff --git a/samples/tests/test_download_public_data.py b/samples/tests/test_download_public_data.py index 02c2c6f9c..4f6c02452 100644 --- a/samples/tests/test_download_public_data.py +++ b/samples/tests/test_download_public_data.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging - import pytest from .. import download_public_data @@ -21,20 +19,9 @@ pytest.importorskip("google.cloud.bigquery_storage_v1") -def test_download_public_data( - caplog: pytest.LogCaptureFixture, capsys: pytest.CaptureFixture[str] -) -> None: - # Enable debug-level logging to verify the BigQuery Storage API is used. - caplog.set_level(logging.DEBUG) - +def test_download_public_data(capsys: pytest.CaptureFixture[str]) -> None: download_public_data.download_public_data() out, _ = capsys.readouterr() assert "year" in out assert "gender" in out assert "name" in out - - assert any( - "Started reading table 'bigquery-public-data.usa_names.usa_1910_current' with BQ Storage API session" - in message - for message in caplog.messages - ) diff --git a/samples/tests/test_download_public_data_sandbox.py b/samples/tests/test_download_public_data_sandbox.py index e86f604ad..d3dd31a38 100644 --- a/samples/tests/test_download_public_data_sandbox.py +++ b/samples/tests/test_download_public_data_sandbox.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging - import pytest from .. 
import download_public_data_sandbox @@ -21,20 +19,9 @@ pytest.importorskip("google.cloud.bigquery_storage_v1") -def test_download_public_data_sandbox( - caplog: pytest.LogCaptureFixture, capsys: pytest.CaptureFixture[str] -) -> None: - # Enable debug-level logging to verify the BigQuery Storage API is used. - caplog.set_level(logging.DEBUG) - +def test_download_public_data_sandbox(capsys: pytest.CaptureFixture[str]) -> None: download_public_data_sandbox.download_public_data_sandbox() - out, err = capsys.readouterr() + out, _ = capsys.readouterr() assert "year" in out assert "gender" in out assert "name" in out - - assert any( - # An anonymous table is used because this sample reads from query results. - ("Started reading table" in message and "BQ Storage API session" in message) - for message in caplog.messages - ) diff --git a/scripts/readme-gen/templates/install_deps.tmpl.rst b/scripts/readme-gen/templates/install_deps.tmpl.rst index 6f069c6c8..f21db80c4 100644 --- a/scripts/readme-gen/templates/install_deps.tmpl.rst +++ b/scripts/readme-gen/templates/install_deps.tmpl.rst @@ -12,7 +12,7 @@ Install Dependencies .. _Python Development Environment Setup Guide: https://cloud.google.com/python/setup -#. Create a virtualenv. Samples are compatible with Python 3.7+. +#. Create a virtualenv. Samples are compatible with Python 3.9+. .. code-block:: bash diff --git a/setup.cfg b/setup.cfg index 37b63aa49..d5e734f0f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,10 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Generated by synthtool. DO NOT EDIT! -[bdist_wheel] -universal = 1 - [pytype] python_version = 3.8 inputs = diff --git a/setup.py b/setup.py index 617685543..2ad29ecbf 100644 --- a/setup.py +++ b/setup.py @@ -12,131 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io -import os +import setuptools # type: ignore -import setuptools - -# Package metadata. - -name = "google-cloud-bigquery" -description = "Google BigQuery API client library" - -# Should be one of: -# 'Development Status :: 3 - Alpha' -# 'Development Status :: 4 - Beta' -# 'Development Status :: 5 - Production/Stable' -release_status = "Development Status :: 5 - Production/Stable" -dependencies = [ - "google-api-core[grpc] >= 2.11.1, <3.0.0dev", - "google-auth >= 2.14.1, <3.0.0dev", - "google-cloud-core >= 2.4.1, <3.0.0dev", - "google-resumable-media >= 2.0.0, < 3.0dev", - "packaging >= 20.0.0", - "python-dateutil >= 2.7.3, <3.0dev", - "requests >= 2.21.0, < 3.0.0dev", -] -pyarrow_dependency = "pyarrow >= 3.0.0" -extras = { - # bqstorage had a period where it was a required dependency, and has been - # moved back to optional due to bloat. See - # https://github.com/googleapis/python-bigquery/issues/1196 for more background. - "bqstorage": [ - "google-cloud-bigquery-storage >= 2.6.0, <3.0.0dev", - # Due to an issue in pip's dependency resolver, the `grpc` extra is not - # installed, even though `google-cloud-bigquery-storage` specifies it - # as `google-api-core[grpc]`. We thus need to explicitly specify it here. - # See: https://github.com/googleapis/python-bigquery/issues/83 The - # grpc.Channel.close() method isn't added until 1.32.0. 
- # https://github.com/grpc/grpc/pull/15254 - "grpcio >= 1.47.0, < 2.0dev", - "grpcio >= 1.49.1, < 2.0dev; python_version>='3.11'", - pyarrow_dependency, - ], - "pandas": [ - "pandas>=1.1.0", - pyarrow_dependency, - "db-dtypes>=0.3.0,<2.0.0dev", - "importlib_metadata>=1.0.0; python_version<'3.8'", - ], - "ipywidgets": [ - "ipywidgets>=7.7.0", - "ipykernel>=6.0.0", - ], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.8.4, <3.0.0dev"], - "ipython": [ - "bigquery-magics >= 0.1.0", - ], - "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], - "opentelemetry": [ - "opentelemetry-api >= 1.1.0", - "opentelemetry-sdk >= 1.1.0", - "opentelemetry-instrumentation >= 0.20b0", - ], - "bigquery_v2": [ - "proto-plus >= 1.22.3, <2.0.0dev", - "protobuf>=3.20.2,<6.0.0dev,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5", # For the legacy proto-based types. - ], -} - -all_extras = [] - -for extra in extras: - all_extras.extend(extras[extra]) - -extras["all"] = all_extras - -# Setup boilerplate below this line. - -package_root = os.path.abspath(os.path.dirname(__file__)) - -readme_filename = os.path.join(package_root, "README.rst") -with io.open(readme_filename, encoding="utf-8") as readme_file: - readme = readme_file.read() - -version = {} -with open(os.path.join(package_root, "google/cloud/bigquery/version.py")) as fp: - exec(fp.read(), version) -version = version["__version__"] - -# Only include packages under the 'google' namespace. Do not include tests, -# benchmarks, etc. 
-packages = [ - package - for package in setuptools.find_namespace_packages() - if package.startswith("google") -] - -setuptools.setup( - name=name, - version=version, - description=description, - long_description=readme, - author="Google LLC", - author_email="googleapis-packages@google.com", - license="Apache 2.0", - url="https://github.com/googleapis/python-bigquery", - classifiers=[ - release_status, - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Operating System :: OS Independent", - "Topic :: Internet", - ], - platforms="Posix; MacOS X; Windows", - packages=packages, - install_requires=dependencies, - extras_require=extras, - python_requires=">=3.7", - include_package_data=True, - zip_safe=False, -) +setuptools.setup() diff --git a/testing/constraints-3.13.txt b/testing/constraints-3.13.txt new file mode 100644 index 000000000..e69de29bb diff --git a/testing/constraints-3.14.txt b/testing/constraints-3.14.txt new file mode 100644 index 000000000..6bd20f5fb --- /dev/null +++ b/testing/constraints-3.14.txt @@ -0,0 +1,2 @@ +# Constraints for Python 3.14 +grpcio >= 1.75.1 diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt deleted file mode 100644 index 55e63449f..000000000 --- a/testing/constraints-3.7.txt +++ /dev/null @@ -1,36 +0,0 @@ -# This constraints file is used to check that lower bounds -# are correct in setup.py -# List *all* library dependencies and extras in this file. -# Pin the version to the lower bound. 
-# -# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", -# Then this file should have foo==1.14.0 -bigquery-magics==0.1.0 -db-dtypes==0.3.0 -geopandas==0.9.0 -google-api-core==2.11.1 -google-auth==2.14.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-core==2.4.1 -google-cloud-testutils==1.4.0 -google-crc32c==1.5.0 -google-resumable-media==2.0.0 -googleapis-common-protos==1.62.0 -grpcio==1.47.0 -grpcio-status==1.47.0 -ipywidgets==7.7.1 -ipython==7.23.1 -ipykernel==6.0.0 -opentelemetry-api==1.1.0 -opentelemetry-instrumentation==0.20b0 -opentelemetry-sdk==1.1.0 -packaging==20.0.0 -pandas==1.1.0 -proto-plus==1.22.3 -protobuf==3.20.2 -pyarrow==3.0.0 -python-dateutil==2.7.3 -requests==2.21.0 -Shapely==1.8.4 -six==1.13.0 -tqdm==4.7.4 diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt deleted file mode 100644 index e5e73c5c7..000000000 --- a/testing/constraints-3.8.txt +++ /dev/null @@ -1,2 +0,0 @@ -grpcio==1.47.0 -pandas==1.2.0 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index d4c302867..f61c0cf09 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,5 +4,30 @@ # # NOTE: Not comprehensive yet, will eventually be maintained semi-automatically by # the renovate bot. 
+bigquery-magics==0.6.0 +db-dtypes==1.0.4 +geopandas==0.9.0 +google-api-core==2.11.1 +google-auth==2.14.1 +google-cloud-bigquery-storage==2.18.0 +google-cloud-core==2.4.1 +google-resumable-media==2.0.0 grpcio==1.47.0 -pyarrow>=4.0.0 +grpcio==1.49.1; python_version >= '3.11' +ipywidgets==7.7.1 +ipython==7.23.1 +ipykernel==6.2.0 +opentelemetry-api==1.1.0 +opentelemetry-instrumentation==0.20b0 +opentelemetry-sdk==1.1.0 +numpy==1.20.2 +packaging==24.2.0 +pandas==1.3.0 +pandas-gbq==0.26.1 +proto-plus==1.22.3 +protobuf==3.20.2 +pyarrow==4.0.0 +python-dateutil==2.8.2 +requests==2.21.0 +Shapely==1.8.4 +matplotlib==3.7.1 diff --git a/tests/data/pico.csv b/tests/data/pico.csv new file mode 100644 index 000000000..bcc853040 --- /dev/null +++ b/tests/data/pico.csv @@ -0,0 +1,3 @@ +2025-01-01T00:00:00.123456789012Z +2025-01-02T00:00:00.123456789012Z +2025-01-03T00:00:00.123456789012Z \ No newline at end of file diff --git a/tests/data/pico_schema.json b/tests/data/pico_schema.json new file mode 100644 index 000000000..8227917ea --- /dev/null +++ b/tests/data/pico_schema.json @@ -0,0 +1,8 @@ +[ + { + "name": "pico_col", + "type": "TIMESTAMP", + "mode": "NULLABLE", + "timestampPrecision": "12" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 8efa042af..123aeb6e7 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -98,12 +98,14 @@ def load_scalars_table( data_path: str = "scalars.jsonl", source_format=enums.SourceFormat.NEWLINE_DELIMITED_JSON, schema_source="scalars_schema.json", + timestamp_target_precision=None, ) -> str: schema = bigquery_client.schema_from_json(DATA_DIR / schema_source) table_id = data_path.replace(".", "_") + hex(random.randrange(1000000)) job_config = bigquery.LoadJobConfig() job_config.schema = schema job_config.source_format = source_format + job_config.timestamp_target_precision = timestamp_target_precision full_table_id = f"{project_id}.{dataset_id}.{table_id}" with open(DATA_DIR / data_path, "rb") as 
data_file: job = bigquery_client.load_table_from_file( @@ -169,6 +171,23 @@ def scalars_table_csv( bigquery_client.delete_table(full_table_id, not_found_ok=True) +@pytest.fixture(scope="session") +def scalars_table_pico( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): + full_table_id = load_scalars_table( + bigquery_client, + project_id, + dataset_id, + data_path="pico.csv", + source_format=enums.SourceFormat.CSV, + schema_source="pico_schema.json", + timestamp_target_precision=[12], + ) + yield full_table_id + bigquery_client.delete_table(full_table_id, not_found_ok=True) + + @pytest.fixture def test_table_name(request, replace_non_anum=re.compile(r"[^a-zA-Z0-9_]").sub): return replace_non_anum("_", request.node.name) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index 82cf11f85..f2aed656c 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -194,3 +194,32 @@ def test_list_rows_range_csv( range_type = schema.field("range_date").type assert range_type == expected_type + + +def test_to_arrow_query_with_empty_results(bigquery_client): + """ + JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580. 
+ """ + job = bigquery_client.query( + """ + select + 123 as int_col, + '' as string_col, + to_json('{}') as json_col, + struct(to_json('[]') as json_field, -1 as int_field) as struct_col, + [to_json('null')] as json_array_col, + from unnest([]) + """ + ) + table = job.to_arrow() + assert list(table.column_names) == [ + "int_col", + "string_col", + "json_col", + "struct_col", + "json_array_col", + ] + assert table.shape == (0, 5) + struct_type = table.field("struct_col").type + assert struct_type.get_field_index("json_field") == 0 + assert struct_type.get_field_index("int_field") == 1 diff --git a/tests/system/test_client.py b/tests/system/test_client.py index 95c679a14..7e773598e 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -25,6 +25,8 @@ import time import unittest import uuid +import random +import string from typing import Optional from google.api_core.exceptions import PreconditionFailed @@ -45,6 +47,8 @@ from google.cloud import storage from google.cloud.datacatalog_v1 import types as datacatalog_types from google.cloud.datacatalog_v1 import PolicyTagManagerClient +from google.cloud.resourcemanager_v3 import types as resourcemanager_types +from google.cloud.resourcemanager_v3 import TagKeysClient, TagValuesClient import psutil import pytest from test_utils.retry import RetryErrors @@ -70,6 +74,16 @@ bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), ] +SCHEMA_PICOSECOND = [ + bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField( + "time_pico", + "TIMESTAMP", + mode="REQUIRED", + timestamp_precision=enums.TimestampPrecision.PICOSECOND, + ), +] CLUSTERING_SCHEMA = [ bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), @@ -93,6 +107,10 @@ ], ), ] +TABLE_CONSTRAINTS_SCHEMA = [ + bigquery.SchemaField("id", 
"INTEGER", mode="REQUIRED"), + bigquery.SchemaField("fk_id", "STRING", mode="REQUIRED"), +] SOURCE_URIS_AVRO = [ "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/a-twitter.avro", @@ -156,9 +174,12 @@ def setUpModule(): class TestBigQuery(unittest.TestCase): def setUp(self): self.to_delete = [] + self.to_delete_tag_keys_values = [] def tearDown(self): policy_tag_client = PolicyTagManagerClient() + tag_keys_client = TagKeysClient() + tag_values_client = TagValuesClient() def _still_in_use(bad_request): return any( @@ -181,6 +202,18 @@ def _still_in_use(bad_request): else: doomed.delete() + # The TagKey cannot be deleted if it has any child TagValues. + for key_values in self.to_delete_tag_keys_values: + tag_key = key_values.pop() + + # Delete tag values first + [ + tag_values_client.delete_tag_value(name=tag_value.name).result() + for tag_value in key_values + ] + + tag_keys_client.delete_tag_key(name=tag_key.name).result() + def test_get_service_account_email(self): client = Config.CLIENT @@ -278,24 +311,74 @@ def test_create_dataset_with_default_rounding_mode(self): self.assertTrue(_dataset_exists(dataset)) self.assertEqual(dataset.default_rounding_mode, "ROUND_HALF_EVEN") + def _create_resource_tag_key_and_values(self, key, values): + tag_key_client = TagKeysClient() + tag_value_client = TagValuesClient() + + tag_key_parent = f"projects/{Config.CLIENT.project}" + new_tag_key = resourcemanager_types.TagKey( + short_name=key, parent=tag_key_parent + ) + tag_key = tag_key_client.create_tag_key(tag_key=new_tag_key).result() + self.to_delete_tag_keys_values.insert(0, [tag_key]) + + for value in values: + new_tag_value = resourcemanager_types.TagValue( + short_name=value, parent=tag_key.name + ) + tag_value = tag_value_client.create_tag_value( + tag_value=new_tag_value + ).result() + self.to_delete_tag_keys_values[0].insert(0, tag_value) + def test_update_dataset(self): dataset = self.temp_dataset(_make_dataset_id("update_dataset")) 
self.assertTrue(_dataset_exists(dataset)) self.assertIsNone(dataset.friendly_name) self.assertIsNone(dataset.description) self.assertEqual(dataset.labels, {}) + self.assertEqual(dataset.resource_tags, {}) self.assertIs(dataset.is_case_insensitive, False) + # This creates unique tag keys for each of test runnings for different Python versions + tag_postfix = "".join(random.choices(string.ascii_letters + string.digits, k=4)) + tag_1 = f"env_{tag_postfix}" + tag_2 = f"component_{tag_postfix}" + tag_3 = f"project_{tag_postfix}" + + # Tags need to be created before they can be used in a dataset. + self._create_resource_tag_key_and_values(tag_1, ["prod", "dev"]) + self._create_resource_tag_key_and_values(tag_2, ["batch"]) + self._create_resource_tag_key_and_values(tag_3, ["atlas"]) + dataset.friendly_name = "Friendly" dataset.description = "Description" dataset.labels = {"priority": "high", "color": "blue"} + dataset.resource_tags = { + f"{Config.CLIENT.project}/{tag_1}": "prod", + f"{Config.CLIENT.project}/{tag_2}": "batch", + } dataset.is_case_insensitive = True ds2 = Config.CLIENT.update_dataset( - dataset, ("friendly_name", "description", "labels", "is_case_insensitive") + dataset, + ( + "friendly_name", + "description", + "labels", + "resource_tags", + "is_case_insensitive", + ), ) self.assertEqual(ds2.friendly_name, "Friendly") self.assertEqual(ds2.description, "Description") self.assertEqual(ds2.labels, {"priority": "high", "color": "blue"}) + self.assertEqual( + ds2.resource_tags, + { + f"{Config.CLIENT.project}/{tag_1}": "prod", + f"{Config.CLIENT.project}/{tag_2}": "batch", + }, + ) self.assertIs(ds2.is_case_insensitive, True) ds2.labels = { @@ -303,8 +386,25 @@ def test_update_dataset(self): "shape": "circle", # add "priority": None, # delete } - ds3 = Config.CLIENT.update_dataset(ds2, ["labels"]) + ds2.resource_tags = { + f"{Config.CLIENT.project}/{tag_1}": "dev", # change + f"{Config.CLIENT.project}/{tag_3}": "atlas", # add + 
f"{Config.CLIENT.project}/{tag_2}": None, # delete + } + ds3 = Config.CLIENT.update_dataset(ds2, ["labels", "resource_tags"]) self.assertEqual(ds3.labels, {"color": "green", "shape": "circle"}) + self.assertEqual( + ds3.resource_tags, + { + f"{Config.CLIENT.project}/{tag_1}": "dev", + f"{Config.CLIENT.project}/{tag_3}": "atlas", + }, + ) + + # Remove all tags + ds3.resource_tags = None + ds4 = Config.CLIENT.update_dataset(ds3, ["resource_tags"]) + self.assertEqual(ds4.resource_tags, {}) # If we try to update using d2 again, it will fail because the # previous update changed the ETag. @@ -541,6 +641,19 @@ def test_create_table_w_time_partitioning_w_clustering_fields(self): self.assertEqual(time_partitioning.field, "transaction_time") self.assertEqual(table.clustering_fields, ["user_email", "store_code"]) + def test_create_table_w_picosecond_timestamp(self): + dataset = self.temp_dataset(_make_dataset_id("create_table")) + table_id = "test_table" + table_arg = Table(dataset.table(table_id), schema=SCHEMA_PICOSECOND) + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + + self.assertTrue(_table_exists(table)) + self.assertEqual(table.table_id, table_id) + self.assertEqual(table.schema, SCHEMA_PICOSECOND) + def test_delete_dataset_with_string(self): dataset_id = _make_dataset_id("delete_table_true_with_string") project = Config.CLIENT.project @@ -646,6 +759,16 @@ def test_list_tables(self): def test_update_table(self): dataset = self.temp_dataset(_make_dataset_id("update_table")) + # This creates unique tag keys for each of test runnings for different Python versions + tag_postfix = "".join(random.choices(string.ascii_letters + string.digits, k=4)) + tag_1 = f"owner_{tag_postfix}" + tag_2 = f"classification_{tag_postfix}" + tag_3 = f"env_{tag_postfix}" + + self._create_resource_tag_key_and_values(tag_1, ["Alice", "Bob"]) + self._create_resource_tag_key_and_values(tag_2, 
["public"]) + self._create_resource_tag_key_and_values(tag_3, ["dev"]) + TABLE_NAME = "test_table" table_arg = Table(dataset.table(TABLE_NAME), schema=SCHEMA) self.assertFalse(_table_exists(table_arg)) @@ -658,14 +781,25 @@ def test_update_table(self): table.friendly_name = "Friendly" table.description = "Description" table.labels = {"priority": "high", "color": "blue"} + table.resource_tags = { + f"{Config.CLIENT.project}/{tag_1}": "Alice", + f"{Config.CLIENT.project}/{tag_3}": "dev", + } table2 = Config.CLIENT.update_table( - table, ["friendly_name", "description", "labels"] + table, ["friendly_name", "description", "labels", "resource_tags"] ) self.assertEqual(table2.friendly_name, "Friendly") self.assertEqual(table2.description, "Description") self.assertEqual(table2.labels, {"priority": "high", "color": "blue"}) + self.assertEqual( + table2.resource_tags, + { + f"{Config.CLIENT.project}/{tag_1}": "Alice", + f"{Config.CLIENT.project}/{tag_3}": "dev", + }, + ) table2.description = None table2.labels = { @@ -673,9 +807,28 @@ def test_update_table(self): "shape": "circle", # add "priority": None, # delete } - table3 = Config.CLIENT.update_table(table2, ["description", "labels"]) + table2.resource_tags = { + f"{Config.CLIENT.project}/{tag_1}": "Bob", # change + f"{Config.CLIENT.project}/{tag_2}": "public", # add + f"{Config.CLIENT.project}/{tag_3}": None, # delete + } + table3 = Config.CLIENT.update_table( + table2, ["description", "labels", "resource_tags"] + ) self.assertIsNone(table3.description) self.assertEqual(table3.labels, {"color": "green", "shape": "circle"}) + self.assertEqual( + table3.resource_tags, + { + f"{Config.CLIENT.project}/{tag_1}": "Bob", + f"{Config.CLIENT.project}/{tag_2}": "public", + }, + ) + + # Delete resource tag bindings. 
+ table3.resource_tags = None + table4 = Config.CLIENT.update_table(table3, ["resource_tags"]) + self.assertEqual(table4.resource_tags, {}) # If we try to update using table2 again, it will fail because the # previous update changed the ETag. @@ -775,6 +928,126 @@ def test_update_table_clustering_configuration(self): table3 = Config.CLIENT.update_table(table2, ["clustering_fields"]) self.assertIsNone(table3.clustering_fields, None) + def test_update_table_constraints(self): + from google.cloud.bigquery.table import TableConstraints + from google.cloud.bigquery.table import ( + PrimaryKey, + ForeignKey, + TableReference, + ColumnReference, + ) + + dataset = self.temp_dataset(_make_dataset_id("update_table")) + + TABLE_NAME = "test_table" + table_arg = Table(dataset.table(TABLE_NAME), schema=TABLE_CONSTRAINTS_SCHEMA) + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + self.assertTrue(_table_exists(table)) + + REFERENCE_TABLE_NAME = "test_table2" + reference_table_arg = Table( + dataset.table(REFERENCE_TABLE_NAME), + schema=[ + bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), + ], + ) + reference_table = helpers.retry_403(Config.CLIENT.create_table)( + reference_table_arg + ) + self.to_delete.insert(0, reference_table) + self.assertTrue(_table_exists(reference_table)) + + reference_table.table_constraints = TableConstraints( + primary_key=PrimaryKey(columns=["id"]), foreign_keys=None + ) + reference_table2 = Config.CLIENT.update_table( + reference_table, ["table_constraints"] + ) + self.assertEqual( + reference_table2.table_constraints.primary_key, + reference_table.table_constraints.primary_key, + ) + + table_constraints = TableConstraints( + primary_key=PrimaryKey(columns=["id"]), + foreign_keys=[ + ForeignKey( + name="fk_id", + referenced_table=TableReference(dataset, "test_table2"), + column_references=[ + ColumnReference(referencing_column="id", 
referenced_column="id") + ], + ), + ], + ) + + table.table_constraints = table_constraints + table2 = Config.CLIENT.update_table(table, ["table_constraints"]) + self.assertEqual( + table2.table_constraints, + table_constraints, + ) + + table2.table_constraints = None + table3 = Config.CLIENT.update_table(table2, ["table_constraints"]) + self.assertIsNone(table3.table_constraints, None) + + reference_table2.table_constraints = None + reference_table3 = Config.CLIENT.update_table( + reference_table2, ["table_constraints"] + ) + self.assertIsNone(reference_table3.table_constraints, None) + + def test_update_table_autodetect_schema(self): + dataset = self.temp_dataset(_make_dataset_id("bq_update_table_test")) + + # Create an external table, restrict schema to one field + TABLE_NAME = "test_table" + set_schema = [bigquery.SchemaField("username", "STRING", mode="NULLABLE")] + table_arg = Table(dataset.table(TABLE_NAME)) + + # Create an external_config and include it in the table arguments + external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.AVRO) + external_config.source_uris = SOURCE_URIS_AVRO + external_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_AVRO + external_config.schema = set_schema + table_arg.external_data_configuration = external_config + + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + self.assertTrue(_table_exists(table)) + + self.assertEqual(table.schema, set_schema) + + # Update table with schema autodetection + updated_table_arg = Table(dataset.table(TABLE_NAME)) + + # Update the external_config and include it in the table arguments + updated_external_config = copy.deepcopy(external_config) + updated_external_config.autodetect = True + updated_external_config.schema = None + updated_table_arg.external_data_configuration = updated_external_config + + # PATCH call with autodetect_schema=True to trigger schema inference + 
updated_table = Config.CLIENT.update_table( + updated_table_arg, ["external_data_configuration"], autodetect_schema=True + ) + + # The updated table should have a schema inferred from the reference + # file, which has all four fields. + expected_schema = [ + bigquery.SchemaField("username", "STRING", mode="NULLABLE"), + bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"), + bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"), + bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"), + ] + self.assertEqual(updated_table.schema, expected_schema) + @staticmethod def _fetch_single_page(table, selected_fields=None): iterator = Config.CLIENT.list_rows(table, selected_fields=selected_fields) @@ -1022,6 +1295,29 @@ def test_load_table_from_json_schema_autodetect_table_exists(self): self.assertEqual(tuple(table.schema), table_schema) self.assertEqual(table.num_rows, 2) + def test_load_table_from_csv_w_picosecond_timestamp(self): + dataset_id = _make_dataset_id("bq_system_test") + self.temp_dataset(dataset_id) + table_id = "{}.{}.load_table_from_json_basic_use".format( + Config.CLIENT.project, dataset_id + ) + + table_schema = Config.CLIENT.schema_from_json(DATA_PATH / "pico_schema.json") + # create the table before loading so that the column order is predictable + table = helpers.retry_403(Config.CLIENT.create_table)( + Table(table_id, schema=table_schema) + ) + self.to_delete.insert(0, table) + + # do not pass an explicit job config to trigger automatic schema detection + with open(DATA_PATH / "pico.csv", "rb") as f: + load_job = Config.CLIENT.load_table_from_file(f, table_id) + load_job.result() + + table = Config.CLIENT.get_table(table) + self.assertEqual(list(table.schema), table_schema) + self.assertEqual(table.num_rows, 3) + def test_load_avro_from_uri_then_dump_table(self): from google.cloud.bigquery.job import CreateDisposition from google.cloud.bigquery.job import SourceFormat diff --git a/tests/system/test_list_rows.py 
b/tests/system/test_list_rows.py index 108b842ce..02b07744b 100644 --- a/tests/system/test_list_rows.py +++ b/tests/system/test_list_rows.py @@ -132,3 +132,23 @@ def test_list_rows_range(bigquery_client: bigquery.Client, scalars_table_csv: st row_null = rows[1] assert row_null["range_date"] is None + + +def test_list_rows_pico(bigquery_client: bigquery.Client, scalars_table_pico: str): + rows = bigquery_client.list_rows( + scalars_table_pico, timestamp_precision=enums.TimestampPrecision.PICOSECOND + ) + rows = list(rows) + row = rows[0] + assert row["pico_col"] == "2025-01-01T00:00:00.123456789012Z" + + +def test_list_rows_pico_truncate( + bigquery_client: bigquery.Client, scalars_table_pico: str +): + # For a picosecond timestamp column, if the user does not explicitly set + # timestamp_precision, will return truncated microsecond precision. + rows = bigquery_client.list_rows(scalars_table_pico) + rows = list(rows) + row = rows[0] + assert row["pico_col"] == "1735689600123456" diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 85c7b79e6..1fe7ff2cd 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -1259,7 +1259,7 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id): df = pandas.DataFrame( dict( dt=[ - datetime.datetime(2020, 1, 8, 8, 0, 0), + datetime.datetime(2020, 1, 8, 8, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime( 2020, 1, @@ -1299,6 +1299,32 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id): ] +def test_to_dataframe_query_with_empty_results(bigquery_client): + """ + JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580. 
+ """ + job = bigquery_client.query( + """ + select + 123 as int_col, + '' as string_col, + to_json('{}') as json_col, + struct(to_json('[]') as json_field, -1 as int_field) as struct_col, + [to_json('null')] as json_array_col, + from unnest([]) + """ + ) + df = job.to_dataframe() + assert list(df.columns) == [ + "int_col", + "string_col", + "json_col", + "struct_col", + "json_array_col", + ] + assert len(df.index) == 0 + + def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): wkt = pytest.importorskip("shapely.wkt") bigquery_client.query( diff --git a/tests/system/test_query.py b/tests/system/test_query.py index d94a117e3..b8bb06a4c 100644 --- a/tests/system/test_query.py +++ b/tests/system/test_query.py @@ -21,6 +21,7 @@ import pytest from google.cloud import bigquery +from google.cloud.bigquery import enums from google.cloud.bigquery.query import ArrayQueryParameter from google.cloud.bigquery.query import ScalarQueryParameter from google.cloud.bigquery.query import ScalarQueryParameterType @@ -546,3 +547,15 @@ def test_session(bigquery_client: bigquery.Client, query_api_method: str): assert len(rows) == 1 assert rows[0][0] == 5 + + +def test_query_picosecond(bigquery_client: bigquery.Client): + job = bigquery_client.query( + "SELECT CAST('2025-10-20' AS TIMESTAMP(12));", + api_method="QUERY", + timestamp_precision=enums.TimestampPrecision.PICOSECOND, + ) + + result = job.result() + rows = list(result) + assert rows[0][0] == "2025-10-20T00:00:00.000000000000Z" diff --git a/tests/unit/_helpers/test_cell_data_parser.py b/tests/unit/_helpers/test_cell_data_parser.py new file mode 100644 index 000000000..f75e63b48 --- /dev/null +++ b/tests/unit/_helpers/test_cell_data_parser.py @@ -0,0 +1,476 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import datetime +import decimal +import json + +from dateutil.relativedelta import relativedelta +import pytest + +import google.cloud.bigquery.schema + + +def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs): + return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs) + + +@pytest.fixture +def mut(): + from google.cloud.bigquery import _helpers + + return _helpers + + +@pytest.fixture +def object_under_test(mut): + return mut.CELL_DATA_PARSER + + +ALL_TYPES = { + "BOOL", + "BOOLEAN", + "BYTES", + "INTEGER", + "INT64", + "INTERVAL", + "FLOAT", + "FLOAT64", + "NUMERIC", + "BIGNUMERIC", + "STRING", + "GEOGRAPHY", + "TIMESTAMP", + "DATETIME", + "DATE", + "TIME", + "RECORD", + "STRUCT", + "JSON", + "RANGE", +} + +TYPES_WITH_CLIENT_SIDE_NULL_VALIDATION = ALL_TYPES - { + "STRING", + "GEOGRAPHY", +} + + +@pytest.mark.parametrize( + "type_", + list(sorted(ALL_TYPES)), +) +def test_to_py_w_none_nullable(object_under_test, type_): + assert object_under_test.to_py(None, create_field("NULLABLE", type_)) is None + + +@pytest.mark.parametrize("type_", list(sorted(TYPES_WITH_CLIENT_SIDE_NULL_VALIDATION))) +def test_to_py_w_none_required(object_under_test, type_): + with pytest.raises(TypeError): + object_under_test.to_py(None, create_field("REQUIRED", type_)) + + +def test_interval_to_py_w_invalid_format(object_under_test): + with pytest.raises(ValueError, match="NOT_AN_INTERVAL"): + object_under_test.interval_to_py("NOT_AN_INTERVAL", create_field()) + + +@pytest.mark.parametrize( + ("value", 
"expected"), + ( + ("0-0 0 0:0:0", relativedelta()), + # SELECT INTERVAL X YEAR + ("-10000-0 0 0:0:0", relativedelta(years=-10000)), + ("-1-0 0 0:0:0", relativedelta(years=-1)), + ("1-0 0 0:0:0", relativedelta(years=1)), + ("10000-0 0 0:0:0", relativedelta(years=10000)), + # SELECT INTERVAL X MONTH + ("-0-11 0 0:0:0", relativedelta(months=-11)), + ("-0-1 0 0:0:0", relativedelta(months=-1)), + ("0-1 0 0:0:0", relativedelta(months=1)), + ("0-11 0 0:0:0", relativedelta(months=11)), + # SELECT INTERVAL X DAY + ("0-0 -3660000 0:0:0", relativedelta(days=-3660000)), + ("0-0 -1 0:0:0", relativedelta(days=-1)), + ("0-0 1 0:0:0", relativedelta(days=1)), + ("0-0 3660000 0:0:0", relativedelta(days=3660000)), + # SELECT INTERVAL X HOUR + ("0-0 0 -87840000:0:0", relativedelta(hours=-87840000)), + ("0-0 0 -1:0:0", relativedelta(hours=-1)), + ("0-0 0 1:0:0", relativedelta(hours=1)), + ("0-0 0 87840000:0:0", relativedelta(hours=87840000)), + # SELECT INTERVAL X MINUTE + ("0-0 0 -0:59:0", relativedelta(minutes=-59)), + ("0-0 0 -0:1:0", relativedelta(minutes=-1)), + ("0-0 0 0:1:0", relativedelta(minutes=1)), + ("0-0 0 0:59:0", relativedelta(minutes=59)), + # SELECT INTERVAL X SECOND + ("0-0 0 -0:0:59", relativedelta(seconds=-59)), + ("0-0 0 -0:0:1", relativedelta(seconds=-1)), + ("0-0 0 0:0:1", relativedelta(seconds=1)), + ("0-0 0 0:0:59", relativedelta(seconds=59)), + # SELECT (INTERVAL -1 SECOND) / 1000000 + ("0-0 0 -0:0:0.000001", relativedelta(microseconds=-1)), + ("0-0 0 -0:0:59.999999", relativedelta(seconds=-59, microseconds=-999999)), + ("0-0 0 -0:0:59.999", relativedelta(seconds=-59, microseconds=-999000)), + ("0-0 0 0:0:59.999", relativedelta(seconds=59, microseconds=999000)), + ("0-0 0 0:0:59.999999", relativedelta(seconds=59, microseconds=999999)), + # Test with multiple digits in each section. 
+ ( + "32-11 45 67:16:23.987654", + relativedelta( + years=32, + months=11, + days=45, + hours=67, + minutes=16, + seconds=23, + microseconds=987654, + ), + ), + ( + "-32-11 -45 -67:16:23.987654", + relativedelta( + years=-32, + months=-11, + days=-45, + hours=-67, + minutes=-16, + seconds=-23, + microseconds=-987654, + ), + ), + # Test with mixed +/- sections. + ( + "9999-9 -999999 9999999:59:59.999999", + relativedelta( + years=9999, + months=9, + days=-999999, + hours=9999999, + minutes=59, + seconds=59, + microseconds=999999, + ), + ), + # Test with fraction that is not microseconds. + ("0-0 0 0:0:42.", relativedelta(seconds=42)), + ("0-0 0 0:0:59.1", relativedelta(seconds=59, microseconds=100000)), + ("0-0 0 0:0:0.12", relativedelta(microseconds=120000)), + ("0-0 0 0:0:0.123", relativedelta(microseconds=123000)), + ("0-0 0 0:0:0.1234", relativedelta(microseconds=123400)), + # Fractional seconds can cause rounding problems if cast to float. See: + # https://github.com/googleapis/python-db-dtypes-pandas/issues/18 + ("0-0 0 0:0:59.876543", relativedelta(seconds=59, microseconds=876543)), + ( + "0-0 0 01:01:01.010101", + relativedelta(hours=1, minutes=1, seconds=1, microseconds=10101), + ), + ( + "0-0 0 09:09:09.090909", + relativedelta(hours=9, minutes=9, seconds=9, microseconds=90909), + ), + ( + "0-0 0 11:11:11.111111", + relativedelta(hours=11, minutes=11, seconds=11, microseconds=111111), + ), + ( + "0-0 0 19:16:23.987654", + relativedelta(hours=19, minutes=16, seconds=23, microseconds=987654), + ), + # Nanoseconds are not expected, but should not cause error. 
+ ("0-0 0 0:0:00.123456789", relativedelta(microseconds=123456)), + ("0-0 0 0:0:59.87654321", relativedelta(seconds=59, microseconds=876543)), + ), +) +def test_interval_to_py_w_string_values(object_under_test, value, expected): + got = object_under_test.interval_to_py(value, create_field()) + assert got == expected + + +def test_integer_to_py_w_string_value(object_under_test): + coerced = object_under_test.integer_to_py("42", object()) + assert coerced == 42 + + +def test_integer_to_py_w_float_value(object_under_test): + coerced = object_under_test.integer_to_py(42.0, object()) + assert coerced == 42 + + +def test_json_to_py_w_json_field(object_under_test): + data_field = create_field("REQUIRED", "data", "JSON") + + value = json.dumps( + {"v": {"key": "value"}}, + ) + + expected_output = {"v": {"key": "value"}} + coerced_output = object_under_test.json_to_py(value, data_field) + assert coerced_output == expected_output + + +def test_json_to_py_w_string_value(object_under_test): + coerced = object_under_test.json_to_py('"foo"', create_field()) + assert coerced == "foo" + + +def test_float_to_py_w_string_value(object_under_test): + coerced = object_under_test.float_to_py("3.1415", object()) + assert coerced == 3.1415 + + +def test_float_to_py_w_float_value(object_under_test): + coerced = object_under_test.float_to_py(3.1415, object()) + assert coerced == 3.1415 + + +def test_numeric_to_py_w_string_value(object_under_test): + coerced = object_under_test.numeric_to_py("3.1415", object()) + assert coerced == decimal.Decimal("3.1415") + + +def test_numeric_to_py_w_float_value(object_under_test): + coerced = object_under_test.numeric_to_py(3.1415, object()) + # There is no exact float representation of 3.1415. 
+ assert coerced == decimal.Decimal(3.1415) + + +def test_bool_to_py_w_value_t(object_under_test): + coerced = object_under_test.bool_to_py("T", object()) + assert coerced is True + + +def test_bool_to_py_w_value_true(object_under_test): + coerced = object_under_test.bool_to_py("True", object()) + assert coerced is True + + +def test_bool_to_py_w_value_1(object_under_test): + coerced = object_under_test.bool_to_py("1", object()) + assert coerced is True + + +def test_bool_to_py_w_value_other(object_under_test): + coerced = object_under_test.bool_to_py("f", object()) + assert coerced is False + + +def test_string_to_py_w_string_value(object_under_test): + coerced = object_under_test.string_to_py("Wonderful!", object()) + assert coerced == "Wonderful!" + + +def test_bytes_to_py_w_base64_encoded_bytes(object_under_test): + expected = b"Wonderful!" + encoded = base64.standard_b64encode(expected) + coerced = object_under_test.bytes_to_py(encoded, object()) + assert coerced == expected + + +def test_bytes_to_py_w_base64_encoded_text(object_under_test): + expected = b"Wonderful!" 
+ encoded = base64.standard_b64encode(expected).decode("ascii") + coerced = object_under_test.bytes_to_py(encoded, object()) + assert coerced == expected + + +def test_timestamp_to_py_w_string_int_value(object_under_test): + from google.cloud._helpers import _EPOCH + + coerced = object_under_test.timestamp_to_py("1234567", create_field()) + assert coerced == _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) + + +def test_timestamp_to_py_w_int_value(object_under_test): + from google.cloud._helpers import _EPOCH + + coerced = object_under_test.timestamp_to_py(1234567, create_field()) + assert coerced == _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) + + +def test_timestamp_to_py_w_picosecond_precision(object_under_test): + from google.cloud.bigquery import enums + + pico_schema = create_field(timestamp_precision=enums.TimestampPrecision.PICOSECOND) + pico_timestamp = "2025-01-01T00:00:00.123456789012Z" + coerced = object_under_test.timestamp_to_py(pico_timestamp, pico_schema) + assert coerced == pico_timestamp + + +def test_datetime_to_py_w_string_value(object_under_test): + coerced = object_under_test.datetime_to_py("2016-12-02T18:51:33", object()) + assert coerced == datetime.datetime(2016, 12, 2, 18, 51, 33) + + +def test_datetime_to_py_w_microseconds(object_under_test): + coerced = object_under_test.datetime_to_py("2015-05-22T10:11:12.987654", object()) + assert coerced == datetime.datetime(2015, 5, 22, 10, 11, 12, 987654) + + +def test_date_to_py_w_string_value(object_under_test): + coerced = object_under_test.date_to_py("1987-09-22", object()) + assert coerced == datetime.date(1987, 9, 22) + + +def test_time_to_py_w_string_value(object_under_test): + coerced = object_under_test.time_to_py("12:12:27", object()) + assert coerced == datetime.time(12, 12, 27) + + +def test_time_to_py_w_subsecond_string_value(object_under_test): + coerced = object_under_test.time_to_py("12:12:27.123456", object()) + assert coerced == datetime.time(12, 12, 27, 
123456) + + +def test_time_to_py_w_bogus_string_value(object_under_test): + with pytest.raises(ValueError): + object_under_test.time_to_py("12:12:27.123", object()) + + +def test_range_to_py_w_wrong_format(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type="DATE", + ) + with pytest.raises(ValueError): + object_under_test.range_to_py("[2009-06-172019-06-17)", range_field) + + +def test_range_to_py_w_wrong_element_type(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type=google.cloud.bigquery.schema.FieldElementType( + element_type="TIME" + ), + ) + with pytest.raises(ValueError): + object_under_test.range_to_py("[15:31:38, 15:50:38)", range_field) + + +def test_range_to_py_w_unbounded_value(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type="DATE", + ) + coerced = object_under_test.range_to_py("[UNBOUNDED, 2019-06-17)", range_field) + assert coerced == {"start": None, "end": datetime.date(2019, 6, 17)} + + +def test_range_to_py_w_date_value(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type="DATE", + ) + coerced = object_under_test.range_to_py("[2009-06-17, 2019-06-17)", range_field) + assert coerced == { + "start": datetime.date(2009, 6, 17), + "end": datetime.date(2019, 6, 17), + } + + +def test_range_to_py_w_datetime_value(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type=google.cloud.bigquery.schema.FieldElementType( + element_type="DATETIME" + ), + ) + coerced = object_under_test.range_to_py( + "[2009-06-17T13:45:30, 2019-06-17T13:45:30)", range_field + ) + assert coerced == { + "start": datetime.datetime(2009, 6, 17, 13, 45, 30), + "end": datetime.datetime(2019, 6, 17, 13, 45, 30), + } + + +def test_range_to_py_w_timestamp_value(object_under_test): + from google.cloud._helpers import _EPOCH + + range_field = create_field( + "NULLABLE", + 
"RANGE", + range_element_type=google.cloud.bigquery.schema.FieldElementType( + element_type="TIMESTAMP" + ), + ) + coerced = object_under_test.range_to_py("[1234567, 1234789)", range_field) + assert coerced == { + "start": _EPOCH + datetime.timedelta(seconds=1, microseconds=234567), + "end": _EPOCH + datetime.timedelta(seconds=1, microseconds=234789), + } + + +def test_record_to_py_w_nullable_subfield_none(object_under_test): + subfield = create_field("NULLABLE", "INTEGER", name="age") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": None}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"age": None} + + +def test_record_to_py_w_scalar_subfield(object_under_test): + subfield = create_field("REQUIRED", "INTEGER", name="age") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": 42}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"age": 42} + + +def test_record_to_py_w_scalar_subfield_geography(object_under_test): + subfield = create_field("REQUIRED", "GEOGRAPHY", name="geo") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": "POINT(1, 2)"}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"geo": "POINT(1, 2)"} + + +def test_record_to_py_w_repeated_subfield(object_under_test): + subfield = create_field("REPEATED", "STRING", name="color") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": [{"v": "red"}, {"v": "yellow"}, {"v": "blue"}]}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"color": ["red", "yellow", "blue"]} + + +def test_record_to_py_w_record_subfield(object_under_test): + full_name = create_field("REQUIRED", "STRING", name="full_name") + area_code = create_field("REQUIRED", "STRING", name="area_code") + local_number = create_field("REQUIRED", "STRING", name="local_number") + rank = create_field("REQUIRED", "INTEGER", name="rank") + 
phone = create_field( + "NULLABLE", "RECORD", name="phone", fields=[area_code, local_number, rank] + ) + person = create_field( + "REQUIRED", "RECORD", name="person", fields=[full_name, phone] + ) + value = { + "f": [ + {"v": "Phred Phlyntstone"}, + {"v": {"f": [{"v": "800"}, {"v": "555-1212"}, {"v": 1}]}}, + ] + } + expected = { + "full_name": "Phred Phlyntstone", + "phone": {"area_code": "800", "local_number": "555-1212", "rank": 1}, + } + coerced = object_under_test.record_to_py(value, person) + assert coerced == expected diff --git a/tests/unit/_helpers/test_data_frame_cell_data_parser.py b/tests/unit/_helpers/test_data_frame_cell_data_parser.py new file mode 100644 index 000000000..c3332dc89 --- /dev/null +++ b/tests/unit/_helpers/test_data_frame_cell_data_parser.py @@ -0,0 +1,71 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import google.cloud.bigquery.schema + + +def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs): + return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs) + + +@pytest.fixture +def mut(): + from google.cloud.bigquery import _helpers + + return _helpers + + +@pytest.fixture +def object_under_test(mut): + return mut.DATA_FRAME_CELL_DATA_PARSER + + +def test_json_to_py_doesnt_parse_json(object_under_test): + coerced = object_under_test.json_to_py('{"key":"value"}', create_field()) + assert coerced == '{"key":"value"}' + + +def test_json_to_py_repeated_doesnt_parse_json(object_under_test): + coerced = object_under_test.json_to_py('{"key":"value"}', create_field("REPEATED")) + assert coerced == '{"key":"value"}' + + +def test_record_to_py_doesnt_parse_json(object_under_test): + subfield = create_field(type_="JSON", name="json") + field = create_field(fields=[subfield]) + value = {"f": [{"v": '{"key":"value"}'}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"json": '{"key":"value"}'} + + +def test_record_to_py_doesnt_parse_repeated_json(object_under_test): + subfield = create_field("REPEATED", "JSON", name="json") + field = create_field("REQUIRED", fields=[subfield]) + value = { + "f": [ + { + "v": [ + {"v": '{"key":"value0"}'}, + {"v": '{"key":"value1"}'}, + {"v": '{"key":"value2"}'}, + ] + } + ] + } + coerced = object_under_test.record_to_py(value, field) + assert coerced == { + "json": ['{"key":"value0"}', '{"key":"value1"}', '{"key":"value2"}'] + } diff --git a/tests/unit/_helpers/test_from_json.py b/tests/unit/_helpers/test_from_json.py deleted file mode 100644 index 65b054f44..000000000 --- a/tests/unit/_helpers/test_from_json.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dateutil.relativedelta import relativedelta -import pytest - -from google.cloud.bigquery.schema import SchemaField - - -def create_field(mode="NULLABLE", type_="IGNORED"): - return SchemaField("test_field", type_, mode=mode) - - -@pytest.fixture -def mut(): - from google.cloud.bigquery import _helpers - - return _helpers - - -def test_interval_from_json_w_none_nullable(mut): - got = mut._interval_from_json(None, create_field()) - assert got is None - - -def test_interval_from_json_w_none_required(mut): - with pytest.raises(TypeError): - mut._interval_from_json(None, create_field(mode="REQUIRED")) - - -def test_interval_from_json_w_invalid_format(mut): - with pytest.raises(ValueError, match="NOT_AN_INTERVAL"): - mut._interval_from_json("NOT_AN_INTERVAL", create_field()) - - -@pytest.mark.parametrize( - ("value", "expected"), - ( - ("0-0 0 0:0:0", relativedelta()), - # SELECT INTERVAL X YEAR - ("-10000-0 0 0:0:0", relativedelta(years=-10000)), - ("-1-0 0 0:0:0", relativedelta(years=-1)), - ("1-0 0 0:0:0", relativedelta(years=1)), - ("10000-0 0 0:0:0", relativedelta(years=10000)), - # SELECT INTERVAL X MONTH - ("-0-11 0 0:0:0", relativedelta(months=-11)), - ("-0-1 0 0:0:0", relativedelta(months=-1)), - ("0-1 0 0:0:0", relativedelta(months=1)), - ("0-11 0 0:0:0", relativedelta(months=11)), - # SELECT INTERVAL X DAY - ("0-0 -3660000 0:0:0", relativedelta(days=-3660000)), - ("0-0 -1 0:0:0", relativedelta(days=-1)), - ("0-0 1 0:0:0", relativedelta(days=1)), - ("0-0 3660000 0:0:0", relativedelta(days=3660000)), - # SELECT INTERVAL X HOUR - ("0-0 0 
-87840000:0:0", relativedelta(hours=-87840000)), - ("0-0 0 -1:0:0", relativedelta(hours=-1)), - ("0-0 0 1:0:0", relativedelta(hours=1)), - ("0-0 0 87840000:0:0", relativedelta(hours=87840000)), - # SELECT INTERVAL X MINUTE - ("0-0 0 -0:59:0", relativedelta(minutes=-59)), - ("0-0 0 -0:1:0", relativedelta(minutes=-1)), - ("0-0 0 0:1:0", relativedelta(minutes=1)), - ("0-0 0 0:59:0", relativedelta(minutes=59)), - # SELECT INTERVAL X SECOND - ("0-0 0 -0:0:59", relativedelta(seconds=-59)), - ("0-0 0 -0:0:1", relativedelta(seconds=-1)), - ("0-0 0 0:0:1", relativedelta(seconds=1)), - ("0-0 0 0:0:59", relativedelta(seconds=59)), - # SELECT (INTERVAL -1 SECOND) / 1000000 - ("0-0 0 -0:0:0.000001", relativedelta(microseconds=-1)), - ("0-0 0 -0:0:59.999999", relativedelta(seconds=-59, microseconds=-999999)), - ("0-0 0 -0:0:59.999", relativedelta(seconds=-59, microseconds=-999000)), - ("0-0 0 0:0:59.999", relativedelta(seconds=59, microseconds=999000)), - ("0-0 0 0:0:59.999999", relativedelta(seconds=59, microseconds=999999)), - # Test with multiple digits in each section. - ( - "32-11 45 67:16:23.987654", - relativedelta( - years=32, - months=11, - days=45, - hours=67, - minutes=16, - seconds=23, - microseconds=987654, - ), - ), - ( - "-32-11 -45 -67:16:23.987654", - relativedelta( - years=-32, - months=-11, - days=-45, - hours=-67, - minutes=-16, - seconds=-23, - microseconds=-987654, - ), - ), - # Test with mixed +/- sections. - ( - "9999-9 -999999 9999999:59:59.999999", - relativedelta( - years=9999, - months=9, - days=-999999, - hours=9999999, - minutes=59, - seconds=59, - microseconds=999999, - ), - ), - # Test with fraction that is not microseconds. 
- ("0-0 0 0:0:42.", relativedelta(seconds=42)), - ("0-0 0 0:0:59.1", relativedelta(seconds=59, microseconds=100000)), - ("0-0 0 0:0:0.12", relativedelta(microseconds=120000)), - ("0-0 0 0:0:0.123", relativedelta(microseconds=123000)), - ("0-0 0 0:0:0.1234", relativedelta(microseconds=123400)), - # Fractional seconds can cause rounding problems if cast to float. See: - # https://github.com/googleapis/python-db-dtypes-pandas/issues/18 - ("0-0 0 0:0:59.876543", relativedelta(seconds=59, microseconds=876543)), - ( - "0-0 0 01:01:01.010101", - relativedelta(hours=1, minutes=1, seconds=1, microseconds=10101), - ), - ( - "0-0 0 09:09:09.090909", - relativedelta(hours=9, minutes=9, seconds=9, microseconds=90909), - ), - ( - "0-0 0 11:11:11.111111", - relativedelta(hours=11, minutes=11, seconds=11, microseconds=111111), - ), - ( - "0-0 0 19:16:23.987654", - relativedelta(hours=19, minutes=16, seconds=23, microseconds=987654), - ), - # Nanoseconds are not expected, but should not cause error. - ("0-0 0 0:0:00.123456789", relativedelta(microseconds=123456)), - ("0-0 0 0:0:59.87654321", relativedelta(seconds=59, microseconds=876543)), - ), -) -def test_w_string_values(mut, value, expected): - got = mut._interval_from_json(value, create_field()) - assert got == expected diff --git a/tests/unit/_helpers/test_scalar_query_param_parser.py b/tests/unit/_helpers/test_scalar_query_param_parser.py new file mode 100644 index 000000000..8e0d2a34e --- /dev/null +++ b/tests/unit/_helpers/test_scalar_query_param_parser.py @@ -0,0 +1,93 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + +import pytest + +import google.cloud.bigquery.schema + + +def create_field(mode="NULLABLE", type_="IGNORED"): + return google.cloud.bigquery.schema.SchemaField("test_field", type_, mode=mode) + + +@pytest.fixture +def mut(): + from google.cloud.bigquery import _helpers + + return _helpers + + +@pytest.fixture +def object_under_test(mut): + return mut.SCALAR_QUERY_PARAM_PARSER + + +def test_timestamp_to_py_w_none_nullable(object_under_test): + assert object_under_test.timestamp_to_py(None, create_field()) is None + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ( + "2016-12-20 15:58:27.339328+00:00", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20 15:58:27+00:00", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ( + "2016-12-20T15:58:27.339328+00:00", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20T15:58:27+00:00", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ( + "2016-12-20 15:58:27.339328Z", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20 15:58:27Z", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ( + "2016-12-20T15:58:27.339328Z", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20T15:58:27Z", + datetime.datetime(2016, 12, 20, 15, 58, 27, 
tzinfo=datetime.timezone.utc), + ), + ], +) +def test_timestamp_to_py_w_timestamp_valid(object_under_test, value, expected): + assert object_under_test.timestamp_to_py(value, create_field()) == expected + + +def test_timestamp_to_py_w_timestamp_invalid(object_under_test): + with pytest.raises(ValueError): + object_under_test.timestamp_to_py("definitely-not-a-timestamp", create_field()) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ebe2d2a7a..5070a199b 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -13,6 +13,7 @@ # limitations under the License. from unittest import mock +import threading import pytest @@ -24,6 +25,18 @@ def client(): yield make_client() +time_lock = threading.Lock() + + +@pytest.fixture +def global_time_lock(): + """Fixture to run tests serially that depend on the global time state, + such as tests of retry behavior. + """ + with time_lock: + yield + + @pytest.fixture def PROJECT(): yield "PROJECT" diff --git a/tests/unit/job/helpers.py b/tests/unit/job/helpers.py index 3642c7229..24ba2fa99 100644 --- a/tests/unit/job/helpers.py +++ b/tests/unit/job/helpers.py @@ -106,7 +106,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.ETAG = "ETAG" self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID) self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format( diff --git a/tests/unit/job/test_async_job_retry.py b/tests/unit/job/test_async_job_retry.py new file mode 100644 index 000000000..35041aa1b --- /dev/null +++ b/tests/unit/job/test_async_job_retry.py @@ -0,0 +1,139 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import google.api_core.retry +from google.api_core import exceptions + +from . import helpers +import google.cloud.bigquery.job + + +PROJECT = "test-project" +JOB_ID = "test-job-id" + + +def test_cancel_w_custom_retry(global_time_lock): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + api_path = "/projects/{}/jobs/{}/cancel".format(PROJECT, JOB_ID) + resource = { + "jobReference": { + "jobId": JOB_ID, + "projectId": PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + expected = resource.copy() + expected["statistics"] = {} + response = {"job": resource} + conn = helpers.make_connection( + ValueError, + response, + ) + client = helpers._make_client(project=PROJECT, connection=conn) + job = google.cloud.bigquery.job._AsyncJob( + google.cloud.bigquery.job._JobReference(JOB_ID, PROJECT, "EU"), client + ) + + retry = DEFAULT_RETRY.with_deadline(1).with_predicate( + lambda exc: isinstance(exc, ValueError) + ) + + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + result = job.cancel(retry=retry, timeout=7.5) + + final_attributes.assert_called() + + assert result is True + assert job._properties == expected + conn.api_request.assert_has_calls( + [ + mock.call( + method="POST", + path=api_path, + query_params={"location": "EU"}, + timeout=7.5, + ), + mock.call( + method="POST", + path=api_path, + query_params={"location": "EU"}, + timeout=7.5, + ), # was retried once + ], + ) + + +def test_result_w_retry_wo_state(global_time_lock): + 
from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT + + begun_job_resource = helpers._make_job_resource( + job_id=JOB_ID, project_id=PROJECT, location="EU", started=True + ) + done_job_resource = helpers._make_job_resource( + job_id=JOB_ID, + project_id=PROJECT, + location="EU", + started=True, + ended=True, + ) + conn = helpers.make_connection( + exceptions.NotFound("not normally retriable"), + begun_job_resource, + exceptions.NotFound("not normally retriable"), + done_job_resource, + ) + client = helpers._make_client(project=PROJECT, connection=conn) + job = google.cloud.bigquery.job._AsyncJob( + google.cloud.bigquery.job._JobReference(JOB_ID, PROJECT, "EU"), client + ) + custom_predicate = mock.Mock() + custom_predicate.return_value = True + custom_retry = google.api_core.retry.Retry( + predicate=custom_predicate, + initial=0.001, + maximum=0.001, + deadline=0.1, + ) + assert job.result(retry=custom_retry) is job + + begin_call = mock.call( + method="POST", + path=f"/projects/{PROJECT}/jobs", + data={ + "jobReference": { + "jobId": JOB_ID, + "projectId": PROJECT, + "location": "EU", + } + }, + timeout=None, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={ + "projection": "full", + "location": "EU", + }, + timeout=DEFAULT_GET_JOB_TIMEOUT, + ) + conn.api_request.assert_has_calls( + [begin_call, begin_call, reload_call, reload_call] + ) diff --git a/tests/unit/job/test_base.py b/tests/unit/job/test_base.py index a7337afd2..420904820 100644 --- a/tests/unit/job/test_base.py +++ b/tests/unit/job/test_base.py @@ -17,8 +17,6 @@ import unittest from unittest import mock -from google.api_core import exceptions -import google.api_core.retry from google.api_core.future import polling import pytest @@ -331,7 +329,7 @@ def _datetime_and_millis(): import datetime from google.cloud._helpers import _millis - now = datetime.datetime.utcnow().replace( + now = 
datetime.datetime.now(datetime.timezone.utc).replace( microsecond=123000, tzinfo=datetime.timezone.utc, # stats timestamps have ms precision ) @@ -443,6 +441,16 @@ def test_state(self): status["state"] = state self.assertEqual(job.state, state) + def test_reservation_id(self): + reservation_id = "RESERVATION-ID" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.reservation_id) + stats = job._properties["statistics"] = {} + self.assertIsNone(job.reservation_id) + stats["reservation_id"] = reservation_id + self.assertEqual(job.reservation_id, reservation_id) + def _set_properties_job(self): client = _make_client(project=self.PROJECT) job = self._make_one(self.JOB_ID, client) @@ -872,50 +880,6 @@ def test_cancel_explicit(self): ) self.assertEqual(job._properties, expected) - def test_cancel_w_custom_retry(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - api_path = "/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID) - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - expected = resource.copy() - expected["statistics"] = {} - response = {"job": resource} - job = self._set_properties_job() - - api_request_patcher = mock.patch.object( - job._client._connection, "api_request", side_effect=[ValueError, response] - ) - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, ValueError) - ) - - with api_request_patcher as fake_api_request: - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - result = job.cancel(retry=retry, timeout=7.5) - - final_attributes.assert_called() - - self.assertTrue(result) - self.assertEqual(job._properties, expected) - self.assertEqual( - fake_api_request.call_args_list, - [ - mock.call(method="POST", path=api_path, query_params={}, timeout=7.5), - mock.call( 
- method="POST", path=api_path, query_params={}, timeout=7.5 - ), # was retried once - ], - ) - def test__set_future_result_wo_done(self): client = _make_client(project=self.PROJECT) job = self._make_one(self.JOB_ID, client) @@ -1059,64 +1023,6 @@ def test_result_default_wo_state(self): ) conn.api_request.assert_has_calls([begin_call, begin_call, reload_call]) - def test_result_w_retry_wo_state(self): - from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT - - begun_job_resource = _make_job_resource( - job_id=self.JOB_ID, project_id=self.PROJECT, location="EU", started=True - ) - done_job_resource = _make_job_resource( - job_id=self.JOB_ID, - project_id=self.PROJECT, - location="EU", - started=True, - ended=True, - ) - conn = make_connection( - exceptions.NotFound("not normally retriable"), - begun_job_resource, - exceptions.NotFound("not normally retriable"), - done_job_resource, - ) - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one( - self._job_reference(self.JOB_ID, self.PROJECT, "EU"), client - ) - custom_predicate = mock.Mock() - custom_predicate.return_value = True - custom_retry = google.api_core.retry.Retry( - predicate=custom_predicate, - initial=0.001, - maximum=0.001, - deadline=0.1, - ) - self.assertIs(job.result(retry=custom_retry), job) - - begin_call = mock.call( - method="POST", - path=f"/projects/{self.PROJECT}/jobs", - data={ - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": "EU", - } - }, - timeout=None, - ) - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={ - "projection": "full", - "location": "EU", - }, - timeout=DEFAULT_GET_JOB_TIMEOUT, - ) - conn.api_request.assert_has_calls( - [begin_call, begin_call, reload_call, reload_call] - ) - def test_result_explicit_w_state(self): conn = make_connection() client = _make_client(project=self.PROJECT, connection=conn) @@ -1188,15 +1094,18 @@ def 
test_fill_query_job_config_from_default(self): job_config = QueryJobConfig() job_config.dry_run = True job_config.maximum_bytes_billed = 1000 + job_config.reservation = "reservation_1" default_job_config = QueryJobConfig() default_job_config.use_query_cache = True default_job_config.maximum_bytes_billed = 2000 + default_job_config.reservation = "reservation_2" final_job_config = job_config._fill_from_default(default_job_config) self.assertTrue(final_job_config.dry_run) self.assertTrue(final_job_config.use_query_cache) self.assertEqual(final_job_config.maximum_bytes_billed, 1000) + self.assertEqual(final_job_config.reservation, "reservation_1") def test_fill_load_job_from_default(self): from google.cloud.bigquery import LoadJobConfig @@ -1204,15 +1113,18 @@ def test_fill_load_job_from_default(self): job_config = LoadJobConfig() job_config.create_session = True job_config.encoding = "UTF-8" + job_config.reservation = "reservation_1" default_job_config = LoadJobConfig() default_job_config.ignore_unknown_values = True default_job_config.encoding = "ISO-8859-1" + default_job_config.reservation = "reservation_2" final_job_config = job_config._fill_from_default(default_job_config) self.assertTrue(final_job_config.create_session) self.assertTrue(final_job_config.ignore_unknown_values) self.assertEqual(final_job_config.encoding, "UTF-8") + self.assertEqual(final_job_config.reservation, "reservation_1") def test_fill_from_default_conflict(self): from google.cloud.bigquery import QueryJobConfig @@ -1232,10 +1144,12 @@ def test_fill_from_empty_default_conflict(self): job_config = QueryJobConfig() job_config.dry_run = True job_config.maximum_bytes_billed = 1000 + job_config.reservation = "reservation_1" final_job_config = job_config._fill_from_default(default_job_config=None) self.assertTrue(final_job_config.dry_run) self.assertEqual(final_job_config.maximum_bytes_billed, 1000) + self.assertEqual(final_job_config.reservation, "reservation_1") 
@mock.patch("google.cloud.bigquery._helpers._get_sub_prop") def test__get_sub_prop_wo_default(self, _get_sub_prop): @@ -1320,3 +1234,86 @@ def test_job_timeout_ms(self): # Confirm that integers get converted to strings. job_config.job_timeout_ms = 5000 assert job_config.job_timeout_ms == "5000" # int is converted to string + + def test_job_timeout_is_none_when_set_none(self): + job_config = self._make_one() + job_config.job_timeout_ms = None + # Confirm value is None and not literal string 'None' + assert job_config.job_timeout_ms is None + + def test_job_timeout_properties(self): + # Make sure any value stored in properties is erased + # when setting job_timeout to None. + job_config = self._make_one() + job_config.job_timeout_ms = 4200 + assert job_config.job_timeout_ms == "4200" + assert job_config._properties.get("jobTimeoutMs") == "4200" + + job_config.job_timeout_ms = None + assert job_config.job_timeout_ms is None + assert "jobTimeoutMs" not in job_config._properties + + def test_reservation_miss(self): + job_config = self._make_one() + self.assertEqual(job_config.reservation, None) + + def test_reservation_hit(self): + job_config = self._make_one() + job_config._properties["reservation"] = "foo" + self.assertEqual(job_config.reservation, "foo") + + def test_reservation_update_in_place(self): + job_config = self._make_one() + job_config.reservation = "bar" # update in place + self.assertEqual(job_config.reservation, "bar") + + def test_reservation_setter_invalid(self): + job_config = self._make_one() + with self.assertRaises(ValueError): + job_config.reservation = object() + + def test_reservation_setter(self): + job_config = self._make_one() + job_config.reservation = "foo" + self.assertEqual(job_config._properties["reservation"], "foo") + + def test_max_slots_miss(self): + job_config = self._make_one() + self.assertEqual(job_config.max_slots, None) + + def test_max_slots_set_and_clear(self): + job_config = self._make_one() + job_config.max_slots = 14 + 
self.assertEqual(job_config.max_slots, 14) + job_config.max_slots = None + self.assertEqual(job_config.max_slots, None) + + def test_max_slots_hit_str(self): + job_config = self._make_one() + job_config._properties["maxSlots"] = "4" + self.assertEqual(job_config.max_slots, 4) + + def test_max_slots_hit_int(self): + job_config = self._make_one() + job_config._properties["maxSlots"] = int(3) + self.assertEqual(job_config.max_slots, 3) + + def test_max_slots_hit_invalid(self): + job_config = self._make_one() + job_config._properties["maxSlots"] = object() + self.assertEqual(job_config.max_slots, None) + + def test_max_slots_update_in_place(self): + job_config = self._make_one() + job_config.max_slots = 45 # update in place + self.assertEqual(job_config.max_slots, 45) + + def test_max_slots_setter_invalid(self): + job_config = self._make_one() + with self.assertRaises(ValueError): + job_config.max_slots = "foo" + + def test_max_slots_setter(self): + job_config = self._make_one() + job_config.max_slots = 123 + self.assertEqual(job_config._properties["maxSlots"], "123") diff --git a/tests/unit/job/test_copy.py b/tests/unit/job/test_copy.py index 4b0945310..8e2845316 100644 --- a/tests/unit/job/test_copy.py +++ b/tests/unit/job/test_copy.py @@ -147,7 +147,6 @@ def _verifyResourceProperties(self, job, resource): self._verifyReadonlyResourceProperties(job, resource) config = resource.get("configuration", {}).get("copy") - table_ref = config["destinationTable"] self.assertEqual(job.destination.project, table_ref["projectId"]) self.assertEqual(job.destination.dataset_id, table_ref["datasetId"]) diff --git a/tests/unit/job/test_load.py b/tests/unit/job/test_load.py index 0fb044696..b551d52dd 100644 --- a/tests/unit/job/test_load.py +++ b/tests/unit/job/test_load.py @@ -19,6 +19,7 @@ from .helpers import _Base from .helpers import _make_client +from google.cloud.bigquery.enums import SourceColumnMatch class TestLoadJob(_Base): @@ -37,11 +38,26 @@ def _setUpConstants(self): 
self.OUTPUT_BYTES = 23456 self.OUTPUT_ROWS = 345 self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference" + self.SOURCE_COLUMN_MATCH = "NAME" + self.DATE_FORMAT = "%Y-%m-%d" + self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S" + self.TIME_ZONE = "UTC" + self.TIME_FORMAT = "%H:%M:%S" + self.TIMESTAMP_FORMAT = "YYYY-MM-DD HH:MM:SS.SSSSSSZ" + self.NULL_MARKERS = ["", "NA"] def _make_resource(self, started=False, ended=False): resource = super(TestLoadJob, self)._make_resource(started, ended) config = resource["configuration"]["load"] config["sourceUris"] = [self.SOURCE1] + config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH + config["dateFormat"] = self.DATE_FORMAT + config["datetimeFormat"] = self.DATETIME_FORMAT + config["timeZone"] = self.TIME_ZONE + config["timeFormat"] = self.TIME_FORMAT + config["timestampFormat"] = self.TIMESTAMP_FORMAT + config["nullMarkers"] = self.NULL_MARKERS + config["destinationTable"] = { "projectId": self.PROJECT, "datasetId": self.DS_ID, @@ -129,6 +145,10 @@ def _verifyResourceProperties(self, job, resource): self.assertEqual(job.null_marker, config["nullMarker"]) else: self.assertIsNone(job.null_marker) + if "nullMarkers" in config: + self.assertEqual(job.null_markers, config["nullMarkers"]) + else: + self.assertIsNone(job.null_markers) if "quote" in config: self.assertEqual(job.quote_character, config["quote"]) else: @@ -143,7 +163,6 @@ def _verifyResourceProperties(self, job, resource): ) else: self.assertIsNone(job.reference_file_schema_uri) - if "destinationEncryptionConfiguration" in config: self.assertIsNotNone(job.destination_encryption_configuration) self.assertEqual( @@ -152,6 +171,35 @@ def _verifyResourceProperties(self, job, resource): ) else: self.assertIsNone(job.destination_encryption_configuration) + if "dateFormat" in config: + self.assertEqual(job.date_format, config["dateFormat"]) + else: + self.assertIsNone(job.date_format) + if "datetimeFormat" in config: + self.assertEqual(job.datetime_format, 
config["datetimeFormat"]) + else: + self.assertIsNone(job.datetime_format) + if "timeZone" in config: + self.assertEqual(job.time_zone, config["timeZone"]) + else: + self.assertIsNone(job.time_zone) + if "timeFormat" in config: + self.assertEqual(job.time_format, config["timeFormat"]) + else: + self.assertIsNone(job.time_format) + if "timestampFormat" in config: + self.assertEqual(job.timestamp_format, config["timestampFormat"]) + else: + self.assertIsNone(job.timestamp_format) + + if "sourceColumnMatch" in config: + # job.source_column_match will be an Enum, config[...] is a string + self.assertEqual( + job.source_column_match.value, + config["sourceColumnMatch"], + ) + else: + self.assertIsNone(job.source_column_match) def test_ctor(self): client = _make_client(project=self.PROJECT) @@ -181,6 +229,7 @@ def test_ctor(self): self.assertIsNone(job.ignore_unknown_values) self.assertIsNone(job.max_bad_records) self.assertIsNone(job.null_marker) + self.assertIsNone(job.null_markers) self.assertIsNone(job.quote_character) self.assertIsNone(job.skip_leading_rows) self.assertIsNone(job.source_format) @@ -194,6 +243,12 @@ def test_ctor(self): self.assertIsNone(job.clustering_fields) self.assertIsNone(job.schema_update_options) self.assertIsNone(job.reference_file_schema_uri) + self.assertIsNone(job.source_column_match) + self.assertIsNone(job.date_format) + self.assertIsNone(job.datetime_format) + self.assertIsNone(job.time_zone) + self.assertIsNone(job.time_format) + self.assertIsNone(job.timestamp_format) def test_ctor_w_config(self): from google.cloud.bigquery.schema import SchemaField @@ -272,7 +327,7 @@ def test_schema_setter_invalid_field(self): config = LoadJobConfig() full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): config.schema = [full_name, object()] def test_schema_setter(self): @@ -431,6 +486,24 @@ def test_from_api_repr_w_properties(self): self.assertIs(job._client, 
client) self._verifyResourceProperties(job, RESOURCE) + def test_to_api_repr(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = self._make_resource(ended=False) + + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client) + api_repr = job.to_api_repr() + + # as per the documentation in load.py -> LoadJob.to_api_repr(), + # the return value from to_api_repr should not include statistics + expected = { + "jobReference": RESOURCE["jobReference"], + "configuration": RESOURCE["configuration"], + } + + self.assertEqual(api_repr, expected) + def test_begin_w_already_running(self): conn = make_connection() client = _make_client(project=self.PROJECT, connection=conn) @@ -571,7 +644,14 @@ def test_begin_w_alternate_client(self): ] }, "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION], + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, + "dateFormat": self.DATE_FORMAT, + "datetimeFormat": self.DATETIME_FORMAT, + "timeZone": self.TIME_ZONE, + "timeFormat": self.TIME_FORMAT, + "timestampFormat": self.TIMESTAMP_FORMAT, } + RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION conn1 = make_connection() client1 = _make_client(project=self.PROJECT, connection=conn1) @@ -599,6 +679,13 @@ def test_begin_w_alternate_client(self): config.write_disposition = WriteDisposition.WRITE_TRUNCATE config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION] config.reference_file_schema_uri = "gs://path/to/reference" + config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH) + config.date_format = self.DATE_FORMAT + config.datetime_format = self.DATETIME_FORMAT + config.time_zone = self.TIME_ZONE + config.time_format = self.TIME_FORMAT + config.timestamp_format = self.TIMESTAMP_FORMAT + with mock.patch( "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" ) as final_attributes: diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py index 
becf3e959..2e046bfbf 100644 --- a/tests/unit/job/test_load_config.py +++ b/tests/unit/job/test_load_config.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import warnings import pytest @@ -468,6 +469,22 @@ def test_null_marker_setter(self): config.null_marker = null_marker self.assertEqual(config._properties["load"]["nullMarker"], null_marker) + def test_null_markers_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.null_markers) + + def test_null_markers_hit(self): + null_markers = ["", "NA"] + config = self._get_target_class()() + config._properties["load"]["nullMarkers"] = null_markers + self.assertEqual(config.null_markers, null_markers) + + def test_null_markers_setter(self): + null_markers = ["", "NA"] + config = self._get_target_class()() + config.null_markers = null_markers + self.assertEqual(config._properties["load"]["nullMarkers"], null_markers) + def test_preserve_ascii_control_characters_missing(self): config = self._get_target_class()() self.assertIsNone(config.preserve_ascii_control_characters) @@ -571,16 +588,34 @@ def test_schema_setter_valid_mappings_list(self): config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} ) - def test_schema_setter_invalid_mappings_list(self): + def test_schema_setter_allows_unknown_properties(self): config = self._get_target_class()() schema = [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "age", "typeoo": "INTEGER", "mode": "REQUIRED"}, + { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + "someNewProperty": "test-value", + }, + { + "name": "age", + # Note: This type should be included, too. Avoid client-side + # validation, as it could prevent backwards-compatible + # evolution of the server-side behavior. 
+ "typo": "INTEGER", + "mode": "REQUIRED", + "anotherNewProperty": "another-test", + }, ] - with self.assertRaises(Exception): - config.schema = schema + # Make sure the setter doesn't mutate schema. + expected_schema = copy.deepcopy(schema) + + config.schema = schema + + # _properties should include all fields, including unknown ones. + assert config._properties["load"]["schema"]["fields"] == expected_schema def test_schema_setter_unsetting_schema(self): from google.cloud.bigquery.schema import SchemaField @@ -809,6 +844,120 @@ def test_write_disposition_setter(self): config._properties["load"]["writeDisposition"], write_disposition ) + def test_source_column_match_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.source_column_match) + + def test_source_column_match_hit(self): + from google.cloud.bigquery.enums import SourceColumnMatch + + option_enum = SourceColumnMatch.NAME + config = self._get_target_class()() + # Assume API stores the string value of the enum + config._properties["load"]["sourceColumnMatch"] = option_enum.value + self.assertEqual(config.source_column_match, option_enum) + + def test_source_column_match_setter(self): + from google.cloud.bigquery.enums import SourceColumnMatch + + option_enum = SourceColumnMatch.POSITION + config = self._get_target_class()() + config.source_column_match = option_enum + # Assert that the string value of the enum is stored + self.assertEqual( + config._properties["load"]["sourceColumnMatch"], option_enum.value + ) + option_str = "NAME" + config.source_column_match = option_str + self.assertEqual(config._properties["load"]["sourceColumnMatch"], option_str) + + def test_source_column_match_setter_invalid_type(self): + config = self._get_target_class()() + with self.assertRaises(TypeError): + config.source_column_match = 3.14 + + def test_date_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.date_format) + + def test_date_format_hit(self): + 
date_format = "%Y-%m-%d" + config = self._get_target_class()() + config._properties["load"]["dateFormat"] = date_format + self.assertEqual(config.date_format, date_format) + + def test_date_format_setter(self): + date_format = "YYYY/MM/DD" + config = self._get_target_class()() + config.date_format = date_format + self.assertEqual(config._properties["load"]["dateFormat"], date_format) + + def test_datetime_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.datetime_format) + + def test_datetime_format_hit(self): + datetime_format = "%Y-%m-%dT%H:%M:%S" + config = self._get_target_class()() + config._properties["load"]["datetimeFormat"] = datetime_format + self.assertEqual(config.datetime_format, datetime_format) + + def test_datetime_format_setter(self): + datetime_format = "YYYY/MM/DD HH24:MI:SS" + config = self._get_target_class()() + config.datetime_format = datetime_format + self.assertEqual(config._properties["load"]["datetimeFormat"], datetime_format) + + def test_time_zone_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.time_zone) + + def test_time_zone_hit(self): + time_zone = "UTC" + config = self._get_target_class()() + config._properties["load"]["timeZone"] = time_zone + self.assertEqual(config.time_zone, time_zone) + + def test_time_zone_setter(self): + time_zone = "America/New_York" + config = self._get_target_class()() + config.time_zone = time_zone + self.assertEqual(config._properties["load"]["timeZone"], time_zone) + + def test_time_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.time_format) + + def test_time_format_hit(self): + time_format = "%H:%M:%S" + config = self._get_target_class()() + config._properties["load"]["timeFormat"] = time_format + self.assertEqual(config.time_format, time_format) + + def test_time_format_setter(self): + time_format = "HH24:MI:SS" + config = self._get_target_class()() + config.time_format = time_format + 
self.assertEqual(config._properties["load"]["timeFormat"], time_format) + + def test_timestamp_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.timestamp_format) + + def test_timestamp_format_hit(self): + timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ" + config = self._get_target_class()() + config._properties["load"]["timestampFormat"] = timestamp_format + self.assertEqual(config.timestamp_format, timestamp_format) + + def test_timestamp_format_setter(self): + timestamp_format = "YYYY/MM/DD HH24:MI:SS.FF6 TZR" + config = self._get_target_class()() + config.timestamp_format = timestamp_format + self.assertEqual( + config._properties["load"]["timestampFormat"], timestamp_format + ) + def test_parquet_options_missing(self): config = self._get_target_class()() self.assertIsNone(config.parquet_options) @@ -882,3 +1031,150 @@ def test_column_name_character_map_none(self): config._properties["load"]["columnNameCharacterMap"], ColumnNameCharacterMap.COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED, ) + + RESOURCE = { + "load": { + "allowJaggedRows": True, + "createDisposition": "CREATE_NEVER", + "encoding": "UTF-8", + "fieldDelimiter": ",", + "ignoreUnknownValues": True, + "maxBadRecords": 10, + "nullMarker": "\\N", + "quote": '"', + "schema": { + "fields": [ + {"name": "name", "type": "STRING", "mode": "NULLABLE"}, + {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, + ] + }, + "skipLeadingRows": "1", + "sourceFormat": "CSV", + "timePartitioning": { + "type": "DAY", + "field": "transaction_date", + }, + "useAvroLogicalTypes": True, + "writeDisposition": "WRITE_TRUNCATE", + "dateFormat": "%Y-%m-%d", + "timeZone": "America/New_York", + "parquetOptions": {"enableListInference": True}, + "columnNameCharacterMap": "V2", + "someNewField": "some-value", + "timestampTargetPrecision": [6, 12], + } + } + + def test_timestamp_target_precision_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.timestamp_target_precision) + + 
def test_timestamp_target_precision_hit(self): + timestamp_target_precision = [6, 12] + config = self._get_target_class()() + config._properties["load"][ + "timestampTargetPrecision" + ] = timestamp_target_precision + self.assertEqual(config.timestamp_target_precision, timestamp_target_precision) + + def test_timestamp_target_precision_setter(self): + timestamp_target_precision = [6, 12] + config = self._get_target_class()() + config.timestamp_target_precision = timestamp_target_precision + self.assertEqual( + config._properties["load"]["timestampTargetPrecision"], + timestamp_target_precision, + ) + + def test_timestamp_target_precision_setter_w_none(self): + timestamp_target_precision = [6, 12] + config = self._get_target_class()() + config._properties["load"][ + "timestampTargetPrecision" + ] = timestamp_target_precision + config.timestamp_target_precision = None + self.assertFalse("timestampTargetPrecision" in config._properties["load"]) + + def test_from_api_repr(self): + from google.cloud.bigquery.job import ( + CreateDisposition, + LoadJobConfig, + SourceFormat, + WriteDisposition, + ) + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType + + from google.cloud.bigquery.job.load import ColumnNameCharacterMap + + config = LoadJobConfig.from_api_repr(self.RESOURCE) + + self.assertTrue(config.allow_jagged_rows) + self.assertEqual(config.create_disposition, CreateDisposition.CREATE_NEVER) + self.assertEqual(config.encoding, "UTF-8") + self.assertEqual(config.field_delimiter, ",") + self.assertTrue(config.ignore_unknown_values) + self.assertEqual(config.max_bad_records, 10) + self.assertEqual(config.null_marker, "\\N") + self.assertEqual(config.quote_character, '"') + self.assertEqual( + config.schema, + [SchemaField("name", "STRING"), SchemaField("age", "INTEGER")], + ) + self.assertEqual(config.skip_leading_rows, 1) + self.assertEqual(config.source_format, SourceFormat.CSV) + 
self.assertEqual( + config.time_partitioning, + TimePartitioning(type_=TimePartitioningType.DAY, field="transaction_date"), + ) + self.assertTrue(config.use_avro_logical_types) + self.assertEqual(config.write_disposition, WriteDisposition.WRITE_TRUNCATE) + self.assertEqual(config.date_format, "%Y-%m-%d") + self.assertEqual(config.time_zone, "America/New_York") + self.assertTrue(config.parquet_options.enable_list_inference) + self.assertEqual(config.column_name_character_map, ColumnNameCharacterMap.V2) + self.assertEqual(config._properties["load"]["someNewField"], "some-value") + self.assertEqual(config.timestamp_target_precision, [6, 12]) + + def test_to_api_repr(self): + from google.cloud.bigquery.job import ( + CreateDisposition, + LoadJobConfig, + SourceFormat, + WriteDisposition, + ) + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType + from google.cloud.bigquery.format_options import ParquetOptions + from google.cloud.bigquery.job.load import ColumnNameCharacterMap + + config = LoadJobConfig() + config.allow_jagged_rows = True + config.create_disposition = CreateDisposition.CREATE_NEVER + config.encoding = "UTF-8" + config.field_delimiter = "," + config.ignore_unknown_values = True + config.max_bad_records = 10 + config.null_marker = r"\N" + config.quote_character = '"' + config.schema = [SchemaField("name", "STRING"), SchemaField("age", "INTEGER")] + config.skip_leading_rows = 1 + config.source_format = SourceFormat.CSV + config.time_partitioning = TimePartitioning( + type_=TimePartitioningType.DAY, field="transaction_date" + ) + config.use_avro_logical_types = True + config.write_disposition = WriteDisposition.WRITE_TRUNCATE + config.date_format = "%Y-%m-%d" + config.time_zone = "America/New_York" + parquet_options = ParquetOptions() + parquet_options.enable_list_inference = True + config.parquet_options = parquet_options + config.column_name_character_map = 
ColumnNameCharacterMap.V2 + config._properties["load"]["someNewField"] = "some-value" + config.timestamp_target_precision = [6, 12] + + api_repr = config.to_api_repr() + + expected = self.RESOURCE + self.assertEqual(api_repr, expected) diff --git a/tests/unit/job/test_query.py b/tests/unit/job/test_query.py index 4bbd31c73..4a6771c46 100644 --- a/tests/unit/job/test_query.py +++ b/tests/unit/job/test_query.py @@ -20,15 +20,11 @@ import types from unittest import mock -import freezegun -from google.api_core import exceptions -import google.api_core.retry import requests from google.cloud.bigquery.client import _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS import google.cloud.bigquery._job_helpers import google.cloud.bigquery.query -import google.cloud.bigquery.retry from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT from google.cloud.bigquery.table import _EmptyRowIterator @@ -842,6 +838,23 @@ def test_search_stats(self): assert isinstance(job.search_stats, SearchStats) assert job.search_stats.mode == "INDEX_USAGE_MODE_UNSPECIFIED" + def test_incremental_result_stats(self): + from google.cloud.bigquery.job.query import IncrementalResultStats + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + assert job.incremental_result_stats is None + + statistics = job._properties["statistics"] = {} + assert job.incremental_result_stats is None + + query_stats = statistics["query"] = {} + assert job.incremental_result_stats is None + + query_stats["incrementalResultStats"] = {"disabledReason": "BAZ"} + assert isinstance(job.incremental_result_stats, IncrementalResultStats) + assert job.incremental_result_stats.disabled_reason == "BAZ" + def test_reload_query_results_uses_transport_timeout(self): conn = make_connection({}) client = _make_client(self.PROJECT, connection=conn) @@ -887,6 +900,11 @@ def test_result_reloads_job_state_until_done(self): } job_resource = self._make_resource(started=True, location="EU") 
job_resource_done = self._make_resource(started=True, ended=True, location="EU") + job_resource_done["statistics"]["query"]["totalBytesProcessed"] = str(1234) + job_resource_done["statistics"]["query"]["totalSlotMs"] = str(5678) + job_resource_done["statistics"]["creationTime"] = str(11) + job_resource_done["statistics"]["startTime"] = str(22) + job_resource_done["statistics"]["endTime"] = str(33) job_resource_done["configuration"]["query"]["destinationTable"] = { "projectId": "dest-project", "datasetId": "dest_dataset", @@ -966,6 +984,12 @@ def test_result_reloads_job_state_until_done(self): # Test that the total_rows property has changed during iteration, based # on the response from tabledata.list. self.assertEqual(result.total_rows, 1) + self.assertEqual(result.query, job.query) + self.assertEqual(result.total_bytes_processed, 1234) + self.assertEqual(result.slot_millis, 5678) + self.assertEqual(result.created.timestamp() * 1000, 11) + self.assertEqual(result.started.timestamp() * 1000, 22) + self.assertEqual(result.ended.timestamp() * 1000, 33) query_results_path = f"/projects/{self.PROJECT}/queries/{self.JOB_ID}" query_results_call = mock.call( @@ -1324,102 +1348,6 @@ def test_result_with_max_results(self): [jobs_get_call, query_page_waiting_call, query_page_2_call] ) - def test_result_w_custom_retry(self): - from google.cloud.bigquery.table import RowIterator - - query_resource = { - "jobComplete": False, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - query_resource_done = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - "totalRows": "2", - } - job_resource = self._make_resource(started=True, location="asia-northeast1") - job_resource_done = self._make_resource( - started=True, ended=True, location="asia-northeast1" - ) - job_resource_done["configuration"]["query"]["destinationTable"] = { - "projectId": "dest-project", - 
"datasetId": "dest_dataset", - "tableId": "dest_table", - } - - connection = make_connection( - # Also, for each API request, raise an exception that we know can - # be retried. Because of this, for each iteration we do: - # jobs.get (x2) & jobs.getQueryResults (x2) - exceptions.NotFound("not normally retriable"), - job_resource, - exceptions.NotFound("not normally retriable"), - query_resource, - # Query still not done, repeat both. - exceptions.NotFound("not normally retriable"), - job_resource, - exceptions.NotFound("not normally retriable"), - query_resource, - exceptions.NotFound("not normally retriable"), - # Query still not done, repeat both. - job_resource_done, - exceptions.NotFound("not normally retriable"), - query_resource_done, - # Query finished! - ) - client = _make_client(self.PROJECT, connection=connection) - job = self._get_target_class().from_api_repr(job_resource, client) - - custom_predicate = mock.Mock() - custom_predicate.return_value = True - custom_retry = google.api_core.retry.Retry( - initial=0.001, - maximum=0.001, - multiplier=1.0, - deadline=0.1, - predicate=custom_predicate, - ) - - self.assertIsInstance(job.result(retry=custom_retry), RowIterator) - query_results_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", - query_params={"maxResults": 0, "location": "asia-northeast1"}, - # TODO(tswast): Why do we end up setting timeout to - # google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT in - # some cases but not others? - timeout=mock.ANY, - ) - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={"projection": "full", "location": "asia-northeast1"}, - timeout=DEFAULT_GET_JOB_TIMEOUT, - ) - - connection.api_request.assert_has_calls( - [ - # See make_connection() call above for explanation of the - # expected API calls. - # - # Query not done. 
- reload_call, - reload_call, - query_results_call, - query_results_call, - # Query still not done. - reload_call, - reload_call, - query_results_call, - query_results_call, - # Query done! - reload_call, - reload_call, - query_results_call, - query_results_call, - ] - ) - def test_result_w_empty_schema(self): from google.cloud.bigquery.table import _EmptyRowIterator @@ -1444,102 +1372,6 @@ def test_result_w_empty_schema(self): self.assertEqual(result.location, "asia-northeast1") self.assertEqual(result.query_id, "xyz-abc") - def test_result_w_timeout_doesnt_raise(self): - import google.cloud.bigquery.client - - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = make_connection(begun_resource, query_resource, done_resource) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - job._properties["jobReference"]["location"] = "US" - job._properties["status"] = {"state": "RUNNING"} - - with freezegun.freeze_time("1970-01-01 00:00:00", tick=False): - job.result( - # Test that fractional seconds are supported, but use a timeout - # that is representable as a floating point without rounding - # errors since it can be represented exactly in base 2. In this - # case 1.125 is 9 / 8, which is a fraction with a power of 2 in - # the denominator. 
- timeout=1.125, - ) - - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={"projection": "full", "location": "US"}, - timeout=1.125, - ) - get_query_results_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", - query_params={ - "maxResults": 0, - "location": "US", - }, - timeout=google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT, - ) - connection.api_request.assert_has_calls( - [ - reload_call, - get_query_results_call, - reload_call, - ] - ) - - def test_result_w_timeout_raises_concurrent_futures_timeout(self): - import google.cloud.bigquery.client - - begun_resource = self._make_resource() - begun_resource["jobReference"]["location"] = "US" - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = make_connection(begun_resource, query_resource, done_resource) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - job._properties["jobReference"]["location"] = "US" - job._properties["status"] = {"state": "RUNNING"} - - with freezegun.freeze_time( - "1970-01-01 00:00:00", auto_tick_seconds=1.0 - ), self.assertRaises(concurrent.futures.TimeoutError): - job.result(timeout=1.125) - - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={"projection": "full", "location": "US"}, - timeout=1.125, - ) - get_query_results_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", - query_params={ - "maxResults": 0, - "location": "US", - }, - timeout=google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT, - ) - connection.api_request.assert_has_calls( - [ - reload_call, - 
get_query_results_call, - # Timeout before we can reload with the final job state. - ] - ) - def test_result_w_page_size(self): # Arrange query_results_resource = { @@ -1679,6 +1511,78 @@ def test_result_with_start_index(self): tabledata_list_request[1]["query_params"]["maxResults"], page_size ) + def test_result_with_start_index_multi_page(self): + # When there are multiple pages of response and the user has set + # start_index, we should supply start_index to the server in the first + # request. However, in the subsequent requests, we will pass only + # page_token but not start_index, because the server only allows one + # of them. + from google.cloud.bigquery.table import RowIterator + + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "7", + } + + # Although the result has 7 rows, the response only returns 6, because + # start_index is 1. + tabledata_resource_1 = { + "totalRows": "7", + "pageToken": "page_token_1", + "rows": [ + {"f": [{"v": "abc"}]}, + {"f": [{"v": "def"}]}, + {"f": [{"v": "ghi"}]}, + ], + } + tabledata_resource_2 = { + "totalRows": "7", + "pageToken": None, + "rows": [ + {"f": [{"v": "jkl"}]}, + {"f": [{"v": "mno"}]}, + {"f": [{"v": "pqe"}]}, + ], + } + + connection = make_connection( + query_resource, tabledata_resource_1, tabledata_resource_2 + ) + client = _make_client(self.PROJECT, connection=connection) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + + start_index = 1 + page_size = 3 + + result = job.result(page_size=page_size, start_index=start_index) + + self.assertIsInstance(result, RowIterator) + self.assertEqual(result.total_rows, 7) + + rows = list(result) + + self.assertEqual(len(rows), 6) + self.assertEqual(len(connection.api_request.call_args_list), 3) + + # First call has both startIndex and maxResults. 
+ tabledata_list_request_1 = connection.api_request.call_args_list[1] + self.assertEqual( + tabledata_list_request_1[1]["query_params"]["startIndex"], start_index + ) + self.assertEqual( + tabledata_list_request_1[1]["query_params"]["maxResults"], page_size + ) + + # Second call only has maxResults. + tabledata_list_request_2 = connection.api_request.call_args_list[2] + self.assertFalse("startIndex" in tabledata_list_request_2[1]["query_params"]) + self.assertEqual( + tabledata_list_request_2[1]["query_params"]["maxResults"], page_size + ) + def test_result_error(self): from google.cloud import exceptions diff --git a/tests/unit/job/test_query_config.py b/tests/unit/job/test_query_config.py index 7818236f4..a63a14b73 100644 --- a/tests/unit/job/test_query_config.py +++ b/tests/unit/job/test_query_config.py @@ -167,6 +167,16 @@ def test_connection_properties(self): self.assertEqual(config.connection_properties[1].key, "time_zone") self.assertEqual(config.connection_properties[1].value, "America/Chicago") + def test_incremental_results(self): + config = self._get_target_class()() + config.write_incremental_results = True + self.assertEqual(config.write_incremental_results, True) + + def test_max_slots(self): + config = self._get_target_class()() + config.max_slots = 99 + self.assertEqual(config.max_slots, 99) + def test_create_session(self): config = self._get_target_class()() self.assertIsNone(config.create_session) diff --git a/tests/unit/job/test_query_job_retry.py b/tests/unit/job/test_query_job_retry.py new file mode 100644 index 000000000..c8355b688 --- /dev/null +++ b/tests/unit/job/test_query_job_retry.py @@ -0,0 +1,229 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import mock + +import concurrent.futures +import freezegun +from google.api_core import exceptions +import google.api_core.retry +import pytest + +from google.cloud.bigquery.client import _MIN_GET_QUERY_RESULTS_TIMEOUT +from google.cloud.bigquery.job import QueryJob +from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT +from google.cloud.bigquery.table import RowIterator + +from ..helpers import make_connection +from .helpers import _make_client + + +PROJECT = "test-project" +JOB_ID = "test-job-id" +QUERY = "select count(*) from persons" + + +def _make_resource(started=False, ended=False, location="US"): + resource = { + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID, "location": location}, + "status": {"state": "PENDING"}, + "configuration": { + "query": {"query": QUERY}, + "job_type": "query", + }, + "statistics": {"creationTime": "1"}, + } + + if started: + resource["status"]["state"] = "RUNNING" + resource["statistics"]["startTime"] = "2" + + if ended: + resource["status"]["state"] = "DONE" + resource["statistics"]["endTime"] = "3" + + return resource + + +def test_result_w_custom_retry(global_time_lock): + query_resource = { + "jobComplete": False, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + } + query_resource_done = { + "jobComplete": True, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "2", + } + job_resource = _make_resource(started=True, location="asia-northeast1") + job_resource_done = _make_resource( + 
started=True, ended=True, location="asia-northeast1" + ) + job_resource_done["configuration"]["query"]["destinationTable"] = { + "projectId": "dest-project", + "datasetId": "dest_dataset", + "tableId": "dest_table", + } + + connection = make_connection( + # Also, for each API request, raise an exception that we know can + # be retried. Because of this, for each iteration we do: + # jobs.get (x2) & jobs.getQueryResults (x2) + exceptions.NotFound("not normally retriable"), + job_resource, + exceptions.NotFound("not normally retriable"), + query_resource, + # Query still not done, repeat both. + exceptions.NotFound("not normally retriable"), + job_resource, + exceptions.NotFound("not normally retriable"), + query_resource, + exceptions.NotFound("not normally retriable"), + # Query still not done, repeat both. + job_resource_done, + exceptions.NotFound("not normally retriable"), + query_resource_done, + # Query finished! + ) + client = _make_client(PROJECT, connection=connection) + job = QueryJob.from_api_repr(job_resource, client) + + custom_predicate = mock.Mock() + custom_predicate.return_value = True + custom_retry = google.api_core.retry.Retry( + initial=0.001, + maximum=0.001, + multiplier=1.0, + deadline=0.1, + predicate=custom_predicate, + ) + + assert isinstance(job.result(retry=custom_retry), RowIterator) + query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/queries/{JOB_ID}", + query_params={"maxResults": 0, "location": "asia-northeast1"}, + timeout=mock.ANY, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={"projection": "full", "location": "asia-northeast1"}, + timeout=DEFAULT_GET_JOB_TIMEOUT, + ) + + connection.api_request.assert_has_calls( + [ + reload_call, + reload_call, + query_results_call, + query_results_call, + reload_call, + reload_call, + query_results_call, + query_results_call, + reload_call, + reload_call, + query_results_call, + query_results_call, + ] + ) + + 
+def test_result_w_timeout_doesnt_raise(global_time_lock): + begun_resource = _make_resource() + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + } + done_resource = begun_resource.copy() + done_resource["status"] = {"state": "DONE"} + connection = make_connection(begun_resource, query_resource, done_resource) + client = _make_client(project=PROJECT, connection=connection) + job = QueryJob(JOB_ID, QUERY, client) + job._properties["jobReference"]["location"] = "US" + job._properties["status"] = {"state": "RUNNING"} + + with freezegun.freeze_time("1970-01-01 00:00:00", tick=False): + job.result( + timeout=1.125, + ) + + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={"projection": "full", "location": "US"}, + timeout=1.125, + ) + get_query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/queries/{JOB_ID}", + query_params={ + "maxResults": 0, + "location": "US", + }, + timeout=_MIN_GET_QUERY_RESULTS_TIMEOUT, + ) + connection.api_request.assert_has_calls( + [ + reload_call, + get_query_results_call, + reload_call, + ] + ) + + +def test_result_w_timeout_raises_concurrent_futures_timeout(global_time_lock): + begun_resource = _make_resource() + begun_resource["jobReference"]["location"] = "US" + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + } + done_resource = begun_resource.copy() + done_resource["status"] = {"state": "DONE"} + connection = make_connection(begun_resource, query_resource, done_resource) + client = _make_client(project=PROJECT, connection=connection) + job = QueryJob(JOB_ID, QUERY, client) + job._properties["jobReference"]["location"] = "US" + job._properties["status"] = {"state": "RUNNING"} + + with freezegun.freeze_time( + "1970-01-01 00:00:00", 
auto_tick_seconds=1.0 + ), pytest.raises(concurrent.futures.TimeoutError): + job.result(timeout=1.125) + + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={"projection": "full", "location": "US"}, + timeout=1.125, + ) + get_query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/queries/{JOB_ID}", + query_params={ + "maxResults": 0, + "location": "US", + }, + timeout=_MIN_GET_QUERY_RESULTS_TIMEOUT, + ) + connection.api_request.assert_has_calls( + [ + reload_call, + get_query_results_call, + ] + ) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 3a5d92dbd..e0e0438f5 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -22,6 +22,7 @@ from ..helpers import make_connection from .helpers import _make_client from .helpers import _make_job_resource +from google.cloud.bigquery.enums import DefaultPandasDTypes try: from google.cloud import bigquery_storage @@ -30,6 +31,7 @@ except (ImportError, AttributeError): bigquery_storage = None + try: import shapely except (ImportError, AttributeError): @@ -177,6 +179,8 @@ def test_to_dataframe_bqstorage_preserve_order(query, table_read_options_kwarg): parent="projects/test-project", read_session=expected_session, max_stream_count=1, # Use a single stream to preserve row order. + retry=None, + timeout=None, ) @@ -591,6 +595,8 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg): parent="projects/bqstorage-billing-project", read_session=expected_session, max_stream_count=0, # Use default number of streams for best performance. 
+ retry=None, + timeout=None, ) bqstorage_client.read_rows.assert_called_once_with(stream_id) @@ -642,13 +648,11 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression(): parent="projects/bqstorage-billing-project", read_session=expected_session, max_stream_count=0, + retry=None, + timeout=None, ) -@pytest.mark.skipif( - pandas.__version__.startswith("2."), - reason="pandas 2.0 changes some default dtypes and we haven't update the test to account for those", -) @pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") def test_to_dataframe_column_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -700,7 +704,6 @@ def test_to_dataframe_column_dtypes(): exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] assert list(df) == exp_columns # verify the column names - assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" assert df.seconds.dtype.name == "Int64" assert df.miles.dtype.name == "float64" assert df.km.dtype.name == "float16" @@ -708,6 +711,11 @@ def test_to_dataframe_column_dtypes(): assert df.complete.dtype.name == "boolean" assert df.date.dtype.name == "dbdate" + if pandas.__version__.startswith("2."): + assert df.start_timestamp.dtype.name == "datetime64[us, UTC]" + else: + assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" + def test_to_dataframe_column_date_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -1017,5 +1025,42 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, + timeout=None, ) assert df is row_iterator.to_geodataframe.return_value + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@mock.patch("google.cloud.bigquery.job.query.wait_for_query") +def 
test_query_job_to_dataframe_delegation(wait_for_query): + job = _make_job() + bqstorage_client = object() + timeout = 123.45 + + job.to_dataframe(bqstorage_client=bqstorage_client, timeout=timeout) + + wait_for_query.assert_called_once_with(job, None, max_results=None) + row_iterator = wait_for_query.return_value + row_iterator.to_dataframe.assert_called_once() + call_args = row_iterator.to_dataframe.call_args + assert call_args.kwargs["timeout"] == timeout + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +@mock.patch("google.cloud.bigquery.job.query.wait_for_query") +def test_query_job_to_arrow_delegation(wait_for_query): + job = _make_job() + bqstorage_client = object() + timeout = 123.45 + + job.to_arrow(bqstorage_client=bqstorage_client, timeout=timeout) + + wait_for_query.assert_called_once_with(job, None, max_results=None) + row_iterator = wait_for_query.return_value + row_iterator.to_arrow.assert_called_once() + call_args = row_iterator.to_arrow.call_args + assert call_args.kwargs["timeout"] == timeout diff --git a/tests/unit/job/test_query_stats.py b/tests/unit/job/test_query_stats.py index 61b278d43..c7c7a31e0 100644 --- a/tests/unit/job/test_query_stats.py +++ b/tests/unit/job/test_query_stats.py @@ -13,6 +13,7 @@ # limitations under the License. 
from .helpers import _Base +import datetime class TestBiEngineStats: @@ -520,3 +521,63 @@ def test_from_api_repr_normal(self): self.assertEqual(entry.pending_units, self.PENDING_UNITS) self.assertEqual(entry.completed_units, self.COMPLETED_UNITS) self.assertEqual(entry.slot_millis, self.SLOT_MILLIS) + + +class TestIncrementalResultStats: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import IncrementalResultStats + + return IncrementalResultStats + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def test_ctor_defaults(self): + stats = self._make_one() + assert stats.disabled_reason is None + assert stats.result_set_last_replace_time is None + assert stats.result_set_last_modify_time is None + + def test_from_api_repr_partial_stats(self): + klass = self._get_target_class() + stats = klass.from_api_repr({"disabledReason": "FOO"}) + + assert isinstance(stats, klass) + assert stats.disabled_reason == "FOO" + assert stats.result_set_last_replace_time is None + assert stats.result_set_last_modify_time is None + + def test_from_api_repr_full_stats(self): + klass = self._get_target_class() + stats = klass.from_api_repr( + { + "disabledReason": "BAR", + "resultSetLastReplaceTime": "2025-01-02T03:04:05.06Z", + "resultSetLastModifyTime": "2025-02-02T02:02:02.02Z", + } + ) + + assert isinstance(stats, klass) + assert stats.disabled_reason == "BAR" + assert stats.result_set_last_replace_time == datetime.datetime( + 2025, 1, 2, 3, 4, 5, 60000, tzinfo=datetime.timezone.utc + ) + assert stats.result_set_last_modify_time == datetime.datetime( + 2025, 2, 2, 2, 2, 2, 20000, tzinfo=datetime.timezone.utc + ) + + def test_from_api_repr_invalid_stats(self): + klass = self._get_target_class() + stats = klass.from_api_repr( + { + "disabledReason": "BAR", + "resultSetLastReplaceTime": "xxx", + "resultSetLastModifyTime": "yyy", + } + ) + + assert isinstance(stats, klass) + assert stats.disabled_reason == "BAR" + assert 
stats.result_set_last_replace_time is None + assert stats.result_set_last_modify_time is None diff --git a/tests/unit/routine/test_external_runtime_options.py b/tests/unit/routine/test_external_runtime_options.py new file mode 100644 index 000000000..d4edaae9a --- /dev/null +++ b/tests/unit/routine/test_external_runtime_options.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +@pytest.fixture +def target_class(): + from google.cloud.bigquery.routine.routine import ExternalRuntimeOptions + + return ExternalRuntimeOptions + + +@pytest.fixture +def object_under_test(target_class): + return target_class() + + +def test_ctor(target_class): + container_memory = "1G" + container_cpu = 1 + runtime_connection = ( + "projects/my-project/locations/us-central1/connections/my-connection" + ) + max_batching_rows = 100 + runtime_version = "python-3.11" + + instance = target_class( + container_memory=container_memory, + container_cpu=container_cpu, + runtime_connection=runtime_connection, + max_batching_rows=max_batching_rows, + runtime_version=runtime_version, + ) + + assert instance.container_memory == container_memory + assert instance.container_cpu == container_cpu + assert instance.runtime_connection == runtime_connection + assert instance.max_batching_rows == max_batching_rows + assert instance.runtime_version == runtime_version + + +def test_container_memory(object_under_test): + 
container_memory = "512Mi" + object_under_test.container_memory = container_memory + assert object_under_test.container_memory == container_memory + + +def test_container_cpu(object_under_test): + container_cpu = 1 + object_under_test.container_cpu = container_cpu + assert object_under_test.container_cpu == container_cpu + + +def test_runtime_connection(object_under_test): + runtime_connection = ( + "projects/my-project/locations/us-central1/connections/my-connection" + ) + object_under_test.runtime_connection = runtime_connection + assert object_under_test.runtime_connection == runtime_connection + + +def test_max_batching_rows(object_under_test): + max_batching_rows = 100 + object_under_test.max_batching_rows = max_batching_rows + assert object_under_test.max_batching_rows == max_batching_rows + + +def test_runtime_version(object_under_test): + runtime_version = "python-3.11" + object_under_test.runtime_version = runtime_version + assert object_under_test.runtime_version == runtime_version + + +def test_ctor_w_properties(target_class): + properties = { + "containerMemory": "1G", + "containerCpu": 1, + } + instance = target_class(_properties=properties) + assert instance._properties == properties + + +def test_ne(target_class): + instance1 = target_class(container_memory="1G") + instance2 = target_class(container_memory="2G") + assert instance1 != instance2 + + +def test_ne_false(target_class): + instance1 = target_class(container_memory="1G") + instance2 = target_class(container_memory="1G") + assert not (instance1 != instance2) + + +def test_eq_not_implemented(object_under_test): + assert not (object_under_test == object()) + assert object_under_test != object() + + +def test_from_api_repr(target_class): + resource = { + "containerMemory": "1G", + "containerCpu": 1, + "runtimeConnection": "projects/my-project/locations/us-central1/connections/my-connection", + "maxBatchingRows": "100", + "runtimeVersion": "python-3.11", + } + instance = 
target_class.from_api_repr(resource) + + assert instance.container_memory == "1G" + assert instance.container_cpu == 1 + assert ( + instance.runtime_connection + == "projects/my-project/locations/us-central1/connections/my-connection" + ) + assert instance.max_batching_rows == 100 + assert instance.runtime_version == "python-3.11" + + +def test_to_api_repr(target_class): + instance = target_class( + container_memory="1G", + container_cpu=1, + runtime_connection="projects/my-project/locations/us-central1/connections/my-connection", + max_batching_rows=100, + runtime_version="python-3.11", + ) + resource = instance.to_api_repr() + + assert resource == { + "containerMemory": "1G", + "containerCpu": 1, + "runtimeConnection": "projects/my-project/locations/us-central1/connections/my-connection", + "maxBatchingRows": "100", + "runtimeVersion": "python-3.11", + } + + +def test_repr(target_class): + instance = target_class( + container_memory="1G", + container_cpu=1, + ) + expected_repr = ( + "ExternalRuntimeOptions(container_cpu=1, container_memory='1G', " + "max_batching_rows=None, runtime_connection=None, runtime_version=None)" + ) + assert repr(instance) == expected_repr + + +def test_invalid_container_memory(object_under_test): + with pytest.raises(ValueError, match="container_memory must be a string or None."): + object_under_test.container_memory = 123 + + +def test_invalid_container_cpu(object_under_test): + with pytest.raises(ValueError, match="container_cpu must be an integer or None."): + object_under_test.container_cpu = "1" + + +def test_invalid_runtime_connection(object_under_test): + with pytest.raises( + ValueError, match="runtime_connection must be a string or None." + ): + object_under_test.runtime_connection = 123 + + +def test_invalid_max_batching_rows(object_under_test): + with pytest.raises( + ValueError, match="max_batching_rows must be an integer or None." 
+ ): + object_under_test.max_batching_rows = "100" + + +def test_invalid_runtime_version(object_under_test): + with pytest.raises(ValueError, match="runtime_version must be a string or None."): + object_under_test.runtime_version = 123 diff --git a/tests/unit/routine/test_routine.py b/tests/unit/routine/test_routine.py index acd3bc40e..965c6b2eb 100644 --- a/tests/unit/routine/test_routine.py +++ b/tests/unit/routine/test_routine.py @@ -81,6 +81,13 @@ def test_ctor_w_properties(target_class): max_batching_rows=99, user_defined_context={"foo": "bar"}, ) + external_runtime_options = bigquery.ExternalRuntimeOptions( + container_memory="1G", + container_cpu=1, + runtime_connection="projects/p/locations/l/connections/c", + max_batching_rows=100, + runtime_version="python-3.11", + ) actual_routine = target_class( routine_id, @@ -92,6 +99,7 @@ def test_ctor_w_properties(target_class): description=description, determinism_level=determinism_level, remote_function_options=options, + external_runtime_options=external_runtime_options, ) ref = RoutineReference.from_string(routine_id) @@ -106,6 +114,7 @@ def test_ctor_w_properties(target_class): actual_routine.determinism_level == bigquery.DeterminismLevel.NOT_DETERMINISTIC ) assert actual_routine.remote_function_options == options + assert actual_routine.external_runtime_options == external_runtime_options def test_ctor_invalid_remote_function_options(target_class): @@ -119,6 +128,17 @@ def test_ctor_invalid_remote_function_options(target_class): ) +def test_ctor_invalid_external_runtime_options(target_class): + with pytest.raises( + ValueError, + match=".*must be google.cloud.bigquery.routine.ExternalRuntimeOptions.*", + ): + target_class( + "my-proj.my_dset.my_routine", + external_runtime_options=object(), + ) + + def test_from_api_repr(target_class): from google.cloud.bigquery.routine import RoutineArgument from google.cloud.bigquery.routine import RoutineReference @@ -155,6 +175,13 @@ def test_from_api_repr(target_class): 
}, }, "dataGovernanceType": "DATA_MASKING", + "externalRuntimeOptions": { + "containerMemory": "1G", + "containerCpu": 1, + "runtimeConnection": "projects/p/locations/l/connections/c", + "maxBatchingRows": 100, + "runtimeVersion": "python-3.11", + }, } actual_routine = target_class.from_api_repr(resource) @@ -194,6 +221,14 @@ def test_from_api_repr(target_class): assert actual_routine.remote_function_options.max_batching_rows == 50 assert actual_routine.remote_function_options.user_defined_context == {"foo": "bar"} assert actual_routine.data_governance_type == "DATA_MASKING" + assert actual_routine.external_runtime_options.container_memory == "1G" + assert actual_routine.external_runtime_options.container_cpu == 1 + assert ( + actual_routine.external_runtime_options.runtime_connection + == "projects/p/locations/l/connections/c" + ) + assert actual_routine.external_runtime_options.max_batching_rows == 100 + assert actual_routine.external_runtime_options.runtime_version == "python-3.11" def test_from_api_repr_tvf_function(target_class): @@ -297,6 +332,7 @@ def test_from_api_repr_w_minimal_resource(target_class): assert actual_routine.determinism_level is None assert actual_routine.remote_function_options is None assert actual_routine.data_governance_type is None + assert actual_routine.external_runtime_options is None def test_from_api_repr_w_unknown_fields(target_class): @@ -571,6 +607,12 @@ def test_set_remote_function_options_w_none(object_under_test): assert object_under_test._properties["remoteFunctionOptions"] is None +def test_set_external_runtime_options_w_none(object_under_test): + object_under_test.external_runtime_options = None + assert object_under_test.external_runtime_options is None + assert object_under_test._properties["externalRuntimeOptions"] is None + + def test_set_data_governance_type_w_none(object_under_test): object_under_test.data_governance_type = None assert object_under_test.data_governance_type is None diff --git 
a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 0a307498f..4e53236e3 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import base64 import datetime import decimal import json @@ -24,6 +23,7 @@ from unittest import mock import google.api_core +from google.cloud.bigquery._helpers import _isinstance_or_raise @pytest.mark.skipif( @@ -132,484 +132,6 @@ def test_w_value(self): self.assertTrue(self._call_fut(object(), object())) -class Test_int_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _int_from_json - - return _int_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("42", object()) - self.assertEqual(coerced, 42) - - def test_w_float_value(self): - coerced = self._call_fut(42, object()) - self.assertEqual(coerced, 42) - - -class Test_json_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _json_from_json - - return _json_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_json_field(self): - data_field = _Field("REQUIRED", "data", "JSON") - - value = json.dumps( - {"v": {"key": "value"}}, - ) - - expected_output = {"v": {"key": "value"}} - coerced_output = self._call_fut(value, data_field) - self.assertEqual(coerced_output, expected_output) - - def test_w_string_value(self): - coerced = self._call_fut('"foo"', object()) - 
self.assertEqual(coerced, "foo") - - -class Test_float_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _float_from_json - - return _float_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("3.1415", object()) - self.assertEqual(coerced, 3.1415) - - def test_w_float_value(self): - coerced = self._call_fut(3.1415, object()) - self.assertEqual(coerced, 3.1415) - - -class Test_decimal_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _decimal_from_json - - return _decimal_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("3.1415", object()) - self.assertEqual(coerced, decimal.Decimal("3.1415")) - - def test_w_float_value(self): - coerced = self._call_fut(3.1415, object()) - # There is no exact float representation of 3.1415. 
- self.assertEqual(coerced, decimal.Decimal(3.1415)) - - -class Test_bool_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _bool_from_json - - return _bool_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(AttributeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_value_t(self): - coerced = self._call_fut("T", object()) - self.assertTrue(coerced) - - def test_w_value_true(self): - coerced = self._call_fut("True", object()) - self.assertTrue(coerced) - - def test_w_value_1(self): - coerced = self._call_fut("1", object()) - self.assertTrue(coerced) - - def test_w_value_other(self): - coerced = self._call_fut("f", object()) - self.assertFalse(coerced) - - -class Test_string_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _string_from_json - - return _string_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - self.assertIsNone(self._call_fut(None, _Field("REQUIRED"))) - - def test_w_string_value(self): - coerced = self._call_fut("Wonderful!", object()) - self.assertEqual(coerced, "Wonderful!") - - -class Test_bytes_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _bytes_from_json - - return _bytes_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_base64_encoded_bytes(self): - expected = b"Wonderful!" 
- encoded = base64.standard_b64encode(expected) - coerced = self._call_fut(encoded, object()) - self.assertEqual(coerced, expected) - - def test_w_base64_encoded_text(self): - expected = b"Wonderful!" - encoded = base64.standard_b64encode(expected).decode("ascii") - coerced = self._call_fut(encoded, object()) - self.assertEqual(coerced, expected) - - -class Test_timestamp_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _timestamp_from_json - - return _timestamp_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_int_value(self): - from google.cloud._helpers import _EPOCH - - coerced = self._call_fut("1234567", object()) - self.assertEqual( - coerced, _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) - ) - - def test_w_int_value(self): - from google.cloud._helpers import _EPOCH - - coerced = self._call_fut(1234567, object()) - self.assertEqual( - coerced, _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) - ) - - -class Test_timestamp_query_param_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery import _helpers - - return _helpers._timestamp_query_param_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_timestamp_valid(self): - from google.cloud._helpers import UTC - - samples = [ - ( - "2016-12-20 15:58:27.339328+00:00", - datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC), - ), - ( - "2016-12-20 15:58:27+00:00", - datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC), - ), - ( - "2016-12-20T15:58:27.339328+00:00", - datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC), - ), - ( - "2016-12-20T15:58:27+00:00", - 
datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC), - ), - ( - "2016-12-20 15:58:27.339328Z", - datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC), - ), - ( - "2016-12-20 15:58:27Z", - datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC), - ), - ( - "2016-12-20T15:58:27.339328Z", - datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC), - ), - ( - "2016-12-20T15:58:27Z", - datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC), - ), - ] - for timestamp_str, expected_result in samples: - self.assertEqual( - self._call_fut(timestamp_str, _Field("NULLABLE")), expected_result - ) - - def test_w_timestamp_invalid(self): - with self.assertRaises(ValueError): - self._call_fut("definitely-not-a-timestamp", _Field("NULLABLE")) - - -class Test_datetime_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _datetime_from_json - - return _datetime_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("2016-12-02T18:51:33", object()) - self.assertEqual(coerced, datetime.datetime(2016, 12, 2, 18, 51, 33)) - - def test_w_microseconds(self): - coerced = self._call_fut("2015-05-22T10:11:12.987654", object()) - self.assertEqual(coerced, datetime.datetime(2015, 5, 22, 10, 11, 12, 987654)) - - -class Test_date_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _date_from_json - - return _date_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = 
self._call_fut("1987-09-22", object()) - self.assertEqual(coerced, datetime.date(1987, 9, 22)) - - -class Test_time_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _time_from_json - - return _time_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("12:12:27", object()) - self.assertEqual(coerced, datetime.time(12, 12, 27)) - - def test_w_subsecond_string_value(self): - coerced = self._call_fut("12:12:27.123456", object()) - self.assertEqual(coerced, datetime.time(12, 12, 27, 123456)) - - def test_w_bogus_string_value(self): - with self.assertRaises(ValueError): - self._call_fut("12:12:27.123", object()) - - -class Test_range_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _range_from_json - - return _range_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_wrong_format(self): - range_field = _Field( - "NULLABLE", - field_type="RANGE", - range_element_type=_Field("NULLABLE", element_type="DATE"), - ) - with self.assertRaises(ValueError): - self._call_fut("[2009-06-172019-06-17)", range_field) - - def test_w_wrong_element_type(self): - range_field = _Field( - "NULLABLE", - field_type="RANGE", - range_element_type=_Field("NULLABLE", element_type="TIME"), - ) - with self.assertRaises(ValueError): - self._call_fut("[15:31:38, 15:50:38)", range_field) - - def test_w_unbounded_value(self): - range_field = _Field( - "NULLABLE", - field_type="RANGE", - range_element_type=_Field("NULLABLE", 
element_type="DATE"), - ) - coerced = self._call_fut("[UNBOUNDED, 2019-06-17)", range_field) - self.assertEqual( - coerced, - {"start": None, "end": datetime.date(2019, 6, 17)}, - ) - - def test_w_date_value(self): - range_field = _Field( - "NULLABLE", - field_type="RANGE", - range_element_type=_Field("NULLABLE", element_type="DATE"), - ) - coerced = self._call_fut("[2009-06-17, 2019-06-17)", range_field) - self.assertEqual( - coerced, - { - "start": datetime.date(2009, 6, 17), - "end": datetime.date(2019, 6, 17), - }, - ) - - def test_w_datetime_value(self): - range_field = _Field( - "NULLABLE", - field_type="RANGE", - range_element_type=_Field("NULLABLE", element_type="DATETIME"), - ) - coerced = self._call_fut( - "[2009-06-17T13:45:30, 2019-06-17T13:45:30)", range_field - ) - self.assertEqual( - coerced, - { - "start": datetime.datetime(2009, 6, 17, 13, 45, 30), - "end": datetime.datetime(2019, 6, 17, 13, 45, 30), - }, - ) - - def test_w_timestamp_value(self): - from google.cloud._helpers import _EPOCH - - range_field = _Field( - "NULLABLE", - field_type="RANGE", - range_element_type=_Field("NULLABLE", element_type="TIMESTAMP"), - ) - coerced = self._call_fut("[1234567, 1234789)", range_field) - self.assertEqual( - coerced, - { - "start": _EPOCH + datetime.timedelta(seconds=1, microseconds=234567), - "end": _EPOCH + datetime.timedelta(seconds=1, microseconds=234789), - }, - ) - - -class Test_record_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _record_from_json - - return _record_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_nullable_subfield_none(self): - subfield = _Field("NULLABLE", "age", "INTEGER") - field = _Field("REQUIRED", fields=[subfield]) - value = {"f": [{"v": None}]} - coerced = 
self._call_fut(value, field) - self.assertEqual(coerced, {"age": None}) - - def test_w_scalar_subfield(self): - subfield = _Field("REQUIRED", "age", "INTEGER") - field = _Field("REQUIRED", fields=[subfield]) - value = {"f": [{"v": 42}]} - coerced = self._call_fut(value, field) - self.assertEqual(coerced, {"age": 42}) - - def test_w_scalar_subfield_geography(self): - subfield = _Field("REQUIRED", "geo", "GEOGRAPHY") - field = _Field("REQUIRED", fields=[subfield]) - value = {"f": [{"v": "POINT(1, 2)"}]} - coerced = self._call_fut(value, field) - self.assertEqual(coerced, {"geo": "POINT(1, 2)"}) - - def test_w_repeated_subfield(self): - subfield = _Field("REPEATED", "color", "STRING") - field = _Field("REQUIRED", fields=[subfield]) - value = {"f": [{"v": [{"v": "red"}, {"v": "yellow"}, {"v": "blue"}]}]} - coerced = self._call_fut(value, field) - self.assertEqual(coerced, {"color": ["red", "yellow", "blue"]}) - - def test_w_record_subfield(self): - full_name = _Field("REQUIRED", "full_name", "STRING") - area_code = _Field("REQUIRED", "area_code", "STRING") - local_number = _Field("REQUIRED", "local_number", "STRING") - rank = _Field("REQUIRED", "rank", "INTEGER") - phone = _Field( - "NULLABLE", "phone", "RECORD", fields=[area_code, local_number, rank] - ) - person = _Field("REQUIRED", "person", "RECORD", fields=[full_name, phone]) - value = { - "f": [ - {"v": "Phred Phlyntstone"}, - {"v": {"f": [{"v": "800"}, {"v": "555-1212"}, {"v": 1}]}}, - ] - } - expected = { - "full_name": "Phred Phlyntstone", - "phone": {"area_code": "800", "local_number": "555-1212", "rank": 1}, - } - coerced = self._call_fut(value, person) - self.assertEqual(coerced, expected) - - class Test_field_to_index_mapping(unittest.TestCase): def _call_fut(self, schema): from google.cloud.bigquery._helpers import _field_to_index_mapping @@ -1661,3 +1183,34 @@ def test_w_env_var(self): host = self._call_fut() self.assertEqual(host, HOST) + + +class Test__isinstance_or_raise: + @pytest.mark.parametrize( + 
"value,dtype,none_allowed,expected", + [ + (None, str, True, None), + ("hello world.uri", str, True, "hello world.uri"), + ("hello world.uri", str, False, "hello world.uri"), + (None, (str, float), True, None), + ("hello world.uri", (str, float), True, "hello world.uri"), + ("hello world.uri", (str, float), False, "hello world.uri"), + ], + ) + def test__valid_isinstance_or_raise(self, value, dtype, none_allowed, expected): + result = _isinstance_or_raise(value, dtype, none_allowed=none_allowed) + assert result == expected + + @pytest.mark.parametrize( + "value,dtype,none_allowed,expected", + [ + (None, str, False, pytest.raises(TypeError)), + ({"key": "value"}, str, True, pytest.raises(TypeError)), + ({"key": "value"}, str, False, pytest.raises(TypeError)), + ({"key": "value"}, (str, float), True, pytest.raises(TypeError)), + ({"key": "value"}, (str, float), False, pytest.raises(TypeError)), + ], + ) + def test__invalid_isinstance_or_raise(self, value, dtype, none_allowed, expected): + with expected: + _isinstance_or_raise(value, dtype, none_allowed=none_allowed) diff --git a/tests/unit/test__job_helpers.py b/tests/unit/test__job_helpers.py index 96914d9f9..19390c7ec 100644 --- a/tests/unit/test__job_helpers.py +++ b/tests/unit/test__job_helpers.py @@ -15,7 +15,6 @@ from typing import Any, Dict, Optional from unittest import mock -import freezegun import google.api_core.exceptions from google.api_core import retry as retries import pytest @@ -194,6 +193,26 @@ def make_query_response( make_query_request({"maximumBytesBilled": "987654"}), id="job_config-with-maximum_bytes_billed", ), + pytest.param( + job_query.QueryJobConfig( + write_incremental_results=True, + ), + make_query_request({"writeIncrementalResults": True}), + id="job_config-with-incremental-results", + ), + pytest.param( + job_query.QueryJobConfig( + reservation="foo", + max_slots=100, + ), + make_query_request( + { + "maxSlots": "100", + "reservation": "foo", + } + ), + 
id="job_config-with-reservation-and-slots", + ), ), ) def test__to_query_request(job_config, expected): @@ -316,6 +335,7 @@ def test_query_jobs_query_defaults(): assert request["location"] == "asia-northeast1" assert request["formatOptions"]["useInt64Timestamp"] is True assert "timeoutMs" not in request + assert "timestampOutputFormat" not in request["formatOptions"] def test_query_jobs_query_sets_format_options(): @@ -381,6 +401,35 @@ def test_query_jobs_query_sets_timeout(timeout, expected_timeout): assert request["timeoutMs"] == expected_timeout +def test_query_jobs_query_picosecond(): + mock_client = mock.create_autospec(Client) + mock_retry = mock.create_autospec(retries.Retry) + mock_job_retry = mock.create_autospec(retries.Retry) + mock_client._call_api.return_value = { + "jobReference": { + "projectId": "test-project", + "jobId": "abc", + "location": "asia-northeast1", + } + } + _job_helpers.query_jobs_query( + mock_client, + "SELECT * FROM test", + None, + "asia-northeast1", + "test-project", + mock_retry, + None, + mock_job_retry, + enums.TimestampPrecision.PICOSECOND, + ) + + _, call_kwargs = mock_client._call_api.call_args + request = call_kwargs["data"] + assert "useInt64Timestamp" not in request["formatOptions"] + assert request["formatOptions"]["timestampOutputFormat"] == "ISO8601_STRING" + + def test_query_and_wait_uses_jobs_insert(): """With unsupported features, call jobs.insert instead of jobs.query.""" client = mock.create_autospec(Client) @@ -443,117 +492,9 @@ def test_query_and_wait_uses_jobs_insert(): ) -def test_query_and_wait_retries_job(): - freezegun.freeze_time(auto_tick_seconds=100) - client = mock.create_autospec(Client) - client._call_api.__name__ = "_call_api" - client._call_api.__qualname__ = "Client._call_api" - client._call_api.__annotations__ = {} - client._call_api.__type_params__ = () - client._call_api.side_effect = ( - google.api_core.exceptions.BadGateway("retry me"), - 
google.api_core.exceptions.InternalServerError("job_retry me"), - google.api_core.exceptions.BadGateway("retry me"), - { - "jobReference": { - "projectId": "response-project", - "jobId": "abc", - "location": "response-location", - }, - "jobComplete": True, - "schema": { - "fields": [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "age", "type": "INT64", "mode": "NULLABLE"}, - ], - }, - "rows": [ - {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]}, - {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]}, - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - ], - }, - ) - rows = _job_helpers.query_and_wait( - client, - query="SELECT 1", - location="request-location", - project="request-project", - job_config=None, - page_size=None, - max_results=None, - retry=retries.Retry( - lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), - multiplier=1.0, - ).with_deadline( - 200.0 - ), # Since auto_tick_seconds is 100, we should get at least 1 retry. - job_retry=retries.Retry( - lambda exc: isinstance(exc, google.api_core.exceptions.InternalServerError), - multiplier=1.0, - ).with_deadline(600.0), - ) - assert len(list(rows)) == 4 - - # For this code path, where the query has finished immediately, we should - # only be calling the jobs.query API and no other request path. 
- request_path = "/projects/request-project/queries" - for call in client._call_api.call_args_list: - _, kwargs = call - assert kwargs["method"] == "POST" - assert kwargs["path"] == request_path - - -@freezegun.freeze_time(auto_tick_seconds=100) -def test_query_and_wait_retries_job_times_out(): - client = mock.create_autospec(Client) - client._call_api.__name__ = "_call_api" - client._call_api.__qualname__ = "Client._call_api" - client._call_api.__annotations__ = {} - client._call_api.__type_params__ = () - client._call_api.side_effect = ( - google.api_core.exceptions.BadGateway("retry me"), - google.api_core.exceptions.InternalServerError("job_retry me"), - google.api_core.exceptions.BadGateway("retry me"), - google.api_core.exceptions.InternalServerError("job_retry me"), - ) - - with pytest.raises(google.api_core.exceptions.RetryError) as exc_info: - _job_helpers.query_and_wait( - client, - query="SELECT 1", - location="request-location", - project="request-project", - job_config=None, - page_size=None, - max_results=None, - retry=retries.Retry( - lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), - multiplier=1.0, - ).with_deadline( - 200.0 - ), # Since auto_tick_seconds is 100, we should get at least 1 retry. - job_retry=retries.Retry( - lambda exc: isinstance( - exc, google.api_core.exceptions.InternalServerError - ), - multiplier=1.0, - ).with_deadline(400.0), - ) - - assert isinstance( - exc_info.value.cause, google.api_core.exceptions.InternalServerError - ) - - -def test_query_and_wait_sets_job_creation_mode(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv( - "QUERY_PREVIEW_ENABLED", - # The comparison should be case insensitive. 
- "TrUe", - ) +def test_query_and_wait_sets_job_creation_mode(): client = mock.create_autospec(Client) + client.default_job_creation_mode = "JOB_CREATION_OPTIONAL" client._call_api.return_value = { "jobReference": { "projectId": "response-project", @@ -635,6 +576,7 @@ def test_query_and_wait_sets_location(): "useInt64Timestamp": True, }, "requestId": mock.ANY, + "jobCreationMode": mock.ANY, }, timeout=None, ) @@ -651,6 +593,7 @@ def test_query_and_wait_sets_location(): ) def test_query_and_wait_sets_max_results(max_results, page_size, expected): client = mock.create_autospec(Client) + client.default_job_creation_mode = None client._call_api.return_value = { "jobReference": { "projectId": "response-project", @@ -696,6 +639,7 @@ def test_query_and_wait_sets_max_results(max_results, page_size, expected): def test_query_and_wait_caches_completed_query_results_one_page(): client = mock.create_autospec(Client) + client.default_job_creation_mode = None client._call_api.return_value = { "jobReference": { "projectId": "response-project", @@ -761,6 +705,7 @@ def test_query_and_wait_caches_completed_query_results_one_page(): def test_query_and_wait_caches_completed_query_results_one_page_no_rows(): client = mock.create_autospec(Client) + client.default_job_creation_mode = None client._call_api.return_value = { "jobReference": { "projectId": "response-project", @@ -1141,6 +1086,26 @@ def test_make_job_id_w_job_id_overrides_prefix(): False, id="priority=BATCH", ), + pytest.param( + job_query.QueryJobConfig(write_incremental_results=True), + True, + id="write_incremental_results", + ), + pytest.param( + job_query.QueryJobConfig(job_timeout_ms=1000), + True, + id="job_timeout_ms", + ), + pytest.param( + job_query.QueryJobConfig(reservation="foo"), + True, + id="reservation", + ), + pytest.param( + job_query.QueryJobConfig(max_slots=20), + True, + id="max_slots", + ), ), ) def test_supported_by_jobs_query_from_queryjobconfig( diff --git a/tests/unit/test__job_helpers_retry.py 
b/tests/unit/test__job_helpers_retry.py new file mode 100644 index 000000000..3ea4b1aae --- /dev/null +++ b/tests/unit/test__job_helpers_retry.py @@ -0,0 +1,122 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import freezegun +import google.api_core.exceptions +from google.api_core import retry as retries +import pytest + +from google.cloud.bigquery import _job_helpers + +from . import helpers + + +def test_query_and_wait_retries_job(global_time_lock): + with freezegun.freeze_time(auto_tick_seconds=100): + conn = helpers.make_connection( + google.api_core.exceptions.BadGateway("retry me"), + google.api_core.exceptions.InternalServerError("job_retry me"), + google.api_core.exceptions.BadGateway("retry me"), + { + "jobReference": { + "projectId": "response-project", + "jobId": "abc", + "location": "response-location", + }, + "jobComplete": True, + "schema": { + "fields": [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INT64", "mode": "NULLABLE"}, + ], + }, + "rows": [ + {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]}, + {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]}, + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ], + }, + ) + client = helpers.make_client(project="client-project") + client._connection = conn + rows = _job_helpers.query_and_wait( + client, + query="SELECT 1", + location="request-location", + project="request-project", + 
job_config=None, + page_size=None, + max_results=None, + retry=retries.Retry( + lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), + multiplier=1.0, + ).with_deadline( + 200.0 + ), # Since auto_tick_seconds is 100, we should get at least 1 retry. + job_retry=retries.Retry( + lambda exc: isinstance( + exc, google.api_core.exceptions.InternalServerError + ), + multiplier=1.0, + ).with_deadline(600.0), + ) + assert len(list(rows)) == 4 + + # For this code path, where the query has finished immediately, we should + # only be calling the jobs.query API and no other request path. + request_path = "/projects/request-project/queries" + for call in client._connection.api_request.call_args_list: + _, kwargs = call + assert kwargs["method"] == "POST" + assert kwargs["path"] == request_path + + +def test_query_and_wait_retries_job_times_out(global_time_lock): + with freezegun.freeze_time(auto_tick_seconds=100): + conn = helpers.make_connection( + google.api_core.exceptions.BadGateway("retry me"), + google.api_core.exceptions.InternalServerError("job_retry me"), + google.api_core.exceptions.BadGateway("retry me"), + google.api_core.exceptions.InternalServerError("job_retry me"), + ) + client = helpers.make_client(project="client-project") + client._connection = conn + + with pytest.raises(google.api_core.exceptions.RetryError) as exc_info: + _job_helpers.query_and_wait( + client, + query="SELECT 1", + location="request-location", + project="request-project", + job_config=None, + page_size=None, + max_results=None, + retry=retries.Retry( + lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), + multiplier=1.0, + ).with_deadline( + 200.0 + ), # Since auto_tick_seconds is 100, we should get at least 1 retry. 
+ job_retry=retries.Retry( + lambda exc: isinstance( + exc, google.api_core.exceptions.InternalServerError + ), + multiplier=1.0, + ).with_deadline(400.0), + ) + + assert isinstance( + exc_info.value.cause, google.api_core.exceptions.InternalServerError + ) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 58d2b73b3..6ec62c0b6 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -13,18 +13,19 @@ # limitations under the License. import collections +import concurrent.futures import datetime import decimal import functools +import gc import operator import queue +import time +from typing import Union from unittest import mock import warnings -try: - import importlib.metadata as metadata -except ImportError: - import importlib_metadata as metadata +import importlib.metadata as metadata try: import pandas @@ -33,6 +34,11 @@ except ImportError: pandas = None +try: + import pandas_gbq.schema.pandas_to_bigquery +except ImportError: + pandas_gbq = None + try: import geopandas except ImportError: @@ -46,6 +52,7 @@ from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery import schema +from google.cloud.bigquery._pandas_helpers import determine_requested_streams pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() @@ -881,7 +888,7 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name( @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_dataframe_to_json_generator(module_under_test): - utcnow = datetime.datetime.utcnow() + utcnow = datetime.datetime.now(datetime.timezone.utc) dataframe = pandas.DataFrame( { "a_series": [1, 2, 3, 4], @@ -1278,7 +1285,21 @@ def test_dataframe_to_parquet_compression_method(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_dataframe_to_bq_schema_w_named_index(module_under_test): 
+@pytest.mark.skipif(pandas_gbq is None, reason="Requires `pandas-gbq`") +def test_dataframe_to_bq_schema_returns_schema_with_pandas_gbq( + module_under_test, monkeypatch +): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + dataframe = pandas.DataFrame({"field00": ["foo", "bar"]}) + got = module_under_test.dataframe_to_bq_schema(dataframe, []) + # Don't assert beyond this, since pandas-gbq is now source of truth. + assert got is not None + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_dataframe_to_bq_schema_w_named_index(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + df_data = collections.OrderedDict( [ ("str_column", ["hello", "world"]), @@ -1289,7 +1310,8 @@ def test_dataframe_to_bq_schema_w_named_index(module_under_test): index = pandas.Index(["a", "b"], name="str_index") dataframe = pandas.DataFrame(df_data, index=index) - returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, []) + with pytest.warns(FutureWarning, match="pandas-gbq"): + returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, []) expected_schema = ( schema.SchemaField("str_index", "STRING", "NULLABLE"), @@ -1301,7 +1323,9 @@ def test_dataframe_to_bq_schema_w_named_index(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_dataframe_to_bq_schema_w_multiindex(module_under_test): +def test_dataframe_to_bq_schema_w_multiindex(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + df_data = collections.OrderedDict( [ ("str_column", ["hello", "world"]), @@ -1318,7 +1342,8 @@ def test_dataframe_to_bq_schema_w_multiindex(module_under_test): ) dataframe = pandas.DataFrame(df_data, index=index) - returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, []) + with pytest.warns(FutureWarning, match="pandas-gbq"): + returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, []) 
expected_schema = ( schema.SchemaField("str_index", "STRING", "NULLABLE"), @@ -1332,7 +1357,9 @@ def test_dataframe_to_bq_schema_w_multiindex(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_dataframe_to_bq_schema_w_bq_schema(module_under_test): +def test_dataframe_to_bq_schema_w_bq_schema(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + df_data = collections.OrderedDict( [ ("str_column", ["hello", "world"]), @@ -1347,7 +1374,10 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test): {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"}, ] - returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, dict_schema) + with pytest.warns(FutureWarning, match="pandas-gbq"): + returned_schema = module_under_test.dataframe_to_bq_schema( + dataframe, dict_schema + ) expected_schema = ( schema.SchemaField("str_column", "STRING", "NULLABLE"), @@ -1358,7 +1388,11 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test): +def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow( + module_under_test, monkeypatch +): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + dataframe = pandas.DataFrame( data=[ {"id": 10, "status": "FOO", "execution_date": datetime.date(2019, 5, 10)}, @@ -1386,7 +1420,11 @@ def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") -def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test): +def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow( + module_under_test, monkeypatch +): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + dataframe = pandas.DataFrame( data=[ {"id": 
10, "status": "FOO", "created_at": datetime.date(2019, 5, 10)}, @@ -1416,7 +1454,9 @@ def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") -def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test): +def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + dataframe = pandas.DataFrame( data=[ {"struct_field": {"one": 2}, "status": "FOO"}, @@ -1440,9 +1480,11 @@ def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test): @pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") -def test_dataframe_to_bq_schema_geography(module_under_test): +def test_dataframe_to_bq_schema_geography(module_under_test, monkeypatch): from shapely import wkt + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + df = geopandas.GeoDataFrame( pandas.DataFrame( dict( @@ -1453,7 +1495,10 @@ def test_dataframe_to_bq_schema_geography(module_under_test): ), geometry="geo1", ) - bq_schema = module_under_test.dataframe_to_bq_schema(df, []) + + with pytest.warns(FutureWarning, match="pandas-gbq"): + bq_schema = module_under_test.dataframe_to_bq_schema(df, []) + assert bq_schema == ( schema.SchemaField("name", "STRING"), schema.SchemaField("geo1", "GEOGRAPHY"), @@ -1525,31 +1570,7 @@ def test_augment_schema_type_detection_succeeds(module_under_test): # set to "datetime64[ns]", and pyarrow converts that to pyarrow.TimestampArray. # We thus cannot expect to get a DATETIME date when converting back to the # BigQuery type. 
- - current_schema = ( - schema.SchemaField("bool_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("int_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("float_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("time_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("timestamp_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("date_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("string_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), - ) - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - - # there should be no relevant warnings - unwanted_warnings = [ - warning for warning in warned if "Pyarrow could not" in str(warning) - ] - assert not unwanted_warnings - - # the augmented schema must match the expected - expected_schema = ( + expected_schemas = ( schema.SchemaField("bool_field", field_type="BOOL", mode="NULLABLE"), schema.SchemaField("int_field", field_type="INT64", mode="NULLABLE"), schema.SchemaField("float_field", field_type="FLOAT64", mode="NULLABLE"), @@ -1564,8 +1585,13 @@ def test_augment_schema_type_detection_succeeds(module_under_test): ), ) - by_name = operator.attrgetter("name") - assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) + for col_name, expected_schema in zip(dataframe, expected_schemas): + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field == expected_schema @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -1596,30 +1622,20 @@ def 
test_augment_schema_repeated_fields(module_under_test): ] ) - current_schema = ( - schema.SchemaField("string_array", field_type=None, mode="NULLABLE"), - schema.SchemaField("timestamp_array", field_type=None, mode="NULLABLE"), - schema.SchemaField("datetime_array", field_type=None, mode="NULLABLE"), - ) - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - - # there should be no relevant warnings - unwanted_warnings = [ - warning for warning in warned if "Pyarrow could not" in str(warning) - ] - assert not unwanted_warnings - # the augmented schema must match the expected - expected_schema = ( + expected_schemas = ( schema.SchemaField("string_array", field_type="STRING", mode="REPEATED"), schema.SchemaField("timestamp_array", field_type="TIMESTAMP", mode="REPEATED"), schema.SchemaField("datetime_array", field_type="DATETIME", mode="REPEATED"), ) - by_name = operator.attrgetter("name") - assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) + for col_name, expected_schema in zip(dataframe, expected_schemas): + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field == expected_schema @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -1638,24 +1654,21 @@ def test_augment_schema_type_detection_fails(module_under_test): }, ] ) - current_schema = [ - schema.SchemaField("status", field_type="STRING", mode="NULLABLE"), - schema.SchemaField("struct_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("struct_field_2", field_type=None, mode="NULLABLE"), - ] - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - assert augmented_schema is None + expected_schemas = ( + schema.SchemaField("status", 
field_type="STRING", mode="NULLABLE"), + # Could not determine the type of these columns + None, + None, + ) - expected_warnings = [ - warning for warning in warned if "could not determine" in str(warning) - ] - assert len(expected_warnings) == 1 - warning_msg = str(expected_warnings[0]) - assert "pyarrow" in warning_msg.lower() - assert "struct_field" in warning_msg and "struct_field_2" in warning_msg + for col_name, expected_schema in zip(dataframe, expected_schemas): + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field == expected_schema @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -1663,23 +1676,14 @@ def test_augment_schema_type_detection_fails_array_data(module_under_test): dataframe = pandas.DataFrame( data=[{"all_none_array": [None, float("NaN")], "empty_array": []}] ) - current_schema = [ - schema.SchemaField("all_none_array", field_type=None, mode="NULLABLE"), - schema.SchemaField("empty_array", field_type=None, mode="NULLABLE"), - ] - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - assert augmented_schema is None - - expected_warnings = [ - warning for warning in warned if "could not determine" in str(warning) - ] - assert len(expected_warnings) == 1 - warning_msg = str(expected_warnings[0]) - assert "pyarrow" in warning_msg.lower() - assert "all_none_array" in warning_msg and "empty_array" in warning_msg + for col_name in dataframe: + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field is None @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") @@ -1844,6 +1848,99 @@ def fake_download_stream( assert queue_used.maxsize == expected_maxsize 
+@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test__download_table_bqstorage_shuts_down_workers( + monkeypatch, + module_under_test, +): + """Regression test for https://github.com/googleapis/python-bigquery/issues/2032 + + Make sure that when the top-level iterator goes out of scope (is deleted), + the child threads are also stopped. + """ + pytest.importorskip("google.cloud.bigquery_storage_v1") + from google.cloud.bigquery import dataset + from google.cloud.bigquery import table + import google.cloud.bigquery_storage_v1.reader + import google.cloud.bigquery_storage_v1.types + + monkeypatch.setattr( + _versions_helpers.BQ_STORAGE_VERSIONS, "_installed_version", None + ) + monkeypatch.setattr(bigquery_storage, "__version__", "2.5.0") + + # Create a fake stream with a decent number of rows. + arrow_schema = pyarrow.schema( + [ + ("int_col", pyarrow.int64()), + ("str_col", pyarrow.string()), + ] + ) + arrow_rows = pyarrow.record_batch( + [ + pyarrow.array([0, 1, 2], type=pyarrow.int64()), + pyarrow.array(["a", "b", "c"], type=pyarrow.string()), + ], + schema=arrow_schema, + ) + session = google.cloud.bigquery_storage_v1.types.ReadSession() + session.data_format = "ARROW" + session.arrow_schema = {"serialized_schema": arrow_schema.serialize().to_pybytes()} + session.streams = [ + google.cloud.bigquery_storage_v1.types.ReadStream(name=name) + for name in ("stream/s0", "stream/s1", "stream/s2") + ] + bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + reader = mock.create_autospec( + google.cloud.bigquery_storage_v1.reader.ReadRowsStream, instance=True + ) + reader.__iter__.return_value = [ + google.cloud.bigquery_storage_v1.types.ReadRowsResponse( + arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, + arrow_record_batch={ + "serialized_record_batch": arrow_rows.serialize().to_pybytes() + }, + ) + for _ in range(100) + ] + reader.rows.return_value = 
google.cloud.bigquery_storage_v1.reader.ReadRowsIterable( + reader, read_session=session + ) + bqstorage_client.read_rows.return_value = reader + bqstorage_client.create_read_session.return_value = session + table_ref = table.TableReference( + dataset.DatasetReference("project-x", "dataset-y"), + "table-z", + ) + download_state = module_under_test._DownloadState() + assert download_state.started_workers == 0 + assert download_state.finished_workers == 0 + + result_gen = module_under_test._download_table_bqstorage( + "some-project", + table_ref, + bqstorage_client, + max_queue_size=1, + page_to_item=module_under_test._bqstorage_page_to_arrow, + download_state=download_state, + ) + + result_gen_iter = iter(result_gen) + next(result_gen_iter) + assert download_state.started_workers == 3 + assert download_state.finished_workers == 0 + + # Stop iteration early and simulate the variables going out of scope + # to be doubly sure that the worker threads are supposed to be cleaned up. + del result_gen, result_gen_iter + gc.collect() + + assert download_state.started_workers == 3 + assert download_state.finished_workers == 3 + + @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_download_arrow_row_iterator_unknown_field_type(module_under_test): fake_page = api_core.page_iterator.Page( @@ -2002,6 +2099,23 @@ def test_bq_to_arrow_field_type_override(module_under_test): ) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_field_set_repeated_nullable_false(module_under_test): + assert ( + module_under_test.bq_to_arrow_field( + schema.SchemaField("name", "STRING", mode="REPEATED") + ).nullable + is False + ) + + assert ( + module_under_test.bq_to_arrow_field( + schema.SchemaField("name", "STRING", mode="NULLABLE") + ).nullable + is True + ) + + @pytest.mark.parametrize( "field_type, metadata", [ @@ -2036,3 +2150,236 @@ def test_verify_pandas_imports_no_db_dtypes(module_under_test, 
monkeypatch): monkeypatch.setattr(module_under_test, "db_dtypes", None) with pytest.raises(ValueError, match="Please install the 'db-dtypes' package"): module_under_test.verify_pandas_imports() + + +@pytest.mark.parametrize( + "preserve_order, max_stream_count, expected_requested_streams", + [ + # If preserve_order is set/True, it takes precedence: + (True, 10, 1), # use 1 + (True, None, 1), # use 1 + # If preserve_order is not set check max_stream_count: + (False, 10, 10), # max_stream_count (X) takes precedence + (False, None, 0), # Unbounded (0) when both are unset + ], +) +def test_determine_requested_streams( + preserve_order: bool, + max_stream_count: Union[int, None], + expected_requested_streams: int, +): + """Tests various combinations of preserve_order and max_stream_count.""" + actual_requested_streams = determine_requested_streams( + preserve_order, max_stream_count + ) + assert actual_requested_streams == expected_requested_streams + + +def test_determine_requested_streams_invalid_max_stream_count(): + """Tests that a ValueError is raised if max_stream_count is negative.""" + with pytest.raises(ValueError): + determine_requested_streams(preserve_order=False, max_stream_count=-1) + + +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires google-cloud-bigquery-storage" +) +def test__download_table_bqstorage_w_timeout_error(module_under_test): + from google.cloud.bigquery import dataset + from google.cloud.bigquery import table + from unittest import mock + + mock_bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + fake_session = mock.Mock(streams=[mock.Mock()]) + mock_bqstorage_client.create_read_session.return_value = fake_session + + table_ref = table.TableReference( + dataset.DatasetReference("project-x", "dataset-y"), + "table-z", + ) + + def slow_download_stream( + download_state, bqstorage_client, session, stream, worker_queue, page_to_item + ): + # Block until the main thread sets 
done=True (which it will on timeout) + while not download_state.done: + time.sleep(0.01) + + with mock.patch.object( + module_under_test, "_download_table_bqstorage_stream", new=slow_download_stream + ): + # Use a very small timeout + result_gen = module_under_test._download_table_bqstorage( + "some-project", table_ref, mock_bqstorage_client, timeout=0.01 + ) + with pytest.raises(concurrent.futures.TimeoutError, match="timed out"): + list(result_gen) + + +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires google-cloud-bigquery-storage" +) +def test__download_table_bqstorage_w_timeout_success(module_under_test): + from google.cloud.bigquery import dataset + from google.cloud.bigquery import table + from unittest import mock + + mock_bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + fake_session = mock.Mock(streams=["stream/s0"]) + mock_bqstorage_client.create_read_session.return_value = fake_session + + table_ref = table.TableReference( + dataset.DatasetReference("project-x", "dataset-y"), + "table-z", + ) + + def fast_download_stream( + download_state, bqstorage_client, session, stream, worker_queue, page_to_item + ): + worker_queue.put("result_page") + + with mock.patch.object( + module_under_test, "_download_table_bqstorage_stream", new=fast_download_stream + ): + # Use a generous timeout + result_gen = module_under_test._download_table_bqstorage( + "some-project", table_ref, mock_bqstorage_client, timeout=10.0 + ) + results = list(result_gen) + + assert results == ["result_page"] + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.parametrize( + "sleep_time, timeout, should_timeout", + [ + (0.1, 0.05, True), # Timeout case + (0, 10.0, False), # Success case + ], +) +def test_download_arrow_row_iterator_with_timeout( + module_under_test, sleep_time, timeout, should_timeout +): + bq_schema 
= [schema.SchemaField("name", "STRING")] + + # Mock page with to_arrow method + mock_page = mock.Mock() + mock_page.to_arrow.return_value = pyarrow.RecordBatch.from_arrays( + [pyarrow.array(["foo"])], + names=["name"], + ) + mock_page.__iter__ = lambda self: iter(["row1"]) + mock_page._columns = [["foo"]] + + def pages_gen(): + # First page yields quickly + yield mock_page + if sleep_time > 0: + time.sleep(sleep_time) + yield mock_page + + iterator = module_under_test.download_arrow_row_iterator( + pages_gen(), bq_schema, timeout=timeout + ) + + # First item should always succeed + next(iterator) + + if should_timeout: + with pytest.raises(concurrent.futures.TimeoutError): + next(iterator) + else: + # Should succeed and complete + results = list(iterator) + assert len(results) == 1 # 1 remaining item + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.parametrize( + "sleep_time, timeout, should_timeout", + [ + (0.1, 0.05, True), # Timeout case + (0, 10.0, False), # Success case + ], +) +def test_download_dataframe_row_iterator_with_timeout( + module_under_test, sleep_time, timeout, should_timeout +): + bq_schema = [schema.SchemaField("name", "STRING")] + dtypes = {} + + # Mock page + mock_page = mock.Mock() + # Mock iterator for _row_iterator_page_to_dataframe checking next(iter(page)) + mock_page.__iter__ = lambda self: iter(["row1"]) + mock_page._columns = [["foo"]] + + def pages_gen(): + yield mock_page + if sleep_time > 0: + time.sleep(sleep_time) + yield mock_page + + iterator = module_under_test.download_dataframe_row_iterator( + pages_gen(), bq_schema, dtypes, timeout=timeout + ) + + next(iterator) + + if should_timeout: + with pytest.raises(concurrent.futures.TimeoutError): + next(iterator) + else: + results = list(iterator) + assert len(results) == 1 + + +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires 
`google-cloud-bigquery-storage`" +) +def test_download_arrow_bqstorage_passes_timeout_to_create_read_session( + module_under_test, +): + # Mock dependencies + project_id = "test-project" + table = mock.Mock() + table.table_id = "test_table" + table.to_bqstorage.return_value = "projects/test/datasets/test/tables/test" + + bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + # Mock create_read_session to return a session with no streams so the function returns early + # (Checking start of loop logic vs empty streams return) + session = mock.Mock() + # If streams is empty, _download_table_bqstorage returns early, which is fine for this test + session.streams = [] + bqstorage_client.create_read_session.return_value = session + + # Call the function + timeout = 123.456 + # download_arrow_bqstorage yields frames, so we need to iterate to trigger execution + list( + module_under_test.download_arrow_bqstorage( + project_id, table, bqstorage_client, timeout=timeout + ) + ) + + # Verify timeout and retry were passed + bqstorage_client.create_read_session.assert_called_once() + _, kwargs = bqstorage_client.create_read_session.call_args + assert "timeout" in kwargs + assert kwargs["timeout"] == timeout + + assert "retry" in kwargs + retry_policy = kwargs["retry"] + assert retry_policy is not None + # Check if deadline is set correctly in the retry policy + assert retry_policy._deadline == timeout diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py index f0a872c88..c12a526de 100644 --- a/tests/unit/test__pyarrow_helpers.py +++ b/tests/unit/test__pyarrow_helpers.py @@ -14,7 +14,7 @@ import pytest - +numpy = pytest.importorskip("numpy") pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0") @@ -27,8 +27,16 @@ def module_under_test(): def test_bq_to_arrow_scalars(module_under_test): assert ( - module_under_test.bq_to_arrow_scalars("BIGNUMERIC") - == module_under_test.pyarrow_bignumeric + 
module_under_test.bq_to_arrow_scalars("BIGNUMERIC")() + == module_under_test.pyarrow_bignumeric() + ) + assert ( + # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0), + # but we'd like this to map as closely to the BQ Storage API as + # possible, which uses the string() dtype, as JSON support in Arrow + # predates JSON support in BigQuery by several years. + module_under_test.bq_to_arrow_scalars("JSON")() + == pyarrow.string() ) assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py index b1d0ef1ac..8379c87c1 100644 --- a/tests/unit/test__versions_helpers.py +++ b/tests/unit/test__versions_helpers.py @@ -188,14 +188,19 @@ def test_bqstorage_is_read_session_optional_false(): @pytest.mark.skipif(pandas is None, reason="pandas is not installed") -@pytest.mark.parametrize("version", ["1.5.0", "2.0.0", "2.1.0"]) +@pytest.mark.parametrize("version", ["1.1.5", "2.0.0", "2.1.0"]) def test_try_import_raises_no_error_w_recent_pandas(version): + # Comparing against the minimum allowed pandas version. + # As long as the installed version is greater than that, no + # error is raised. versions = _versions_helpers.PandasVersions() with mock.patch("pandas.__version__", new=version): try: pandas = versions.try_import(raise_if_error=True) assert pandas is not None - except exceptions.LegacyPandasError: # pragma: NO COVER + # this exception should not fire unless there is something broken + # hence the pragma. 
+ except exceptions.LegacyPandasError: # pragma: no cover raise ("Legacy error raised with a non-legacy dependency version.") diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index cd336b73f..1c4a9badb 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -16,7 +16,6 @@ import collections import datetime import decimal -import email import gzip import http.client import io @@ -28,9 +27,9 @@ from unittest import mock import warnings -import requests import packaging import pytest +import requests try: @@ -55,7 +54,8 @@ import google.cloud._helpers from google.cloud import bigquery -from google.cloud.bigquery.dataset import DatasetReference +from google.cloud.bigquery.dataset import DatasetReference, Dataset +from google.cloud.bigquery.enums import UpdateMode, DatasetView, TimestampPrecision from google.cloud.bigquery import exceptions from google.cloud.bigquery import ParquetOptions import google.cloud.bigquery.retry @@ -208,6 +208,17 @@ def test_ctor_w_client_options_universe(self): ) self.assertEqual(client._connection.API_BASE_URL, "https://bigquery.foo.com") + def test_ctor_w_job_creation_mode(self): + creds = _make_credentials() + http = object() + client = self._make_one( + project=self.PROJECT, + credentials=creds, + _http=http, + default_job_creation_mode="foo", + ) + self.assertEqual(client.default_job_creation_mode, "foo") + def test_ctor_w_location(self): from google.cloud.bigquery._http import Connection @@ -296,31 +307,6 @@ def test__call_api_extra_headers(self): headers = kwargs["headers"] assert headers["x-goog-request-reason"] == "because-friday" - def test__call_api_applying_custom_retry_on_timeout(self): - from concurrent.futures import TimeoutError - from google.cloud.bigquery.retry import DEFAULT_RETRY - - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - - api_request_patcher = mock.patch.object( - client._connection, - "api_request", - 
side_effect=[TimeoutError, "result"], - ) - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, TimeoutError) - ) - - with api_request_patcher as fake_api_request: - result = client._call_api(retry, foo="bar") - - self.assertEqual(result, "result") - self.assertEqual( - fake_api_request.call_args_list, - [mock.call(foo="bar"), mock.call(foo="bar")], # was retried once - ) - def test__call_api_span_creator_not_called(self): from concurrent.futures import TimeoutError from google.cloud.bigquery.retry import DEFAULT_RETRY @@ -627,48 +613,6 @@ def test_get_service_account_email_w_alternate_project(self): ) self.assertEqual(service_account_email, email) - def test_get_service_account_email_w_custom_retry(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - api_path = "/projects/{}/serviceAccount".format(self.PROJECT) - creds = _make_credentials() - http = object() - client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - - resource = { - "kind": "bigquery#getServiceAccountResponse", - "email": "bq-123@bigquery-encryption.iam.gserviceaccount.com", - } - api_request_patcher = mock.patch.object( - client._connection, - "api_request", - side_effect=[ValueError, resource], - ) - - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, ValueError) - ) - - with api_request_patcher as fake_api_request: - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - service_account_email = client.get_service_account_email( - retry=retry, timeout=7.5 - ) - - final_attributes.assert_called_once_with({"path": api_path}, client, None) - self.assertEqual( - service_account_email, "bq-123@bigquery-encryption.iam.gserviceaccount.com" - ) - self.assertEqual( - fake_api_request.call_args_list, - [ - mock.call(method="GET", path=api_path, timeout=7.5), - mock.call(method="GET", path=api_path, timeout=7.5), # was retried once - ], - ) 
- def test_dataset_with_specified_project(self): from google.cloud.bigquery.dataset import DatasetReference @@ -736,7 +680,7 @@ def test_get_dataset(self): final_attributes.assert_called_once_with({"path": "/%s" % path}, client, None) conn.api_request.assert_called_once_with( - method="GET", path="/%s" % path, timeout=7.5 + method="GET", path="/%s" % path, timeout=7.5, query_params={} ) self.assertEqual(dataset.dataset_id, self.DS_ID) @@ -802,6 +746,72 @@ def test_get_dataset(self): self.assertEqual(dataset.dataset_id, self.DS_ID) + def test_get_dataset_with_dataset_view(self): + path = "projects/%s/datasets/%s" % (self.PROJECT, self.DS_ID) + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + resource = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + } + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + + test_cases = [ + (None, None), + (DatasetView.DATASET_VIEW_UNSPECIFIED, "DATASET_VIEW_UNSPECIFIED"), + (DatasetView.METADATA, "METADATA"), + (DatasetView.ACL, "ACL"), + (DatasetView.FULL, "FULL"), + ] + + for dataset_view_arg, expected_param_value in test_cases: + with self.subTest( + dataset_view_arg=dataset_view_arg, + expected_param_value=expected_param_value, + ): + # Re-initialize the connection mock for each sub-test to reset side_effect + conn = client._connection = make_connection(resource) + + dataset = client.get_dataset(dataset_ref, dataset_view=dataset_view_arg) + + self.assertEqual(dataset.dataset_id, self.DS_ID) + + if expected_param_value: + expected_query_params = {"datasetView": expected_param_value} + else: + expected_query_params = {} + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % path, + timeout=DEFAULT_TIMEOUT, + query_params=expected_query_params if expected_query_params else {}, + ) + + def test_get_dataset_with_invalid_dataset_view(self): + 
invalid_view_values = [ + "INVALID_STRING", + 123, + 123.45, + object(), + ] + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + resource = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + } + conn = client._connection = make_connection(resource) + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + + for invalid_view_value in invalid_view_values: + with self.subTest(invalid_view_value=invalid_view_value): + conn.api_request.reset_mock() # Reset mock for each sub-test + with self.assertRaises(AttributeError): + client.get_dataset(dataset_ref, dataset_view=invalid_view_value) + def test_ensure_bqstorage_client_creating_new_instance(self): bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") @@ -2028,6 +2038,7 @@ def test_update_dataset(self): LABELS = {"priority": "high"} ACCESS = [{"role": "OWNER", "userByEmail": "phred@example.com"}] EXP = 17 + RESOURCE_TAGS = {"123456789012/key": "value"} RESOURCE = { "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, "etag": "etag", @@ -2037,6 +2048,7 @@ def test_update_dataset(self): "defaultTableExpirationMs": EXP, "labels": LABELS, "access": ACCESS, + "resourceTags": RESOURCE_TAGS, } creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) @@ -2048,12 +2060,14 @@ def test_update_dataset(self): ds.default_table_expiration_ms = EXP ds.labels = LABELS ds.access_entries = [AccessEntry("OWNER", "userByEmail", "phred@example.com")] - fields = [ + ds.resource_tags = RESOURCE_TAGS + filter_fields = [ "description", "friendly_name", "location", "labels", "access_entries", + "resource_tags", ] with mock.patch( @@ -2061,12 +2075,12 @@ def test_update_dataset(self): ) as final_attributes: ds2 = client.update_dataset( ds, - fields=fields, + fields=filter_fields, timeout=7.5, ) 
final_attributes.assert_called_once_with( - {"path": "/%s" % PATH, "fields": fields}, client, None + {"path": "/%s" % PATH, "fields": filter_fields}, client, None ) conn.api_request.assert_called_once_with( @@ -2077,21 +2091,112 @@ def test_update_dataset(self): "location": LOCATION, "labels": LABELS, "access": ACCESS, + "resourceTags": RESOURCE_TAGS, }, path="/" + PATH, timeout=7.5, + query_params={}, ) self.assertEqual(ds2.description, ds.description) self.assertEqual(ds2.friendly_name, ds.friendly_name) self.assertEqual(ds2.location, ds.location) self.assertEqual(ds2.labels, ds.labels) self.assertEqual(ds2.access_entries, ds.access_entries) + self.assertEqual(ds2.resource_tags, ds.resource_tags) # ETag becomes If-Match header. ds._properties["etag"] = "etag" client.update_dataset(ds, []) req = conn.api_request.call_args self.assertEqual(req[1]["headers"]["If-Match"], "etag") + self.assertEqual(req[1].get("query_params"), {}) + + def test_update_dataset_w_update_mode(self): + PATH = f"projects/{self.PROJECT}/datasets/{self.DS_ID}" + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + + DESCRIPTION = "DESCRIPTION" + RESOURCE = { + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + "etag": "etag", + "description": DESCRIPTION, + } + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + orig_dataset = Dataset(dataset_ref) + orig_dataset.description = DESCRIPTION + filter_fields = ["description"] + + test_cases = [ + (None, None), + (UpdateMode.UPDATE_MODE_UNSPECIFIED, "UPDATE_MODE_UNSPECIFIED"), + (UpdateMode.UPDATE_METADATA, "UPDATE_METADATA"), + (UpdateMode.UPDATE_ACL, "UPDATE_ACL"), + (UpdateMode.UPDATE_FULL, "UPDATE_FULL"), + ] + + for update_mode_arg, expected_param_value in test_cases: + with self.subTest( + update_mode_arg=update_mode_arg, + expected_param_value=expected_param_value, + ): + conn = client._connection = make_connection(RESOURCE, RESOURCE) + + new_dataset = 
client.update_dataset( + orig_dataset, + fields=filter_fields, + update_mode=update_mode_arg, + ) + self.assertEqual(orig_dataset.description, new_dataset.description) + + if expected_param_value: + expected_query_params = {"updateMode": expected_param_value} + else: + expected_query_params = {} + + conn.api_request.assert_called_once_with( + method="PATCH", + path="/" + PATH, + data={"description": DESCRIPTION}, + timeout=DEFAULT_TIMEOUT, + query_params=expected_query_params if expected_query_params else {}, + ) + + def test_update_dataset_w_invalid_update_mode(self): + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + + DESCRIPTION = "DESCRIPTION" + resource = { + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + "etag": "etag", + } + + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + orig_dataset = Dataset(dataset_ref) + orig_dataset.description = DESCRIPTION + filter_fields = ["description"] # A non-empty list of fields is required + + # Mock the connection to prevent actual API calls + # and to provide a minimal valid response if the call were to proceed. 
+ conn = client._connection = make_connection(resource) + + test_cases = [ + "INVALID_STRING", + 123, + 123.45, + object(), + ] + + for invalid_update_mode in test_cases: + with self.subTest(invalid_update_mode=invalid_update_mode): + conn.api_request.reset_mock() # Reset mock for each sub-test + with self.assertRaises(AttributeError): + client.update_dataset( + orig_dataset, + fields=filter_fields, + update_mode=invalid_update_mode, + ) def test_update_dataset_w_custom_property(self): # The library should handle sending properties to the API that are not @@ -2123,6 +2228,7 @@ def test_update_dataset_w_custom_property(self): data={"newAlphaProperty": "unreleased property"}, path=path, timeout=DEFAULT_TIMEOUT, + query_params={}, ) self.assertEqual(dataset.dataset_id, self.DS_ID) @@ -2314,6 +2420,7 @@ def test_update_table(self): "description": description, "friendlyName": title, "labels": {"x": "y"}, + "resourceTags": {"123456789012/key": "value"}, } ) schema = [ @@ -2337,7 +2444,8 @@ def test_update_table(self): table.description = description table.friendly_name = title table.labels = {"x": "y"} - fields = ["schema", "description", "friendly_name", "labels"] + table.resource_tags = {"123456789012/key": "value"} + fields = ["schema", "description", "friendly_name", "labels", "resource_tags"] with mock.patch( "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" ) as final_attributes: @@ -2369,14 +2477,16 @@ def test_update_table(self): "description": description, "friendlyName": title, "labels": {"x": "y"}, + "resourceTags": {"123456789012/key": "value"}, } conn.api_request.assert_called_once_with( - method="PATCH", data=sent, path="/" + path, timeout=7.5 + method="PATCH", data=sent, path="/" + path, timeout=7.5, query_params={} ) self.assertEqual(updated_table.description, table.description) self.assertEqual(updated_table.friendly_name, table.friendly_name) self.assertEqual(updated_table.schema, table.schema) 
self.assertEqual(updated_table.labels, table.labels) + self.assertEqual(updated_table.resource_tags, table.resource_tags) # ETag becomes If-Match header. table._properties["etag"] = "etag" @@ -2424,6 +2534,7 @@ def test_update_table_w_custom_property(self): path="/%s" % path, data={"newAlphaProperty": "unreleased property"}, timeout=DEFAULT_TIMEOUT, + query_params={}, ) self.assertEqual( updated_table._properties["newAlphaProperty"], "unreleased property" @@ -2460,6 +2571,7 @@ def test_update_table_only_use_legacy_sql(self): path="/%s" % path, data={"view": {"useLegacySql": True}}, timeout=DEFAULT_TIMEOUT, + query_params={}, ) self.assertEqual(updated_table.view_use_legacy_sql, table.view_use_legacy_sql) @@ -2552,9 +2664,10 @@ def test_update_table_w_query(self): "schema": schema_resource, }, timeout=DEFAULT_TIMEOUT, + query_params={}, ) - def test_update_table_w_schema_None(self): + def test_update_table_w_schema_None_autodetect_schema(self): # Simulate deleting schema: not sure if back-end will actually # allow this operation, but the spec says it is optional. 
path = "projects/%s/datasets/%s/tables/%s" % ( @@ -2596,7 +2709,9 @@ def test_update_table_w_schema_None(self): with mock.patch( "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" ) as final_attributes: - updated_table = client.update_table(table, ["schema"]) + updated_table = client.update_table( + table, ["schema"], autodetect_schema=True + ) final_attributes.assert_called_once_with( {"path": "/%s" % path, "fields": ["schema"]}, client, None @@ -2605,9 +2720,10 @@ def test_update_table_w_schema_None(self): self.assertEqual(len(conn.api_request.call_args_list), 2) req = conn.api_request.call_args_list[1] self.assertEqual(req[1]["method"], "PATCH") - sent = {"schema": None} + sent = {"schema": {"fields": None}} self.assertEqual(req[1]["data"], sent) self.assertEqual(req[1]["path"], "/%s" % path) + self.assertEqual(req[1]["query_params"], {"autodetect_schema": True}) self.assertEqual(len(updated_table.schema), 0) def test_update_table_delete_property(self): @@ -3659,176 +3775,6 @@ def test_load_table_from_uri_w_default_load_config(self): timeout=DEFAULT_TIMEOUT, ) - @staticmethod - def _mock_requests_response(status_code, headers, content=b""): - return mock.Mock( - content=content, - headers=headers, - status_code=status_code, - spec=["content", "headers", "status_code"], - ) - - def _mock_transport(self, status_code, headers, content=b""): - fake_transport = mock.Mock(spec=["request"]) - fake_response = self._mock_requests_response( - status_code, headers, content=content - ) - fake_transport.request.return_value = fake_response - return fake_transport - - def _initiate_resumable_upload_helper(self, num_retries=None, mtls=False): - from google.resumable_media.requests import ResumableUpload - from google.cloud.bigquery.client import _DEFAULT_CHUNKSIZE - from google.cloud.bigquery.client import _GENERIC_CONTENT_TYPE - from google.cloud.bigquery.client import _get_upload_headers - from google.cloud.bigquery.job import LoadJob - from 
google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.job import SourceFormat - - # Create mocks to be checked for doing transport. - resumable_url = "http://test.invalid?upload_id=hey-you" - response_headers = {"location": resumable_url} - fake_transport = self._mock_transport(http.client.OK, response_headers) - client = self._make_one(project=self.PROJECT, _http=fake_transport) - conn = client._connection = make_connection() - if mtls: - conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") - - # Create some mock arguments and call the method under test. - data = b"goodbye gudbi gootbee" - stream = io.BytesIO(data) - config = LoadJobConfig() - config.source_format = SourceFormat.CSV - job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) - metadata = job.to_api_repr() - upload, transport = client._initiate_resumable_upload( - stream, metadata, num_retries, None - ) - - # Check the returned values. - self.assertIsInstance(upload, ResumableUpload) - - host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" - upload_url = ( - f"{host_name}/upload/bigquery/v2/projects/{self.PROJECT}" - "/jobs?uploadType=resumable" - ) - self.assertEqual(upload.upload_url, upload_url) - expected_headers = _get_upload_headers(conn.user_agent) - self.assertEqual(upload._headers, expected_headers) - self.assertFalse(upload.finished) - self.assertEqual(upload._chunk_size, _DEFAULT_CHUNKSIZE) - self.assertIs(upload._stream, stream) - self.assertIsNone(upload._total_bytes) - self.assertEqual(upload._content_type, _GENERIC_CONTENT_TYPE) - self.assertEqual(upload.resumable_url, resumable_url) - - retry_strategy = upload._retry_strategy - self.assertEqual(retry_strategy.max_sleep, 64.0) - if num_retries is None: - self.assertEqual(retry_strategy.max_cumulative_retry, 600.0) - self.assertIsNone(retry_strategy.max_retries) - else: - self.assertIsNone(retry_strategy.max_cumulative_retry) - 
self.assertEqual(retry_strategy.max_retries, num_retries) - self.assertIs(transport, fake_transport) - # Make sure we never read from the stream. - self.assertEqual(stream.tell(), 0) - - # Check the mocks. - request_headers = expected_headers.copy() - request_headers["x-upload-content-type"] = _GENERIC_CONTENT_TYPE - fake_transport.request.assert_called_once_with( - "POST", - upload_url, - data=json.dumps(metadata).encode("utf-8"), - headers=request_headers, - timeout=mock.ANY, - ) - - def test__initiate_resumable_upload(self): - self._initiate_resumable_upload_helper() - - def test__initiate_resumable_upload_mtls(self): - self._initiate_resumable_upload_helper(mtls=True) - - def test__initiate_resumable_upload_with_retry(self): - self._initiate_resumable_upload_helper(num_retries=11) - - def _do_multipart_upload_success_helper( - self, get_boundary, num_retries=None, project=None, mtls=False - ): - from google.cloud.bigquery.client import _get_upload_headers - from google.cloud.bigquery.job import LoadJob - from google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.job import SourceFormat - - fake_transport = self._mock_transport(http.client.OK, {}) - client = self._make_one(project=self.PROJECT, _http=fake_transport) - conn = client._connection = make_connection() - if mtls: - conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") - - if project is None: - project = self.PROJECT - - # Create some mock arguments. - data = b"Bzzzz-zap \x00\x01\xf4" - stream = io.BytesIO(data) - config = LoadJobConfig() - config.source_format = SourceFormat.CSV - job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) - metadata = job.to_api_repr() - size = len(data) - - response = client._do_multipart_upload( - stream, metadata, size, num_retries, None, project=project - ) - - # Check the mocks and the returned value. 
- self.assertIs(response, fake_transport.request.return_value) - self.assertEqual(stream.tell(), size) - get_boundary.assert_called_once_with() - - host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" - upload_url = ( - f"{host_name}/upload/bigquery/v2/projects/{project}" - "/jobs?uploadType=multipart" - ) - payload = ( - b"--==0==\r\n" - b"content-type: application/json; charset=UTF-8\r\n\r\n" - b"%(json_metadata)s" - b"\r\n" - b"--==0==\r\n" - b"content-type: */*\r\n\r\n" - b"%(data)s" - b"\r\n" - b"--==0==--" - ) % {b"json_metadata": json.dumps(metadata).encode("utf-8"), b"data": data} - - headers = _get_upload_headers(conn.user_agent) - headers["content-type"] = b'multipart/related; boundary="==0=="' - fake_transport.request.assert_called_once_with( - "POST", upload_url, data=payload, headers=headers, timeout=mock.ANY - ) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload_mtls(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary, mtls=True) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload_with_retry(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary, num_retries=8) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload_with_custom_project(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary, project="custom-project") - def test_copy_table(self): from google.cloud.bigquery.job import CopyJob @@ -4698,7 +4644,7 @@ def test_query_w_api_method_query_and_job_id_fails(self): client._connection = make_connection({}) with self.assertRaises(TypeError) as exc: - client.query(query, 
job_id="abcd", api_method="QUERY") + client.query(query, job_id="abcd", api_method="QUERY", job_retry=None) self.assertIn( "`job_id` was provided, but the 'QUERY' `api_method` was requested", exc.exception.args[0], @@ -4753,7 +4699,11 @@ def test_query_w_explicit_project(self): conn = client._connection = make_connection(resource) client.query( - query, job_id=job_id, project="other-project", location=self.LOCATION + query, + job_id=job_id, + project="other-project", + location=self.LOCATION, + job_retry=None, ) # Check that query actually starts the job. @@ -4812,7 +4762,11 @@ def test_query_w_explicit_job_config(self): original_config_copy = copy.deepcopy(job_config) client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) # Check that query actually starts the job. @@ -4863,7 +4817,11 @@ def test_query_preserving_explicit_job_config(self): original_config_copy = copy.deepcopy(job_config) client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) # Check that query actually starts the job. @@ -4919,7 +4877,13 @@ def test_query_preserving_explicit_default_job_config(self): ) conn = client._connection = make_connection(resource) - client.query(query, job_id=job_id, location=self.LOCATION, job_config=None) + client.query( + query, + job_id=job_id, + location=self.LOCATION, + job_config=None, + job_retry=None, + ) # Check that query actually starts the job. 
conn.api_request.assert_called_once_with( @@ -4957,7 +4921,11 @@ def test_query_w_invalid_job_config(self): with self.assertRaises(TypeError) as exc: client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) self.assertIn("Expected an instance of QueryJobConfig", exc.exception.args[0]) @@ -5006,7 +4974,11 @@ def test_query_w_explicit_job_config_override(self): job_config.default_dataset = None client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) # Check that query actually starts the job. @@ -5051,7 +5023,7 @@ def test_query_w_client_default_config_no_incoming(self): ) conn = client._connection = make_connection(resource) - client.query(query, job_id=job_id, location=self.LOCATION) + client.query(query, job_id=job_id, location=self.LOCATION, job_retry=None) # Check that query actually starts the job. conn.api_request.assert_called_once_with( @@ -5093,7 +5065,7 @@ def test_query_w_client_location(self): ) conn = client._connection = make_connection(resource) - client.query(query, job_id=job_id, project="other-project") + client.query(query, job_id=job_id, project="other-project", job_retry=None) # Check that query actually starts the job. 
conn.api_request.assert_called_once_with( @@ -5157,7 +5129,7 @@ def test_query_w_udf_resources(self): config.udf_resources = udf_resources config.use_legacy_sql = True - job = client.query(QUERY, job_config=config, job_id=JOB) + job = client.query(QUERY, job_config=config, job_id=JOB, job_retry=None) self.assertIsInstance(job, QueryJob) self.assertIs(job._client, client) @@ -5213,7 +5185,7 @@ def test_query_w_query_parameters(self): config = QueryJobConfig() config.query_parameters = query_parameters - job = client.query(QUERY, job_config=config, job_id=JOB) + job = client.query(QUERY, job_config=config, job_id=JOB, job_retry=None) self.assertIsInstance(job, QueryJob) self.assertIs(job._client, client) @@ -5242,6 +5214,56 @@ def test_query_w_query_parameters(self): }, ) + def test_query_pico_timestamp(self): + query = "select *;" + response = { + "jobReference": { + "projectId": self.PROJECT, + "location": "EU", + "jobId": "abcd", + }, + } + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection(response) + + client.query( + query, + location="EU", + api_method="QUERY", + timestamp_precision=TimestampPrecision.PICOSECOND, + ) + + # Check that query actually starts the job. 
+ expected_resource = { + "query": query, + "useLegacySql": False, + "location": "EU", + "formatOptions": {"timestampOutputFormat": "ISO8601_STRING"}, + "requestId": mock.ANY, + } + conn.api_request.assert_called_once_with( + method="POST", + path=f"/projects/{self.PROJECT}/queries", + data=expected_resource, + timeout=None, + ) + + def test_query_pico_timestamp_insert_error(self): + query = "select *;" + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + with pytest.raises(ValueError, match="Picosecond Timestamp is only"): + client.query( + query, + location="EU", + api_method="INSERT", + timestamp_precision=TimestampPrecision.PICOSECOND, + ) + def test_query_job_rpc_fail_w_random_error(self): from google.api_core.exceptions import Unknown from google.cloud.bigquery.job import QueryJob @@ -5256,7 +5278,7 @@ def test_query_job_rpc_fail_w_random_error(self): ) with job_begin_patcher: with pytest.raises(Unknown, match="Not sure what went wrong."): - client.query("SELECT 1;", job_id="123") + client.query("SELECT 1;", job_id="123", job_retry=None) def test_query_job_rpc_fail_w_conflict_job_id_given(self): from google.api_core.exceptions import Conflict @@ -5272,7 +5294,7 @@ def test_query_job_rpc_fail_w_conflict_job_id_given(self): ) with job_begin_patcher: with pytest.raises(Conflict, match="Job already exists."): - client.query("SELECT 1;", job_id="123") + client.query("SELECT 1;", job_id="123", job_retry=None) def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_fails(self): from google.api_core.exceptions import Conflict @@ -5298,6 +5320,36 @@ def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_fails(self): with pytest.raises(DataLoss, match="we lost your job, sorry"): client.query("SELECT 1;", job_id=None) + def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_fails_no_retries(self): + from google.api_core.exceptions import Conflict + from google.api_core.exceptions 
import DataLoss + from google.cloud.bigquery.job import QueryJob + + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + job_create_error = Conflict("Job already exists.") + job_begin_patcher = mock.patch.object( + QueryJob, "_begin", side_effect=job_create_error + ) + get_job_patcher = mock.patch.object( + client, "get_job", side_effect=DataLoss("we lost your job, sorry") + ) + + with job_begin_patcher, get_job_patcher: + # If get job request fails but supposedly there does exist a job + # with this ID already, raise the exception explaining why we + # couldn't recover the job. + with pytest.raises(DataLoss, match="we lost your job, sorry"): + client.query( + "SELECT 1;", + job_id=None, + # Explicitly test with no retries to make sure those branches are covered. + retry=None, + job_retry=None, + ) + def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_succeeds(self): from google.api_core.exceptions import Conflict from google.cloud.bigquery.job import QueryJob @@ -5335,6 +5387,11 @@ def test_query_and_wait_defaults(self): "totalRows": "1", "rows": [{"f": [{"v": "5552452"}]}], "queryId": "job_abcDEF_", + "totalBytesProcessed": 1234, + "totalSlotMs": 5678, + "creationTime": "1437767599006", + "startTime": "1437767600007", + "endTime": "1437767601008", } creds = _make_credentials() http = object() @@ -5350,6 +5407,12 @@ def test_query_and_wait_defaults(self): self.assertIsNone(rows.job_id) self.assertIsNone(rows.project) self.assertIsNone(rows.location) + self.assertEqual(rows.query, query) + self.assertEqual(rows.total_bytes_processed, 1234) + self.assertEqual(rows.slot_millis, 5678) + self.assertEqual(rows.created.timestamp() * 1000, 1437767599006) + self.assertEqual(rows.started.timestamp() * 1000, 1437767600007) + self.assertEqual(rows.ended.timestamp() * 1000, 1437767601008) # Verify the request we send is to jobs.query. 
conn.api_request.assert_called_once() @@ -5636,7 +5699,7 @@ def test_insert_rows_w_schema(self): from google.cloud.bigquery.schema import SchemaField WHEN_TS = 1437767599.006 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS).replace(tzinfo=UTC) + WHEN = datetime.datetime.fromtimestamp(WHEN_TS, UTC).replace(tzinfo=UTC) PATH = "projects/%s/datasets/%s/tables/%s/insertAll" % ( self.PROJECT, self.DS_ID, @@ -5697,7 +5760,7 @@ def test_insert_rows_w_list_of_dictionaries(self): from google.cloud.bigquery.table import Table WHEN_TS = 1437767599.006 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS).replace(tzinfo=UTC) + WHEN = datetime.datetime.fromtimestamp(WHEN_TS, UTC).replace(tzinfo=UTC) PATH = "projects/%s/datasets/%s/tables/%s/insertAll" % ( self.PROJECT, self.DS_ID, @@ -5880,6 +5943,7 @@ def _row_data(row): ) def test_insert_rows_w_repeated_fields(self): + from google.cloud._helpers import UTC from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -5909,12 +5973,8 @@ def test_insert_rows_w_repeated_fields(self): ( 12, [ - datetime.datetime( - 2018, 12, 1, 12, 0, 0, tzinfo=datetime.timezone.utc - ), - datetime.datetime( - 2018, 12, 1, 13, 0, 0, tzinfo=datetime.timezone.utc - ), + datetime.datetime(2018, 12, 1, 12, 0, 0, tzinfo=UTC), + datetime.datetime(2018, 12, 1, 13, 0, 0, tzinfo=UTC), ], [1.25, 2.5], ), @@ -6749,7 +6809,9 @@ def test_list_rows(self): ) WHEN_TS = 1437767599006000 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS / 1e6).replace(tzinfo=UTC) + WHEN = datetime.datetime.fromtimestamp( + WHEN_TS / 1e6, datetime.timezone.utc + ).replace(tzinfo=UTC) WHEN_1 = WHEN + datetime.timedelta(microseconds=1) WHEN_2 = WHEN + datetime.timedelta(microseconds=2) ROWS = 1234 @@ -6805,6 +6867,39 @@ def test_list_rows(self): timeout=7.5, ) + def test_list_rows_pico_timestamp(self): + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import Table + + PATH = 
"projects/%s/datasets/%s/tables/%s/data" % ( + self.PROJECT, + self.DS_ID, + self.TABLE_ID, + ) + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection({}, {}) + pico_col = SchemaField( + "full_name", + "TIMESTAMP", + mode="REQUIRED", + timestamp_precision=TimestampPrecision.PICOSECOND, + ) + table = Table(self.TABLE_REF, schema=[pico_col]) + + iterator = client.list_rows( + table, timestamp_precision=TimestampPrecision.PICOSECOND + ) + next(iterator.pages) + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % PATH, + query_params={"formatOptions.timestampOutputFormat": "ISO8601_STRING"}, + timeout=None, + ) + def test_list_rows_w_start_index_w_page_size(self): from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -8381,8 +8476,12 @@ def test_load_table_from_dataframe_w_automatic_schema_detection_fails(self): autospec=True, side_effect=google.api_core.exceptions.NotFound("Table not found"), ) + pandas_gbq_patch = mock.patch( + "google.cloud.bigquery._pandas_helpers.pandas_gbq", + new=None, + ) - with load_patch as load_table_from_file, get_table_patch: + with load_patch as load_table_from_file, get_table_patch, pandas_gbq_patch: with warnings.catch_warnings(record=True) as warned: client.load_table_from_dataframe( dataframe, self.TABLE_REF, location=self.LOCATION @@ -8438,7 +8537,6 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self): load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) - get_table_patch = mock.patch( "google.cloud.bigquery.client.Client.get_table", autospec=True, @@ -8450,6 +8548,7 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self): ] ), ) + with load_patch as load_table_from_file, get_table_patch: client.load_table_from_dataframe( dataframe, self.TABLE_REF, location=self.LOCATION @@ 
-8570,10 +8669,10 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se client = self._make_client() dataframe = pandas.DataFrame({"x": [1, 2, None, 4]}, dtype="Int64") + load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) - get_table_patch = mock.patch( "google.cloud.bigquery.client.Client.get_table", autospec=True, @@ -8602,8 +8701,11 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se sent_config = load_table_from_file.mock_calls[0][2]["job_config"] assert sent_config.source_format == job.SourceFormat.PARQUET - assert tuple(sent_config.schema) == ( - SchemaField("x", "INT64", "NULLABLE", None), + assert ( + # Accept either the GoogleSQL or legacy SQL type name from pandas-gbq. + tuple(sent_config.schema) == (SchemaField("x", "INT64", "NULLABLE", None),) + or tuple(sent_config.schema) + == (SchemaField("x", "INTEGER", "NULLABLE", None),) ) def test_load_table_from_dataframe_struct_fields(self): @@ -8749,7 +8851,7 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self): data=records, columns=["float_column", "array_column"] ) - expected_schema = [ + expected_schema_googlesql = [ SchemaField("float_column", "FLOAT"), SchemaField( "array_column", @@ -8757,6 +8859,14 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self): mode="REPEATED", ), ] + expected_schema_legacy_sql = [ + SchemaField("float_column", "FLOAT"), + SchemaField( + "array_column", + "INTEGER", + mode="REPEATED", + ), + ] load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True @@ -8792,7 +8902,10 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self): sent_config = load_table_from_file.mock_calls[0][2]["job_config"] assert sent_config.source_format == job.SourceFormat.PARQUET - assert sent_config.schema == expected_schema + assert ( + sent_config.schema == expected_schema_googlesql + or 
sent_config.schema == expected_schema_legacy_sql + ) def test_load_table_from_dataframe_w_partial_schema(self): pandas = pytest.importorskip("pandas") @@ -8912,7 +9025,6 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self): load_table_from_file.assert_not_called() message = str(exc_context.value) - assert "bq_schema contains fields not present in dataframe" in message assert "unknown_col" in message def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): @@ -9623,213 +9735,6 @@ def test_load_table_from_json_unicode_emoji_data_case(self): assert sent_data_file.getvalue() == expected_bytes # Low-level tests - - @classmethod - def _make_resumable_upload_responses(cls, size): - """Make a series of responses for a successful resumable upload.""" - from google import resumable_media - - resumable_url = "http://test.invalid?upload_id=and-then-there-was-1" - initial_response = cls._make_response( - http.client.OK, "", {"location": resumable_url} - ) - data_response = cls._make_response( - resumable_media.PERMANENT_REDIRECT, - "", - {"range": "bytes=0-{:d}".format(size - 1)}, - ) - final_response = cls._make_response( - http.client.OK, - json.dumps({"size": size}), - {"Content-Type": "application/json"}, - ) - return [initial_response, data_response, final_response] - - @staticmethod - def _make_transport(responses=None): - import google.auth.transport.requests - - transport = mock.create_autospec( - google.auth.transport.requests.AuthorizedSession, instance=True - ) - transport.request.side_effect = responses - return transport - - def test__do_resumable_upload(self): - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - transport = self._make_transport( - self._make_resumable_upload_responses(file_obj_len) - ) - client = self._make_client(transport) - - result = client._do_resumable_upload( - file_obj, self.EXPECTED_CONFIGURATION, None, None - ) - - content = result.content.decode("utf-8") - assert 
json.loads(content) == {"size": file_obj_len} - - # Verify that configuration data was passed in with the initial - # request. - transport.request.assert_any_call( - "POST", - mock.ANY, - data=json.dumps(self.EXPECTED_CONFIGURATION).encode("utf-8"), - headers=mock.ANY, - timeout=mock.ANY, - ) - - def test__do_resumable_upload_custom_project(self): - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - transport = self._make_transport( - self._make_resumable_upload_responses(file_obj_len) - ) - client = self._make_client(transport) - - result = client._do_resumable_upload( - file_obj, - self.EXPECTED_CONFIGURATION, - None, - None, - project="custom-project", - ) - - content = result.content.decode("utf-8") - assert json.loads(content) == {"size": file_obj_len} - - # Verify that configuration data was passed in with the initial - # request. - transport.request.assert_any_call( - "POST", - mock.ANY, - data=json.dumps(self.EXPECTED_CONFIGURATION).encode("utf-8"), - headers=mock.ANY, - timeout=mock.ANY, - ) - - initiation_url = next( - ( - call[0][1] - for call in transport.request.call_args_list - if call[0][0] == "POST" and "uploadType=resumable" in call[0][1] - ), - None, - ) # pragma: NO COVER - - assert initiation_url is not None - assert "projects/custom-project" in initiation_url - - def test__do_resumable_upload_custom_timeout(self): - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - transport = self._make_transport( - self._make_resumable_upload_responses(file_obj_len) - ) - client = self._make_client(transport) - - client._do_resumable_upload( - file_obj, self.EXPECTED_CONFIGURATION, num_retries=0, timeout=3.14 - ) - - # The timeout should be applied to all underlying calls. 
- for call_args in transport.request.call_args_list: - assert call_args[1].get("timeout") == 3.14 - - def test__do_multipart_upload(self): - transport = self._make_transport([self._make_response(http.client.OK)]) - client = self._make_client(transport) - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - - client._do_multipart_upload( - file_obj, self.EXPECTED_CONFIGURATION, file_obj_len, None, None - ) - - # Verify that configuration data was passed in with the initial - # request. - request_args = transport.request.mock_calls[0][2] - request_data = request_args["data"].decode("utf-8") - request_headers = request_args["headers"] - - request_content = email.message_from_string( - "Content-Type: {}\r\n{}".format( - request_headers["content-type"].decode("utf-8"), request_data - ) - ) - - # There should be two payloads: the configuration and the binary daya. - configuration_data = request_content.get_payload(0).get_payload() - binary_data = request_content.get_payload(1).get_payload() - - assert json.loads(configuration_data) == self.EXPECTED_CONFIGURATION - assert binary_data.encode("utf-8") == file_obj.getvalue() - - def test__do_multipart_upload_wrong_size(self): - client = self._make_client() - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - - with pytest.raises(ValueError): - client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None, None) - - def test_schema_from_json_with_file_path(self): - from google.cloud.bigquery.schema import SchemaField - - file_content = """[ - { - "description": "quarter", - "mode": "REQUIRED", - "name": "qtr", - "type": "STRING" - }, - { - "description": "sales representative", - "mode": "NULLABLE", - "name": "rep", - "type": "STRING" - }, - { - "description": "total sales", - "mode": "NULLABLE", - "name": "sales", - "type": "FLOAT" - } - ]""" - - expected = [ - SchemaField("qtr", "STRING", "REQUIRED", description="quarter"), - SchemaField( - "rep", - "STRING", - 
"NULLABLE", - description="sales representative", - ), - SchemaField( - "sales", - "FLOAT", - "NULLABLE", - description="total sales", - ), - ] - - client = self._make_client() - mock_file_path = "/mocked/file.json" - - open_patch = mock.patch( - "builtins.open", new=mock.mock_open(read_data=file_content) - ) - - with open_patch as _mock_file: - actual = client.schema_from_json(mock_file_path) - _mock_file.assert_called_once_with(mock_file_path) - # This assert is to make sure __exit__ is called in the context - # manager that opens the file in the function - _mock_file().__exit__.assert_called_once() - - assert expected == actual - def test_schema_from_json_with_file_object(self): from google.cloud.bigquery.schema import SchemaField diff --git a/tests/unit/test_client_bigframes.py b/tests/unit/test_client_bigframes.py new file mode 100644 index 000000000..0260da5e4 --- /dev/null +++ b/tests/unit/test_client_bigframes.py @@ -0,0 +1,411 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Client features enabling the bigframes integration.""" + +from __future__ import annotations + +import datetime +from unittest import mock + +import pytest + +import google.auth.credentials +from google.api_core import exceptions +from google.cloud import bigquery +import google.cloud.bigquery.client +from google.cloud.bigquery import _job_helpers + + +PROJECT = "test-project" +LOCATION = "test-location" + + +def make_response(body, *, status_code: int = 200): + response = mock.Mock() + type(response).status_code = mock.PropertyMock(return_value=status_code) + response.json.return_value = body + return response + + +@pytest.fixture +def client(): + """A real client object with mocked API requests.""" + credentials = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + http_session = mock.Mock() + return google.cloud.bigquery.client.Client( + project=PROJECT, + credentials=credentials, + _http=http_session, + location=LOCATION, + ) + + +def test_query_and_wait_bigframes_dry_run_no_callback(client): + client._http.request.side_effect = [ + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query + "location": LOCATION, + "queryId": "abcdefg", + "totalBytesProcessed": "123", + "jobComplete": True, + } + ), + ] + callback = mock.Mock() + job_config = bigquery.QueryJobConfig(dry_run=True) + response = client._query_and_wait_bigframes( + query="SELECT 1", job_config=job_config, callback=callback + ) + callback.assert_not_called() + assert response.total_bytes_processed == 123 + assert response.query_id == "abcdefg" + + +def test_query_and_wait_bigframes_callback(client): + created = datetime.datetime( + 2025, 8, 18, 10, 11, 12, 345000, tzinfo=datetime.timezone.utc + ) + started = datetime.datetime( + 2025, 8, 18, 10, 11, 13, 456000, tzinfo=datetime.timezone.utc + ) + ended = datetime.datetime( + 2025, 8, 18, 10, 11, 14, 567000, tzinfo=datetime.timezone.utc + ) + client._http.request.side_effect 
= [ + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query + "location": LOCATION, + "queryId": "abcdefg", + "totalRows": "100", + "totalBytesProcessed": "123", + "totalSlotMs": "987", + "jobComplete": True, + "creationTime": _to_millis(created), + "startTime": _to_millis(started), + "endTime": _to_millis(ended), + } + ), + ] + callback = mock.Mock() + client._query_and_wait_bigframes(query="SELECT 1", callback=callback) + callback.assert_has_calls( + [ + mock.call( + _job_helpers.QuerySentEvent( + query="SELECT 1", + billing_project=PROJECT, + location=LOCATION, + # No job ID, because a basic query is eligible for jobs.query. + job_id=None, + request_id=mock.ANY, + ) + ), + mock.call( + _job_helpers.QueryFinishedEvent( + billing_project=PROJECT, + location=LOCATION, + query_id="abcdefg", + total_rows=100, + total_bytes_processed=123, + slot_millis=987, + created=created, + started=started, + ended=ended, + # No job ID or destination, because a basic query is eligible for jobs.query. 
+ job_id=None, + destination=None, + ), + ), + ] + ) + + +def _to_millis(dt: datetime.datetime) -> str: + return str( + int( + (dt - datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)) + / datetime.timedelta(milliseconds=1) + ) + ) + + +def test_query_and_wait_bigframes_with_jobs_insert_callback_empty_results(client): + client._http.request.side_effect = [ + # jobs.insert because destination table present in job_config + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "statistics": { + "creationTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ) + ), + "query": { + "statementType": "SELECT", + # "queryPlan": [{"name": "part1"}, {"name": "part2"}], + }, + }, + "status": { + "state": "PENDING", + }, + } + ), + # jobs.get waiting for query to finish + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "status": { + "state": "PENDING", + }, + } + ), + # jobs.getQueryResults with max_results=0 + make_response( + { + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "jobComplete": True, + # totalRows is intentionally missing so we end up in the _EmptyRowIterator code path. 
+ } + ), + # jobs.get + make_response( + { + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "statistics": { + "creationTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ) + ), + "startTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 32, 123000, tzinfo=datetime.timezone.utc + ) + ), + "endTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 33, 123000, tzinfo=datetime.timezone.utc + ) + ), + "query": { + "statementType": "SELECT", + "totalBytesProcessed": 123, + "totalSlotMs": 987, + }, + }, + "status": {"state": "DONE"}, + } + ), + ] + callback = mock.Mock() + config = bigquery.QueryJobConfig() + config.destination = "proj.dset.table" + client._query_and_wait_bigframes( + query="SELECT 1", job_config=config, callback=callback + ) + callback.assert_has_calls( + [ + mock.call( + _job_helpers.QuerySentEvent( + query="SELECT 1", + billing_project="response-project", + location="response-location", + job_id="response-job-id", + # We use jobs.insert not jobs.query because destination is + # present on job_config. 
+ request_id=None, + ) + ), + mock.call( + _job_helpers.QueryReceivedEvent( + billing_project="response-project", + location="response-location", + job_id="response-job-id", + statement_type="SELECT", + state="PENDING", + query_plan=[], + created=datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ), + started=None, + ended=None, + ) + ), + mock.call( + _job_helpers.QueryFinishedEvent( + billing_project="response-project", + location="response-location", + job_id="response-job-id", + query_id=None, + total_rows=0, + total_bytes_processed=123, + slot_millis=987, + created=datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ), + started=datetime.datetime( + 2025, 8, 13, 13, 7, 32, 123000, tzinfo=datetime.timezone.utc + ), + ended=datetime.datetime( + 2025, 8, 13, 13, 7, 33, 123000, tzinfo=datetime.timezone.utc + ), + destination=None, + ), + ), + ] + ) + + +def test_query_and_wait_bigframes_with_jobs_insert_dry_run_no_callback(client): + client._http.request.side_effect = [ + # jobs.insert because destination table present in job_config + make_response( + { + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "statistics": { + "creationTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ) + ), + "query": { + "statementType": "SELECT", + "totalBytesProcessed": 123, + "schema": { + "fields": [ + {"name": "_f0", "type": "INTEGER"}, + ], + }, + }, + }, + "configuration": { + "dryRun": True, + }, + "status": {"state": "DONE"}, + } + ), + ] + callback = mock.Mock() + config = bigquery.QueryJobConfig() + config.destination = "proj.dset.table" + config.dry_run = True + result = client._query_and_wait_bigframes( + query="SELECT 1", job_config=config, callback=callback + ) + callback.assert_not_called() + assert result.total_bytes_processed == 123 + assert result.schema == 
[bigquery.SchemaField("_f0", "INTEGER")] + + +def test_query_and_wait_bigframes_with_query_retry_callbacks(client, global_time_lock): + created = datetime.datetime( + 2025, 8, 18, 10, 11, 12, 345000, tzinfo=datetime.timezone.utc + ) + started = datetime.datetime( + 2025, 8, 18, 10, 11, 13, 456000, tzinfo=datetime.timezone.utc + ) + ended = datetime.datetime( + 2025, 8, 18, 10, 11, 14, 567000, tzinfo=datetime.timezone.utc + ) + client._http.request.side_effect = [ + exceptions.InternalServerError( + "first try", errors=({"reason": "jobInternalError"},) + ), + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query + "location": LOCATION, + "queryId": "abcdefg", + "totalRows": "100", + "totalBytesProcessed": "123", + "totalSlotMs": "987", + "jobComplete": True, + "creationTime": _to_millis(created), + "startTime": _to_millis(started), + "endTime": _to_millis(ended), + } + ), + ] + callback = mock.Mock() + client._query_and_wait_bigframes(query="SELECT 1", callback=callback) + callback.assert_has_calls( + [ + mock.call( + _job_helpers.QuerySentEvent( + query="SELECT 1", + billing_project=PROJECT, + location=LOCATION, + # No job ID, because a basic query is eligible for jobs.query. + job_id=None, + request_id=mock.ANY, + ) + ), + mock.call( + _job_helpers.QueryRetryEvent( + query="SELECT 1", + billing_project=PROJECT, + location=LOCATION, + # No job ID, because a basic query is eligible for jobs.query. + job_id=None, + request_id=mock.ANY, + ) + ), + mock.call( + _job_helpers.QueryFinishedEvent( + billing_project=PROJECT, + location=LOCATION, + query_id=mock.ANY, + total_rows=100, + total_bytes_processed=123, + slot_millis=987, + created=created, + started=started, + ended=ended, + # No job ID or destination, because a basic query is eligible for jobs.query. 
+ job_id=None, + destination=None, + ), + ), + ] + ) diff --git a/tests/unit/test_client_resumable_media_upload.py b/tests/unit/test_client_resumable_media_upload.py new file mode 100644 index 000000000..642c18d15 --- /dev/null +++ b/tests/unit/test_client_resumable_media_upload.py @@ -0,0 +1,433 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock +import email +import http.client +import io +import json + +import pytest + +from google.cloud.bigquery.table import TableReference + +from .helpers import make_connection + + +PROJECT = "test-project" +TABLE_REF = TableReference.from_string(f"{PROJECT}.test_dataset.test_table") +EXPECTED_CONFIGURATION = { + "load": { + "destinationTable": { + "projectId": PROJECT, + "datasetId": "test_dataset", + "tableId": "test_table", + }, + "sourceFormat": "CSV", + } +} + + +@pytest.fixture(autouse=True) +def mock_sleep(monkeypatch): + sleep = mock.Mock() + monkeypatch.setattr("time.sleep", sleep) + + +def _make_credentials(): + import google.auth.credentials + + return mock.Mock(spec=google.auth.credentials.Credentials) + + +def _make_client(*args, **kw): + from google.cloud.bigquery.client import Client + + kw["credentials"] = _make_credentials() + kw["project"] = PROJECT + return Client(*args, **kw) + + +def _make_file_obj(contents=b"some data"): + return io.BytesIO(contents) + + +def _make_response(status_code, content=b"", headers=None): + response = 
mock.Mock(spec=["status_code", "content", "request", "headers"]) + response.status_code = status_code + response.content = content + response.headers = headers or {} + response.request = mock.Mock(spec=["headers"]) + return response + + +def _make_resumable_upload_responses(num_bytes): + # In a real scenario, the upload URL is returned in a 'Location' + # header. + return [ + _make_response( + http.client.OK, + headers={"location": "http://test.invalid/upload-id"}, + ), + _make_response( + http.client.OK, content=json.dumps({"size": num_bytes}).encode("utf-8") + ), + ] + + +def _make_transport(responses=None): + import google.auth.transport.requests + + transport = mock.create_autospec( + google.auth.transport.requests.AuthorizedSession, instance=True + ) + transport.request.side_effect = responses + return transport + + +def _mock_requests_response(status_code, headers, content=b""): + return mock.Mock( + content=content, + headers=headers, + status_code=status_code, + spec=["content", "headers", "status_code"], + ) + + +def _mock_transport(status_code, headers, content=b""): + fake_transport = mock.Mock(spec=["request"]) + fake_response = _mock_requests_response(status_code, headers, content=content) + fake_transport.request.return_value = fake_response + return fake_transport + + +def _initiate_resumable_upload_helper(num_retries=None, mtls=False): + from google.resumable_media.requests import ResumableUpload + from google.cloud.bigquery.client import _DEFAULT_CHUNKSIZE + from google.cloud.bigquery.client import _GENERIC_CONTENT_TYPE + from google.cloud.bigquery.client import _get_upload_headers + from google.cloud.bigquery.job import LoadJob + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.job import SourceFormat + + # Create mocks to be checked for doing transport. 
+ resumable_url = "http://test.invalid?upload_id=hey-you" + response_headers = {"location": resumable_url} + fake_transport = _mock_transport(http.client.OK, response_headers) + client = _make_client(_http=fake_transport) + conn = client._connection = make_connection() + if mtls: + conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") + + # Create some mock arguments and call the method under test. + data = b"goodbye gudbi gootbee" + stream = io.BytesIO(data) + config = LoadJobConfig() + config.source_format = SourceFormat.CSV + job = LoadJob(None, None, TABLE_REF, client, job_config=config) + metadata = job.to_api_repr() + upload, transport_out = client._initiate_resumable_upload( + stream, metadata, num_retries, None + ) + + # Check the returned values. + assert isinstance(upload, ResumableUpload) + + host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" + upload_url = ( + f"{host_name}/upload/bigquery/v2/projects/{PROJECT}/jobs?uploadType=resumable" + ) + assert upload.upload_url == upload_url + expected_headers = _get_upload_headers(conn.user_agent) + assert upload._headers == expected_headers + assert not upload.finished + assert upload._chunk_size == _DEFAULT_CHUNKSIZE + assert upload._stream is stream + assert upload._total_bytes is None + assert upload._content_type == _GENERIC_CONTENT_TYPE + assert upload.resumable_url == resumable_url + + retry_strategy = upload._retry_strategy + assert retry_strategy.max_sleep == 64.0 + if num_retries is None: + assert retry_strategy.max_cumulative_retry == 600.0 + assert retry_strategy.max_retries is None + else: + assert retry_strategy.max_cumulative_retry is None + assert retry_strategy.max_retries == num_retries + assert transport_out is fake_transport + # Make sure we never read from the stream. + assert stream.tell() == 0 + + # Check the mocks. 
+ request_headers = expected_headers.copy() + request_headers["x-upload-content-type"] = _GENERIC_CONTENT_TYPE + fake_transport.request.assert_called_once_with( + "POST", + upload_url, + data=json.dumps(metadata).encode("utf-8"), + headers=request_headers, + timeout=mock.ANY, + ) + + +def test__initiate_resumable_upload(): + _initiate_resumable_upload_helper() + + +def test__initiate_resumable_upload_mtls(): + _initiate_resumable_upload_helper(mtls=True) + + +def test_initiate_resumable_upload_with_retry(): + _initiate_resumable_upload_helper(num_retries=11) + + +def _do_multipart_upload_success_helper( + get_boundary, num_retries=None, project=None, mtls=False +): + from google.cloud.bigquery.client import _get_upload_headers + from google.cloud.bigquery.job import LoadJob + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.job import SourceFormat + + fake_transport = _mock_transport(http.client.OK, {}) + client = _make_client(_http=fake_transport) + conn = client._connection = make_connection() + if mtls: + conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") + + if project is None: + project = PROJECT + + # Create some mock arguments. + data = b"Bzzzz-zap \x00\x01\xf4" + stream = io.BytesIO(data) + config = LoadJobConfig() + config.source_format = SourceFormat.CSV + job = LoadJob(None, None, TABLE_REF, client, job_config=config) + metadata = job.to_api_repr() + size = len(data) + + response = client._do_multipart_upload( + stream, metadata, size, num_retries, None, project=project + ) + + # Check the mocks and the returned value. 
+ assert response is fake_transport.request.return_value + assert stream.tell() == size + get_boundary.assert_called_once_with() + + host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" + upload_url = ( + f"{host_name}/upload/bigquery/v2/projects/{project}/jobs?uploadType=multipart" + ) + payload = ( + b"--==0==\r\n" + b"content-type: application/json; charset=UTF-8\r\n\r\n" + b"%(json_metadata)s" + b"\r\n" + b"--==0==\r\n" + b"content-type: */*\r\n\r\n" + b"%(data)s" + b"\r\n" + b"--==0==--" + ) % {b"json_metadata": json.dumps(metadata).encode("utf-8"), b"data": data} + + headers = _get_upload_headers(conn.user_agent) + headers["content-type"] = b'multipart/related; boundary="==0=="' + fake_transport.request.assert_called_once_with( + "POST", upload_url, data=payload, headers=headers, timeout=mock.ANY + ) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test__do_multipart_upload(get_boundary): + _do_multipart_upload_success_helper(get_boundary) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test__do_multipart_upload_mtls(get_boundary): + _do_multipart_upload_success_helper(get_boundary, mtls=True) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test_do_multipart_upload_with_retry(get_boundary): + _do_multipart_upload_success_helper(get_boundary, num_retries=8) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test__do_multipart_upload_with_custom_project(get_boundary): + _do_multipart_upload_success_helper(get_boundary, project="custom-project") + + +def test__do_resumable_upload(): + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = _make_transport(_make_resumable_upload_responses(file_obj_len)) + client = _make_client(_http=transport) + + result = client._do_resumable_upload(file_obj, EXPECTED_CONFIGURATION, None, None) + + content = 
result.content.decode("utf-8") + assert json.loads(content) == {"size": file_obj_len} + + transport.request.assert_any_call( + "POST", + mock.ANY, + data=json.dumps(EXPECTED_CONFIGURATION).encode("utf-8"), + headers=mock.ANY, + timeout=mock.ANY, + ) + + +def test__do_resumable_upload_custom_project(): + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = _make_transport(_make_resumable_upload_responses(file_obj_len)) + client = _make_client(_http=transport) + + result = client._do_resumable_upload( + file_obj, + EXPECTED_CONFIGURATION, + None, + None, + project="custom-project", + ) + + content = result.content.decode("utf-8") + assert json.loads(content) == {"size": file_obj_len} + + transport.request.assert_any_call( + "POST", + mock.ANY, + data=json.dumps(EXPECTED_CONFIGURATION).encode("utf-8"), + headers=mock.ANY, + timeout=mock.ANY, + ) + + initiation_url = next( + ( + call[0][1] + for call in transport.request.call_args_list + if call[0][0] == "POST" and "uploadType=resumable" in call[0][1] + ), + None, + ) + assert initiation_url is not None + assert "projects/custom-project" in initiation_url + + +def test__do_resumable_upload_custom_timeout(): + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = _make_transport(_make_resumable_upload_responses(file_obj_len)) + client = _make_client(_http=transport) + + client._do_resumable_upload( + file_obj, EXPECTED_CONFIGURATION, num_retries=0, timeout=3.14 + ) + + for call_args in transport.request.call_args_list: + assert call_args[1].get("timeout") == 3.14 + + +def test__do_multipart_upload_request_body(): + transport = _make_transport([_make_response(http.client.OK)]) + client = _make_client(_http=transport) + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + + client._do_multipart_upload( + file_obj, EXPECTED_CONFIGURATION, file_obj_len, None, None + ) + + request_args = transport.request.mock_calls[0][2] + request_data = 
request_args["data"].decode("utf-8") + request_headers = request_args["headers"] + + request_content = email.message_from_string( + "Content-Type: {}\n{}".format( + request_headers["content-type"].decode("utf-8"), request_data + ) + ) + + configuration_data = request_content.get_payload(0).get_payload() + binary_data = request_content.get_payload(1).get_payload() + + assert json.loads(configuration_data) == EXPECTED_CONFIGURATION + assert binary_data.encode("utf-8") == file_obj.getvalue() + + +def test__do_multipart_upload_wrong_size(): + client = _make_client() + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + + with pytest.raises(ValueError): + client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None, None) + + +def test_schema_from_json_with_file_path(): + from google.cloud.bigquery.schema import SchemaField + + file_content = """ + [ + { + "description": "quarter", + "mode": "REQUIRED", + "name": "qtr", + "type": "STRING" + }, + { + "description": "sales representative", + "mode": "NULLABLE", + "name": "rep", + "type": "STRING" + }, + { + "description": "total sales", + "mode": "NULLABLE", + "name": "sales", + "type": "FLOAT" + } + ]""" + + expected = [ + SchemaField("qtr", "STRING", "REQUIRED", description="quarter"), + SchemaField( + "rep", + "STRING", + "NULLABLE", + description="sales representative", + ), + SchemaField( + "sales", + "FLOAT", + "NULLABLE", + description="total sales", + ), + ] + + client = _make_client() + mock_file_path = "/mocked/file.json" + + open_patch = mock.patch("builtins.open", new=mock.mock_open(read_data=file_content)) + + with open_patch as _mock_file: + actual = client.schema_from_json(mock_file_path) + _mock_file.assert_called_once_with(mock_file_path) + _mock_file.return_value.read.assert_called_once() + + assert expected == actual diff --git a/tests/unit/test_client_retry.py b/tests/unit/test_client_retry.py new file mode 100644 index 000000000..f0e7ac88f --- /dev/null +++ 
b/tests/unit/test_client_retry.py @@ -0,0 +1,284 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import freezegun +import google.api_core.exceptions +from google.cloud.bigquery import job as bqjob +from google.cloud.bigquery.retry import DEFAULT_RETRY +from .helpers import make_connection + + +PROJECT = "test-project" + +# A deadline > 1.0s is required because the default retry (google.api_core.retry.Retry) +# has an initial delay of 1.0s. If the deadline is <= 1.0s, the first retry attempt +# (scheduled for now + 1.0s) will be rejected immediately as exceeding the deadline. 
+_RETRY_DEADLINE = 10.0 + + +def _make_credentials(): + import google.auth.credentials + + return mock.Mock(spec=google.auth.credentials.Credentials) + + +def _make_client(*args, **kw): + from google.cloud.bigquery.client import Client + + return Client(*args, **kw) + + +def test_get_service_account_email_w_custom_retry(global_time_lock): + api_path = f"/projects/{PROJECT}/serviceAccount" + creds = _make_credentials() + http = object() + client = _make_client(project=PROJECT, credentials=creds, _http=http) + + resource = { + "kind": "bigquery#getServiceAccountResponse", + "email": "bq-123@bigquery-encryption.iam.gserviceaccount.com", + } + api_request_patcher = mock.patch.object( + client._connection, + "api_request", + side_effect=[ValueError, resource], + ) + + retry = DEFAULT_RETRY.with_deadline(_RETRY_DEADLINE).with_predicate( + lambda exc: isinstance(exc, ValueError) + ) + + with api_request_patcher as fake_api_request: + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + service_account_email = client.get_service_account_email( + retry=retry, timeout=7.5 + ) + + final_attributes.assert_called_once_with({"path": api_path}, client, None) + assert service_account_email == "bq-123@bigquery-encryption.iam.gserviceaccount.com" + assert fake_api_request.call_args_list == [ + mock.call(method="GET", path=api_path, timeout=7.5), + mock.call(method="GET", path=api_path, timeout=7.5), # was retried once + ] + + +def test_call_api_applying_custom_retry_on_timeout(global_time_lock): + from concurrent.futures import TimeoutError + + creds = _make_credentials() + client = _make_client(project=PROJECT, credentials=creds) + + api_request_patcher = mock.patch.object( + client._connection, + "api_request", + side_effect=[TimeoutError, "result"], + ) + retry = DEFAULT_RETRY.with_deadline(_RETRY_DEADLINE).with_predicate( + lambda exc: isinstance(exc, TimeoutError) + ) + + with api_request_patcher as fake_api_request: + result
= client._call_api(retry, foo="bar") + + assert result == "result" + assert fake_api_request.call_args_list == [ + mock.call(foo="bar"), + mock.call(foo="bar"), + ] + + +def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retries_404( + global_time_lock, +): + """Regression test for https://github.com/googleapis/python-bigquery/issues/2134 + + Sometimes after a Conflict, the fetch fails with a 404, but we know + because of the conflict that really the job does exist. Retry until we + get the job status (or timeout). + """ + job_id = "abc123" + creds = _make_credentials() + http = object() + client = _make_client(project=PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection( + # We're mocking QueryJob._begin, so this is only going to be + # jobs.get requests and responses. + google.api_core.exceptions.TooManyRequests("this is retriable by default"), + google.api_core.exceptions.NotFound("we lost your job"), + google.api_core.exceptions.NotFound("we lost your job again, sorry"), + { + "jobReference": { + "projectId": PROJECT, + "location": "TESTLOC", + "jobId": job_id, + } + }, + ) + + job_create_error = google.api_core.exceptions.Conflict("Job already exists.") + job_begin_patcher = mock.patch.object( + bqjob.QueryJob, "_begin", side_effect=job_create_error + ) + job_id_patcher = mock.patch.object( + google.cloud.bigquery._job_helpers, + "make_job_id", + return_value=job_id, + ) + + with job_begin_patcher, job_id_patcher: + # If get job request fails there does exist a job + # with this ID already, retry 404 until we get it (or fails for a + # non-retriable reason, see other tests). 
+ result = client.query("SELECT 1;", job_id=None) + + jobs_get_path = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{job_id}", + query_params={ + "projection": "full", + }, + timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, + ) + conn.api_request.assert_has_calls( + # Double-check that it was jobs.get that was called for each of our + # mocked responses. + [jobs_get_path] + * 4, + ) + assert result.job_id == job_id + + +def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retries_404_and_query_job_insert( + global_time_lock, +): + """Regression test for https://github.com/googleapis/python-bigquery/issues/2134 + + Sometimes after a Conflict, the fetch fails with a 404. If it keeps + failing with a 404, assume that the job actually doesn't exist. + """ + job_id_1 = "abc123" + job_id_2 = "xyz789" + creds = _make_credentials() + http = object() + client = _make_client(project=PROJECT, credentials=creds, _http=http) + + # We're mocking QueryJob._begin, so that the connection should only get + # jobs.get requests. + job_create_error = google.api_core.exceptions.Conflict("Job already exists.") + job_begin_patcher = mock.patch.object( + bqjob.QueryJob, "_begin", side_effect=job_create_error + ) + conn = client._connection = make_connection( + google.api_core.exceptions.NotFound("we lost your job again, sorry"), + { + "jobReference": { + "projectId": PROJECT, + "location": "TESTLOC", + "jobId": job_id_2, + } + }, + ) + + # Choose a small deadline so the 404 retries give up. + retry = google.cloud.bigquery.retry._DEFAULT_GET_JOB_CONFLICT_RETRY.with_deadline(1) + job_id_patcher = mock.patch.object( + google.cloud.bigquery._job_helpers, + "make_job_id", + side_effect=[job_id_1, job_id_2], + ) + retry_patcher = mock.patch.object( + google.cloud.bigquery.retry, + "_DEFAULT_GET_JOB_CONFLICT_RETRY", + retry, + ) + + with freezegun.freeze_time( + "2025-01-01 00:00:00", + # 10x the retry deadline to guarantee a timeout. 
+ auto_tick_seconds=10, + ), job_begin_patcher, job_id_patcher, retry_patcher: + # If get job request fails there does exist a job + # with this ID already, retry 404 until we get it (or fails for a + # non-retriable reason, see other tests). + result = client.query("SELECT 1;", job_id=None) + + jobs_get_path_1 = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{job_id_1}", + query_params={ + "projection": "full", + }, + timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, + ) + jobs_get_path_2 = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{job_id_2}", + query_params={ + "projection": "full", + }, + timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, + ) + conn.api_request.assert_has_calls( + # Double-check that it was jobs.get that was called for each of our + # mocked responses. + [jobs_get_path_1, jobs_get_path_2], + ) + assert result.job_id == job_id_2 + + +def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retry(global_time_lock): + """Regression test for https://github.com/googleapis/python-bigquery/issues/2134 + + If we get a 409 conflict on jobs.insert, and we are using a random + job ID, we should retry by getting the job by ID. This test ensures that + if the get job by ID fails, we retry the whole sequence. 
+ """ + from google.cloud.bigquery import job + + client = _make_client(project=PROJECT, credentials=_make_credentials()) + job_id = "some-random-job-id" + query_text = "SELECT 1" + job_config = job.QueryJobConfig() + job_config.use_legacy_sql = False + + job_resource = { + "jobReference": {"projectId": PROJECT, "jobId": job_id}, + "configuration": {"query": {"query": query_text}}, + "status": {"state": "DONE"}, + } + + conn = make_connection( + # First attempt at jobs.insert fails with a 409 + google.api_core.exceptions.Conflict("Job already exists."), + # First attempt at jobs.get fails with a 500 + google.api_core.exceptions.InternalServerError("get job failed"), + # Second attempt at jobs.insert succeeds + job_resource, + ) + client._connection = conn + + job_id_patcher = mock.patch.object( + google.cloud.bigquery._job_helpers, + "make_job_id", + return_value=job_id, + ) + + with job_id_patcher: + query_job = client.query(query_text, job_config=job_config, job_id=None) + + assert query_job.job_id == job_id diff --git a/tests/unit/test_create_dataset.py b/tests/unit/test_create_dataset.py index a2491a812..b144471ca 100644 --- a/tests/unit/test_create_dataset.py +++ b/tests/unit/test_create_dataset.py @@ -65,6 +65,7 @@ def test_create_dataset_w_attrs(client, PROJECT, DS_ID): "tableId": "northern-hemisphere", } DEFAULT_ROUNDING_MODE = "ROUND_HALF_EVEN" + RESOURCE_TAGS = {"123456789012/foo": "bar"} RESOURCE = { "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, "etag": "etag", @@ -76,6 +77,7 @@ def test_create_dataset_w_attrs(client, PROJECT, DS_ID): "labels": LABELS, "access": [{"role": "OWNER", "userByEmail": USER_EMAIL}, {"view": VIEW}], "defaultRoundingMode": DEFAULT_ROUNDING_MODE, + "resourceTags": RESOURCE_TAGS, } conn = client._connection = make_connection(RESOURCE) entries = [ @@ -91,6 +93,7 @@ def test_create_dataset_w_attrs(client, PROJECT, DS_ID): before.default_table_expiration_ms = 3600 before.location = LOCATION before.labels = LABELS + 
before.resource_tags = RESOURCE_TAGS before.default_rounding_mode = DEFAULT_ROUNDING_MODE after = client.create_dataset(before) assert after.dataset_id == DS_ID @@ -103,6 +106,7 @@ def test_create_dataset_w_attrs(client, PROJECT, DS_ID): assert after.default_table_expiration_ms == 3600 assert after.labels == LABELS assert after.default_rounding_mode == DEFAULT_ROUNDING_MODE + assert after.resource_tags == RESOURCE_TAGS conn.api_request.assert_called_once_with( method="POST", @@ -119,6 +123,7 @@ def test_create_dataset_w_attrs(client, PROJECT, DS_ID): {"view": VIEW, "role": None}, ], "labels": LABELS, + "resourceTags": RESOURCE_TAGS, }, timeout=DEFAULT_TIMEOUT, ) @@ -367,7 +372,12 @@ def test_create_dataset_alreadyexists_w_exists_ok_true(PROJECT, DS_ID, LOCATION) }, timeout=DEFAULT_TIMEOUT, ), - mock.call(method="GET", path=get_path, timeout=DEFAULT_TIMEOUT), + mock.call( + method="GET", + path=get_path, + timeout=DEFAULT_TIMEOUT, + query_params={}, + ), ] ) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index c0164bc73..604e5ed2e 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -19,6 +19,7 @@ import pytest from google.cloud.bigquery.dataset import ( AccessEntry, + Condition, Dataset, DatasetReference, Table, @@ -166,7 +167,10 @@ def test_from_api_repr_wo_role(self): entity_type="view", entity_id=resource["view"], ) - self.assertEqual(entry, exp_entry) + + assert entry.entity_type == exp_entry.entity_type + assert entry.entity_id == exp_entry.entity_id + assert entry.role is None def test_to_api_repr_w_extra_properties(self): resource = { @@ -178,15 +182,6 @@ def test_to_api_repr_w_extra_properties(self): exp_resource = entry.to_api_repr() self.assertEqual(resource, exp_resource) - def test_from_api_repr_entries_w_extra_keys(self): - resource = { - "role": "READER", - "specialGroup": "projectReaders", - "userByEmail": "salmon@example.com", - } - with self.assertRaises(ValueError): - 
self._get_target_class().from_api_repr(resource) - def test_view_getter_setter(self): view = { "projectId": "my_project", @@ -306,7 +301,10 @@ def test_dataset_getter_setter_dataset_ref(self): entry.dataset = dataset_ref resource = entry.to_api_repr() exp_resource = { - "dataset": {"dataset": dataset_ref, "targetTypes": None}, + "dataset": { + "dataset": {"datasetId": "my_dataset", "projectId": "my-project"}, + "targetTypes": None, + }, "role": None, } self.assertEqual(resource, exp_resource) @@ -493,6 +491,278 @@ def test_dataset_target_types_getter_setter_w_dataset(self): self.assertEqual(entry.dataset_target_types, target_types) +# --- Tests for AccessEntry when using Condition --- + +EXPRESSION = "request.time < timestamp('2026-01-01T00:00:00Z')" +TITLE = "Expires end 2025" +DESCRIPTION = "Access expires at the start of 2026." + + +@pytest.fixture +def condition_1(): + """Provides a sample Condition object.""" + return Condition( + expression=EXPRESSION, + title=TITLE, + description=DESCRIPTION, + ) + + +@pytest.fixture +def condition_1_api_repr(): + """Provides the API representation for condition_1.""" + # Use the actual to_api_repr method + return Condition( + expression=EXPRESSION, + title=TITLE, + description=DESCRIPTION, + ).to_api_repr() + + +@pytest.fixture +def condition_2(): + """Provides a second, different Condition object.""" + return Condition( + expression="resource.name.startsWith('projects/_/buckets/restricted/')", + title="Restricted Buckets", + ) + + +@pytest.fixture +def condition_2_api_repr(): + """Provides the API representation for condition2.""" + # Use the actual to_api_repr method + return Condition( + expression="resource.name.startsWith('projects/_/buckets/restricted/')", + title="Restricted Buckets", + ).to_api_repr() + + +class TestAccessEntryAndCondition: + @staticmethod + def _get_target_class(): + return AccessEntry + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + # Test __init__ without 
condition + def test_init_without_condition(self): + entry = AccessEntry("READER", "userByEmail", "test@example.com") + assert entry.role == "READER" + assert entry.entity_type == "userByEmail" + assert entry.entity_id == "test@example.com" + assert entry.condition is None + # Accessing _properties is for internal verification in tests + assert "condition" not in entry._properties + + # Test __init__ with condition object + def test_init_with_condition_object(self, condition_1, condition_1_api_repr): + entry = AccessEntry( + "READER", "userByEmail", "test@example.com", condition=condition_1 + ) + assert entry.condition == condition_1 + assert entry._properties.get("condition") == condition_1_api_repr + + # Test __init__ with condition=None + def test_init_with_condition_none(self): + entry = AccessEntry("READER", "userByEmail", "test@example.com", condition=None) + assert entry.condition is None + + # Test condition getter/setter + def test_condition_getter_setter( + self, condition_1, condition_1_api_repr, condition_2, condition_2_api_repr + ): + entry = AccessEntry("WRITER", "group", "admins@example.com") + assert entry.condition is None + + # Set condition 1 + entry.condition = condition_1 + assert entry.condition.to_api_repr() == condition_1_api_repr + assert entry._properties.get("condition") == condition_1_api_repr + + # Set condition 2 + entry.condition = condition_2 + assert entry.condition.to_api_repr() == condition_2_api_repr + assert entry._properties.get("condition") != condition_1_api_repr + assert entry._properties.get("condition") == condition_2.to_api_repr() + + # Set back to None + entry.condition = None + assert entry.condition is None + + # Set condition using a dict + entry.condition = condition_1_api_repr + assert entry._properties.get("condition") == condition_1_api_repr + + # Test setter validation + def test_condition_setter_invalid_type(self): + entry = AccessEntry("READER", "domain", "example.com") + with pytest.raises( + TypeError, 
match="condition must be a Condition object, dict, or None" + ): + entry.condition = 123 # type: ignore + + # Test equality/hash without condition + def test_equality_and_hash_without_condition(self): + entry1 = AccessEntry("OWNER", "specialGroup", "projectOwners") + entry2 = AccessEntry("OWNER", "specialGroup", "projectOwners") + entry3 = AccessEntry("WRITER", "specialGroup", "projectOwners") + assert entry1 == entry2 + assert entry1 != entry3 + assert hash(entry1) == hash(entry2) + assert hash(entry1) != hash(entry3) # Usually true + + def test_equality_and_hash_from_api_repr(self): + """Compare equal entries where one was created via from_api_repr.""" + entry1 = AccessEntry("OWNER", "specialGroup", "projectOwners") + entry2 = AccessEntry.from_api_repr( + {"role": "OWNER", "specialGroup": "projectOwners"} + ) + assert entry1 == entry2 + assert hash(entry1) == hash(entry2) + + def test_equality_and_hash_with_condition(self, condition_1, condition_2): + cond1a = Condition( + condition_1.expression, condition_1.title, condition_1.description + ) + cond1b = Condition( + condition_1.expression, condition_1.title, condition_1.description + ) # Same values, different object + + entry1a = AccessEntry( + "READER", "userByEmail", "a@example.com", condition=cond1a + ) + entry1b = AccessEntry( + "READER", "userByEmail", "a@example.com", condition=cond1b + ) # Different Condition instance + entry2 = AccessEntry( + "READER", "userByEmail", "a@example.com", condition=condition_2 + ) + entry3 = AccessEntry("READER", "userByEmail", "a@example.com") # No condition + entry4 = AccessEntry( + "WRITER", "userByEmail", "a@example.com", condition=cond1a + ) # Different role + + assert entry1a == entry1b + assert entry1a != entry2 + assert entry1a != entry3 + assert entry1a != entry4 + assert entry2 != entry3 + + assert hash(entry1a) == hash(entry1b) + assert hash(entry1a) != hash(entry2) # Usually true + assert hash(entry1a) != hash(entry3) # Usually true + assert hash(entry1a) != 
hash(entry4) # Usually true + + # Test to_api_repr with condition + def test_to_api_repr_with_condition(self, condition_1, condition_1_api_repr): + entry = AccessEntry( + "WRITER", "groupByEmail", "editors@example.com", condition=condition_1 + ) + expected_repr = { + "role": "WRITER", + "groupByEmail": "editors@example.com", + "condition": condition_1_api_repr, + } + assert entry.to_api_repr() == expected_repr + + def test_view_property_with_condition(self, condition_1): + """Test setting/getting view property when condition is present.""" + entry = AccessEntry(role=None, entity_type="view", condition=condition_1) + view_ref = TableReference(DatasetReference("proj", "dset"), "view_tbl") + entry.view = view_ref # Use the setter + assert entry.view == view_ref + assert entry.condition == condition_1 # Condition should persist + assert entry.role is None + assert entry.entity_type == "view" + + # Check internal representation + assert "view" in entry._properties + assert "condition" in entry._properties + + def test_user_by_email_property_with_condition(self, condition_1): + """Test setting/getting user_by_email property when condition is present.""" + entry = AccessEntry( + role="READER", entity_type="userByEmail", condition=condition_1 + ) + email = "test@example.com" + entry.user_by_email = email # Use the setter + assert entry.user_by_email == email + assert entry.condition == condition_1 # Condition should persist + assert entry.role == "READER" + assert entry.entity_type == "userByEmail" + + # Check internal representation + assert "userByEmail" in entry._properties + assert "condition" in entry._properties + + # Test from_api_repr without condition + def test_from_api_repr_without_condition(self): + api_repr = {"role": "OWNER", "userByEmail": "owner@example.com"} + entry = AccessEntry.from_api_repr(api_repr) + assert entry.role == "OWNER" + assert entry.entity_type == "userByEmail" + assert entry.entity_id == "owner@example.com" + assert entry.condition is None 
+ + # Test from_api_repr with condition + def test_from_api_repr_with_condition(self, condition_1, condition_1_api_repr): + api_repr = { + "role": "READER", + "view": {"projectId": "p", "datasetId": "d", "tableId": "v"}, + "condition": condition_1_api_repr, + } + entry = AccessEntry.from_api_repr(api_repr) + assert entry.role == "READER" + assert entry.entity_type == "view" + # The entity_id for view/routine/dataset is the dict itself + assert entry.entity_id == {"projectId": "p", "datasetId": "d", "tableId": "v"} + assert entry.condition == condition_1 + + # Test from_api_repr edge case + def test_from_api_repr_no_entity(self, condition_1, condition_1_api_repr): + api_repr = {"role": "READER", "condition": condition_1_api_repr} + entry = AccessEntry.from_api_repr(api_repr) + assert entry.role == "READER" + assert entry.entity_type is None + assert entry.entity_id is None + assert entry.condition == condition_1 + + def test_dataset_property_with_condition(self, condition_1): + project = "my-project" + dataset_id = "my_dataset" + dataset_ref = DatasetReference(project, dataset_id) + entry = self._make_one(None) + entry.dataset = dataset_ref + entry.condition = condition_1 + + resource = entry.to_api_repr() + exp_resource = { + "role": None, + "dataset": { + "dataset": {"datasetId": "my_dataset", "projectId": "my-project"}, + "targetTypes": None, + }, + "condition": { + "expression": "request.time < timestamp('2026-01-01T00:00:00Z')", + "title": "Expires end 2025", + "description": "Access expires at the start of 2026.", + }, + } + assert resource == exp_resource + # Check internal representation + assert "dataset" in entry._properties + assert "condition" in entry._properties + + def test_repr_from_api_repr(self): + """Check that repr() includes the correct entity_type when the object is initialized from a dictionary.""" + api_repr = {"role": "OWNER", "userByEmail": "owner@example.com"} + entry = AccessEntry.from_api_repr(api_repr) + entry_str = repr(entry) + assert 
entry_str == "<AccessEntry: role=OWNER, userByEmail=owner@example.com>" + + class TestDatasetReference(unittest.TestCase): @staticmethod def _get_target_class(): @@ -650,6 +920,16 @@ class TestDataset(unittest.TestCase): DS_ID = "dataset-id" DS_REF = DatasetReference(PROJECT, DS_ID) KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1" + DEFAULT_STORAGE_LOCATION_URI = "gs://test-bucket/test-path" + PARAMETERS = {"key": "value"} + API_REPR = { + "datasetReference": {"projectId": "project", "datasetId": "dataset-id"}, + "labels": {}, + "externalCatalogDatasetOptions": { + "defaultStorageLocationUri": DEFAULT_STORAGE_LOCATION_URI, + "parameters": PARAMETERS, + }, + } @staticmethod def _get_target_class(): @@ -665,7 +945,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.ETAG = "ETAG" self.DS_FULL_ID = "%s:%s" % (self.PROJECT, self.DS_ID) self.RESOURCE_URL = "http://example.com/path/to/resource" @@ -785,6 +1067,7 @@ def test_ctor_defaults(self): self.assertIsNone(dataset.friendly_name) self.assertIsNone(dataset.location) self.assertEqual(dataset.is_case_insensitive, False) + self.assertIsNone(dataset.access_policy_version) def test_ctor_string(self): dataset = self._make_one("some-project.some_dset") @@ -810,7 +1093,15 @@ def test_ctor_explicit(self): self.assertEqual( dataset.path, "/projects/%s/datasets/%s" % (OTHER_PROJECT, self.DS_ID) ) - self.assertEqual(dataset.access_entries, entries) + # creating a list of entries relies on AccessEntry.from_api_repr + # which does not create an object in exactly the same way as calling the + # class directly. We rely on calls to .entity_type and .entity_id to + # finalize the settings on each class.
+ entry_pairs = zip(dataset.access_entries, entries) + for pair in entry_pairs: + assert pair[0].role == pair[1].role + assert pair[0].entity_type == pair[1].entity_type + assert pair[0].entity_id == pair[1].entity_id self.assertIsNone(dataset.created) self.assertIsNone(dataset.full_dataset_id) @@ -824,6 +1115,34 @@ def test_ctor_explicit(self): self.assertIsNone(dataset.location) self.assertEqual(dataset.is_case_insensitive, False) + def test_access_entries_getter_from_api_repr(self): + """Check that `in` works correctly when Dataset is made via from_api_repr().""" + from google.cloud.bigquery.dataset import AccessEntry + + dataset = self._get_target_class().from_api_repr( + { + "datasetReference": {"projectId": "my-proj", "datasetId": "my_dset"}, + "access": [ + { + "role": "OWNER", + "userByEmail": "uilma@example.com", + }, + { + "role": "READER", + "groupByEmail": "rhubbles@example.com", + }, + ], + } + ) + assert ( + AccessEntry("OWNER", "userByEmail", "uilma@example.com") + in dataset.access_entries + ) + assert ( + AccessEntry("READER", "groupByEmail", "rhubbles@example.com") + in dataset.access_entries + ) + def test_access_entries_setter_non_list(self): dataset = self._make_one(self.DS_REF) with self.assertRaises(TypeError): @@ -843,8 +1162,18 @@ def test_access_entries_setter(self): dataset = self._make_one(self.DS_REF) phred = AccessEntry("OWNER", "userByEmail", "phred@example.com") bharney = AccessEntry("OWNER", "userByEmail", "bharney@example.com") - dataset.access_entries = [phred, bharney] - self.assertEqual(dataset.access_entries, [phred, bharney]) + entries = [phred, bharney] + dataset.access_entries = entries + + # creating a list of entries relies on AccessEntry.from_api_repr + # which does not create an object in exactly the same way as calling the + # class directly. We rely on calls to .entity_type and .entity_id to + # finalize the settings on each class. 
+ entry_pairs = zip(dataset.access_entries, entries) + for pair in entry_pairs: + assert pair[0].role == pair[1].role + assert pair[0].entity_type == pair[1].entity_type + assert pair[0].entity_id == pair[1].entity_id def test_default_partition_expiration_ms(self): dataset = self._make_one("proj.dset") @@ -894,6 +1223,28 @@ def test_location_setter(self): dataset.location = "LOCATION" self.assertEqual(dataset.location, "LOCATION") + def test_resource_tags_update_in_place(self): + dataset = self._make_one(self.DS_REF) + tags = dataset.resource_tags + tags["123456789012/foo"] = "bar" # update in place + self.assertEqual(dataset.resource_tags, {"123456789012/foo": "bar"}) + + def test_resource_tags_setter(self): + dataset = self._make_one(self.DS_REF) + dataset.resource_tags = {"123456789012/foo": "bar"} + self.assertEqual(dataset.resource_tags, {"123456789012/foo": "bar"}) + + def test_resource_tags_setter_bad_value(self): + dataset = self._make_one(self.DS_REF) + with self.assertRaises(ValueError): + dataset.resource_tags = "invalid" + with self.assertRaises(ValueError): + dataset.resource_tags = 123 + + def test_resource_tags_getter_missing_value(self): + dataset = self._make_one(self.DS_REF) + self.assertEqual(dataset.resource_tags, {}) + def test_labels_update_in_place(self): dataset = self._make_one(self.DS_REF) del dataset._properties["labels"] # don't start w/ existing dict @@ -1045,6 +1396,109 @@ def test___repr__(self): expected = "Dataset(DatasetReference('project1', 'dataset1'))" self.assertEqual(repr(dataset), expected) + def test_external_catalog_dataset_options_setter(self): + # GIVEN the parameters DEFAULT_STORAGE_LOCATION_URI and PARAMETERS + # WHEN an ExternalCatalogDatasetOptions obj is created + # and added to a dataset. 
+ # THEN the api representation of the dataset will match API_REPR + + from google.cloud.bigquery.external_config import ExternalCatalogDatasetOptions + + dataset = self._make_one(self.DS_REF) + + ecdo_obj = ExternalCatalogDatasetOptions( + default_storage_location_uri=self.DEFAULT_STORAGE_LOCATION_URI, + parameters=self.PARAMETERS, + ) + dataset.external_catalog_dataset_options = ecdo_obj + + result = dataset.to_api_repr() + expected = self.API_REPR + assert result == expected + + def test_external_catalog_dataset_options_getter_prop_exists(self): + # GIVEN default dataset PLUS an ExternalCatalogDatasetOptions + # THEN confirm that the api_repr of the ExternalCatalogDatasetsOptions + # matches the api_repr of the external_catalog_dataset_options attribute. + + from google.cloud.bigquery.external_config import ExternalCatalogDatasetOptions + + dataset = self._make_one(self.DS_REF) + ecdo_obj = ExternalCatalogDatasetOptions( + default_storage_location_uri=self.DEFAULT_STORAGE_LOCATION_URI, + parameters=self.PARAMETERS, + ) + dataset.external_catalog_dataset_options = ecdo_obj + result = dataset.external_catalog_dataset_options.to_api_repr() + expected = ecdo_obj.to_api_repr() + assert result == expected + + def test_external_catalog_dataset_options_getter_prop_is_none(self): + # GIVEN only a default dataset + # THEN confirm that external_catalog_dataset_options is None + + dataset = self._make_one(self.DS_REF) + expected = None + result = dataset.external_catalog_dataset_options + assert result == expected + + def test_external_catalog_dataset_options_from_api_repr(self): + # GIVEN default dataset including an ExternalCatalogDatasetOptions + # THEN confirm that the api_repr of the ExternalCatalogDatasetsOptions + # on a dataset object created via from_api_repr matches the api_repr + # of the "externalCatalogDatasetOptions" key. 
+ + api_repr = self.API_REPR + klass = self._get_target_class() + dataset = klass.from_api_repr(api_repr) + + result = dataset.external_catalog_dataset_options.to_api_repr() + expected = api_repr["externalCatalogDatasetOptions"] + assert result == expected + + def test_external_catalog_dataset_options_to_api_repr(self): + # GIVEN a dataset api_repr including an ExternalCatalogDatasetOptions key + # THEN confirm that the api_repr of that key from a dataset object created + # via the to_api_repr() method matches the value of the key + # used to create the dataset object + + api_repr = self.API_REPR + klass = self._get_target_class() + dataset = klass.from_api_repr(api_repr) + + result = dataset.to_api_repr()["externalCatalogDatasetOptions"] + expected = api_repr["externalCatalogDatasetOptions"] + assert result == expected + + def test_access_policy_version_valid_input(self): + dataset = self._make_one(self.DS_REF) + # Valid inputs for access_policy_version are currently + # ints 1, 2, 3, and None + # We rely upon the BQ backend to validate acceptable integer + # values, rather than perform that validation in the client. 
+ for expected in [1, 2, 3, None]: + # set property using setter and integer + dataset.access_policy_version = expected + + # check getter and _properties dict + assert ( + dataset.access_policy_version == expected + ), f"Expected {expected} but got {dataset.access_policy_version}" + assert dataset._properties["accessPolicyVersion"] == expected + + def test_access_policy_version_invalid_input(self): + dataset = self._make_one(self.DS_REF) + # Valid inputs for access_policy_version are currently + # ints 1, 2, 3, and None + + with pytest.raises(ValueError): + invalid_value = "a string" + dataset.access_policy_version = invalid_value + + with pytest.raises(ValueError): + invalid_value = 42.0 + dataset.access_policy_version = invalid_value + class TestDatasetListItem(unittest.TestCase): @staticmethod @@ -1122,3 +1576,261 @@ def test_table(self): self.assertEqual(table.table_id, "table_id") self.assertEqual(table.dataset_id, dataset_id) self.assertEqual(table.project, project) + + +class TestCondition: + EXPRESSION = 'resource.name.startsWith("projects/my-project/instances/")' + TITLE = "Instance Access" + DESCRIPTION = "Access to instances in my-project" + + @pytest.fixture + def condition_instance(self): + """Provides a Condition instance for tests.""" + return Condition( + expression=self.EXPRESSION, + title=self.TITLE, + description=self.DESCRIPTION, + ) + + @pytest.fixture + def condition_api_repr(self): + """Provides the API representation for the test Condition.""" + return { + "expression": self.EXPRESSION, + "title": self.TITLE, + "description": self.DESCRIPTION, + } + + # --- Basic Functionality Tests --- + + def test_constructor_and_getters_full(self, condition_instance): + """Test initialization with all arguments and subsequent attribute access.""" + assert condition_instance.expression == self.EXPRESSION + assert condition_instance.title == self.TITLE + assert condition_instance.description == self.DESCRIPTION + + def 
test_constructor_and_getters_minimal(self): + """Test initialization with only the required expression.""" + condition = Condition(expression=self.EXPRESSION) + assert condition.expression == self.EXPRESSION + assert condition.title is None + assert condition.description is None + + def test_setters(self, condition_instance): + """Test setting attributes after initialization.""" + new_title = "New Title" + new_desc = "New Description" + new_expr = "request.time < timestamp('2024-01-01T00:00:00Z')" + + condition_instance.title = new_title + assert condition_instance.title == new_title + + condition_instance.description = new_desc + assert condition_instance.description == new_desc + + condition_instance.expression = new_expr + assert condition_instance.expression == new_expr + + # Test setting title and description to empty strings + condition_instance.title = "" + assert condition_instance.title == "" + + condition_instance.description = "" + assert condition_instance.description == "" + + # Test setting optional fields back to None + condition_instance.title = None + assert condition_instance.title is None + condition_instance.description = None + assert condition_instance.description is None + + # --- API Representation Tests --- + + def test_to_api_repr_full(self, condition_instance, condition_api_repr): + """Test converting a fully populated Condition to API representation.""" + api_repr = condition_instance.to_api_repr() + assert api_repr == condition_api_repr + + def test_to_api_repr_minimal(self): + """Test converting a minimally populated Condition to API representation.""" + condition = Condition(expression=self.EXPRESSION) + expected_api_repr = { + "expression": self.EXPRESSION, + "title": None, + "description": None, + } + api_repr = condition.to_api_repr() + assert api_repr == expected_api_repr + + def test_from_api_repr_full(self, condition_api_repr): + """Test creating a Condition from a full API representation.""" + condition = 
Condition.from_api_repr(condition_api_repr) + assert condition.expression == self.EXPRESSION + assert condition.title == self.TITLE + assert condition.description == self.DESCRIPTION + + def test_from_api_repr_minimal(self): + """Test creating a Condition from a minimal API representation.""" + minimal_repr = {"expression": self.EXPRESSION} + condition = Condition.from_api_repr(minimal_repr) + assert condition.expression == self.EXPRESSION + assert condition.title is None + assert condition.description is None + + def test_from_api_repr_with_extra_fields(self): + """Test creating a Condition from an API repr with unexpected fields.""" + api_repr = { + "expression": self.EXPRESSION, + "title": self.TITLE, + "unexpected_field": "some_value", + } + condition = Condition.from_api_repr(api_repr) + assert condition.expression == self.EXPRESSION + assert condition.title == self.TITLE + assert condition.description is None + # Check that the extra field didn't get added to internal properties + assert "unexpected_field" not in condition._properties + + # # --- Validation Tests --- + + @pytest.mark.parametrize( + "kwargs, error_msg", + [ + ({"expression": None}, "Pass a non-empty string for expression"), # type: ignore + ({"expression": ""}, "expression cannot be an empty string"), + ({"expression": 123}, "Pass a non-empty string for expression"), # type: ignore + ({"expression": EXPRESSION, "title": 123}, "Pass a string for title, or None"), # type: ignore + ({"expression": EXPRESSION, "description": False}, "Pass a string for description, or None"), # type: ignore + ], + ) + def test_validation_init(self, kwargs, error_msg): + """Test validation during __init__.""" + with pytest.raises(ValueError, match=error_msg): + Condition(**kwargs) + + @pytest.mark.parametrize( + "attribute, value, error_msg", + [ + ("expression", None, "Pass a non-empty string for expression"), # type: ignore + ("expression", "", "expression cannot be an empty string"), + ("expression", 123, "Pass a 
non-empty string for expression"), # type: ignore + ("title", 123, "Pass a string for title, or None"), # type: ignore + ("description", [], "Pass a string for description, or None"), # type: ignore + ], + ) + def test_validation_setters(self, condition_instance, attribute, value, error_msg): + """Test validation via setters.""" + with pytest.raises(ValueError, match=error_msg): + setattr(condition_instance, attribute, value) + + def test_validation_expression_required_from_api(self): + """Test ValueError is raised if expression is missing in from_api_repr.""" + api_repr = {"title": self.TITLE} + with pytest.raises( + ValueError, match="API representation missing required 'expression' field." + ): + Condition.from_api_repr(api_repr) + + def test___eq___equality(self, condition_1): + result = condition_1 + expected = condition_1 + assert result == expected + + def test___eq___equality_not_condition(self, condition_1): + result = condition_1 + other = "not a condition" + expected = result.__eq__(other) + assert expected is NotImplemented + + def test__ne__not_equality(self): + result = condition_1 + expected = condition_2 + assert result != expected + + def test__hash__function(self, condition_2): + cond1 = Condition( + expression=self.EXPRESSION, title=self.TITLE, description=self.DESCRIPTION + ) + cond2 = cond1 + cond_not_equal = condition_2 + assert cond1 == cond2 + assert cond1 is cond2 + assert hash(cond1) == hash(cond2) + assert hash(cond1) is not None + assert cond_not_equal != cond1 + assert hash(cond_not_equal) != hash(cond1) + + def test__hash__with_minimal_inputs(self): + cond1 = Condition( + expression="example", + title=None, + description=None, + ) + assert hash(cond1) is not None + + def test_access_entry_view_equality(self): + from google.cloud import bigquery + + entry1 = bigquery.dataset.AccessEntry( + entity_type="view", + entity_id={ + "projectId": "my_project", + "datasetId": "my_dataset", + "tableId": "my_table", + }, + ) + entry2 = 
bigquery.dataset.AccessEntry.from_api_repr( + { + "view": { + "projectId": "my_project", + "datasetId": "my_dataset", + "tableId": "my_table", + } + } + ) + + entry3 = bigquery.dataset.AccessEntry( + entity_type="routine", + entity_id={ + "projectId": "my_project", + "datasetId": "my_dataset", + "routineId": "my_routine", + }, + ) + + entry4 = bigquery.dataset.AccessEntry.from_api_repr( + { + "routine": { + "projectId": "my_project", + "datasetId": "my_dataset", + "routineId": "my_routine", + } + } + ) + + entry5 = bigquery.dataset.AccessEntry( + entity_type="dataset", + entity_id={ + "dataset": { + "projectId": "my_project", + "datasetId": "my_dataset", + }, + "target_types": "VIEWS", + }, + ) + + entry6 = bigquery.dataset.AccessEntry.from_api_repr( + { + "dataset": { + "dataset": { + "projectId": "my_project", + "datasetId": "my_dataset", + }, + "target_types": "VIEWS", + } + } + ) + + assert entry1 == entry2 + assert entry3 == entry4 + assert entry5 == entry6 diff --git a/tests/unit/test_dbapi__helpers.py b/tests/unit/test_dbapi__helpers.py index 7e1da0034..9907df97b 100644 --- a/tests/unit/test_dbapi__helpers.py +++ b/tests/unit/test_dbapi__helpers.py @@ -210,6 +210,7 @@ def test_empty_iterable(self): self.assertEqual(list(result), []) def test_non_empty_iterable(self): + pytest.importorskip("numpy") pytest.importorskip("pyarrow") from tests.unit.helpers import _to_pyarrow diff --git a/tests/unit/test_dbapi_cursor.py b/tests/unit/test_dbapi_cursor.py index 6fca4cec0..c5cad8c91 100644 --- a/tests/unit/test_dbapi_cursor.py +++ b/tests/unit/test_dbapi_cursor.py @@ -480,7 +480,11 @@ def fake_ensure_bqstorage_client(bqstorage_client=None, **kwargs): data_format=bigquery_storage.DataFormat.ARROW, ) mock_bqstorage_client.create_read_session.assert_called_once_with( - parent="projects/P", read_session=expected_session, max_stream_count=1 + parent="projects/P", + read_session=expected_session, + max_stream_count=1, + retry=None, + timeout=None, ) # Check the data 
returned. diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py index 9fd16e699..ea827a560 100644 --- a/tests/unit/test_external_config.py +++ b/tests/unit/test_external_config.py @@ -14,14 +14,24 @@ import base64 import copy +from typing import Any, Dict, Optional import unittest from google.cloud.bigquery import external_config from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import SourceColumnMatch + +import pytest class TestExternalConfig(unittest.TestCase): SOURCE_URIS = ["gs://foo", "gs://bar"] + SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME + DATE_FORMAT = "MM/DD/YYYY" + DATETIME_FORMAT = "MM/DD/YYYY HH24:MI:SS" + TIME_ZONE = "America/Los_Angeles" + TIME_FORMAT = "HH24:MI:SS" + TIMESTAMP_FORMAT = "MM/DD/YYYY HH24:MI:SS.FF6 TZR" BASE_RESOURCE = { "sourceFormat": "", @@ -30,6 +40,11 @@ class TestExternalConfig(unittest.TestCase): "autodetect": True, "ignoreUnknownValues": False, "compression": "compression", + "dateFormat": DATE_FORMAT, + "datetimeFormat": DATETIME_FORMAT, + "timeZone": TIME_ZONE, + "timeFormat": TIME_FORMAT, + "timestampFormat": TIMESTAMP_FORMAT, } def test_from_api_repr_base(self): @@ -76,6 +91,12 @@ def test_to_api_repr_base(self): ec.connection_id = "path/to/connection" ec.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")] + ec.date_format = self.DATE_FORMAT + ec.datetime_format = self.DATETIME_FORMAT + ec.time_zone = self.TIME_ZONE + ec.time_format = self.TIME_FORMAT + ec.timestamp_format = self.TIMESTAMP_FORMAT + exp_schema = { "fields": [{"name": "full_name", "type": "STRING", "mode": "REQUIRED"}] } @@ -89,6 +110,11 @@ def test_to_api_repr_base(self): "compression": "compression", "connectionId": "path/to/connection", "schema": exp_schema, + "dateFormat": self.DATE_FORMAT, + "datetimeFormat": self.DATETIME_FORMAT, + "timeZone": self.TIME_ZONE, + "timeFormat": self.TIME_FORMAT, + "timestampFormat": self.TIMESTAMP_FORMAT, } self.assertEqual(got_resource, 
exp_resource) @@ -124,6 +150,11 @@ def _verify_base(self, ec): self.assertEqual(ec.ignore_unknown_values, False) self.assertEqual(ec.max_bad_records, 17) self.assertEqual(ec.source_uris, self.SOURCE_URIS) + self.assertEqual(ec.date_format, self.DATE_FORMAT) + self.assertEqual(ec.datetime_format, self.DATETIME_FORMAT) + self.assertEqual(ec.time_zone, self.TIME_ZONE) + self.assertEqual(ec.time_format, self.TIME_FORMAT) + self.assertEqual(ec.timestamp_format, self.TIMESTAMP_FORMAT) def test_to_api_repr_source_format(self): ec = external_config.ExternalConfig("CSV") @@ -248,6 +279,8 @@ def test_from_api_repr_csv(self): "allowJaggedRows": False, "encoding": "encoding", "preserveAsciiControlCharacters": False, + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, + "nullMarkers": ["", "NA"], }, }, ) @@ -264,6 +297,11 @@ def test_from_api_repr_csv(self): self.assertEqual(ec.options.allow_jagged_rows, False) self.assertEqual(ec.options.encoding, "encoding") self.assertEqual(ec.options.preserve_ascii_control_characters, False) + self.assertEqual( + ec.options.source_column_match, + self.SOURCE_COLUMN_MATCH, + ) + self.assertEqual(ec.options.null_markers, ["", "NA"]) got_resource = ec.to_api_repr() @@ -285,6 +323,9 @@ def test_to_api_repr_csv(self): options.skip_leading_rows = 123 options.allow_jagged_rows = False options.preserve_ascii_control_characters = False + options.source_column_match = self.SOURCE_COLUMN_MATCH + options.null_markers = ["", "NA"] + ec.csv_options = options exp_resource = { @@ -297,6 +338,8 @@ def test_to_api_repr_csv(self): "allowJaggedRows": False, "encoding": "encoding", "preserveAsciiControlCharacters": False, + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, + "nullMarkers": ["", "NA"], }, } @@ -848,7 +891,9 @@ def test_to_api_repr(self): ) -class CSVOptions(unittest.TestCase): +class TestCSVOptions(unittest.TestCase): + SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME + def test_to_api_repr(self): options = external_config.CSVOptions() 
options.field_delimiter = "\t" @@ -858,6 +903,7 @@ def test_to_api_repr(self): options.allow_jagged_rows = False options.encoding = "UTF-8" options.preserve_ascii_control_characters = False + options.source_column_match = self.SOURCE_COLUMN_MATCH resource = options.to_api_repr() @@ -871,9 +917,37 @@ def test_to_api_repr(self): "allowJaggedRows": False, "encoding": "UTF-8", "preserveAsciiControlCharacters": False, + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, }, ) + def test_source_column_match_None(self): + ec = external_config.CSVOptions() + ec.source_column_match = None + expected = None + result = ec.source_column_match + self.assertEqual(expected, result) + + def test_source_column_match_valid_input(self): + ec = external_config.CSVOptions() + ec.source_column_match = SourceColumnMatch.NAME + expected = "NAME" + result = ec.source_column_match + self.assertEqual(expected, result) + + ec.source_column_match = "POSITION" + expected = "POSITION" + result = ec.source_column_match + self.assertEqual(expected, result) + + def test_source_column_match_invalid_input(self): + ec = external_config.CSVOptions() + with self.assertRaisesRegex( + TypeError, + "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None", + ): + ec.source_column_match = 3.14 + class TestGoogleSheetsOptions(unittest.TestCase): def test_to_api_repr(self): @@ -890,3 +964,226 @@ def _copy_and_update(d, u): d = copy.deepcopy(d) d.update(u) return d + + +class TestExternalCatalogDatasetOptions: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.external_config import ExternalCatalogDatasetOptions + + return ExternalCatalogDatasetOptions + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + DEFAULT_STORAGE_LOCATION_URI = "gs://test-bucket/test-path" + PARAMETERS = {"key": "value"} + + @pytest.mark.parametrize( + "default_storage_location_uri,parameters", + [ + (DEFAULT_STORAGE_LOCATION_URI, PARAMETERS), # set all params + 
(DEFAULT_STORAGE_LOCATION_URI, None), # set only one argument at a time + (None, PARAMETERS), + (None, None), # use default parameters + ], + ) + def test_ctor_initialization( + self, + default_storage_location_uri, + parameters, + ): + """Test ExternalCatalogDatasetOptions constructor with explicit values.""" + + instance = self._make_one( + default_storage_location_uri=default_storage_location_uri, + parameters=parameters, + ) + + assert instance.default_storage_location_uri == default_storage_location_uri + assert instance.parameters == parameters + + @pytest.mark.parametrize( + "default_storage_location_uri,parameters", + [ + (123, None), # does not accept integers + (None, 123), + ], + ) + def test_ctor_invalid_input(self, default_storage_location_uri, parameters): + """Test ExternalCatalogDatasetOptions constructor with invalid input.""" + + with pytest.raises(TypeError) as e: + self._make_one( + default_storage_location_uri=default_storage_location_uri, + parameters=parameters, + ) + + # Looking for the first word from the string "Pass as..." + assert "Pass " in str(e.value) + + def test_to_api_repr(self): + """Test ExternalCatalogDatasetOptions.to_api_repr method.""" + + instance = self._make_one( + default_storage_location_uri=self.DEFAULT_STORAGE_LOCATION_URI, + parameters=self.PARAMETERS, + ) + resource = instance.to_api_repr() + assert ( + resource["defaultStorageLocationUri"] == self.DEFAULT_STORAGE_LOCATION_URI + ) + assert resource["parameters"] == self.PARAMETERS + + def test_from_api_repr(self): + """GIVEN an api representation of an ExternalCatalogDatasetOptions object (i.e. api_repr) + WHEN converted into an ExternalCatalogDatasetOptions object using from_api_repr() + THEN it will have the representation in dict format as an ExternalCatalogDatasetOptions + object made directly (via _make_one()) and represented in dict format. 
+ """ + + instance = self._make_one() + api_repr = { + "defaultStorageLocationUri": self.DEFAULT_STORAGE_LOCATION_URI, + "parameters": self.PARAMETERS, + } + result = instance.from_api_repr(api_repr) + + assert isinstance(result, external_config.ExternalCatalogDatasetOptions) + assert result._properties == api_repr + + +class TestExternalCatalogTableOptions: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.external_config import ExternalCatalogTableOptions + + return ExternalCatalogTableOptions + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + storage_descriptor_repr = { + "inputFormat": "testpath.to.OrcInputFormat", + "locationUri": "gs://test/path/", + "outputFormat": "testpath.to.OrcOutputFormat", + "serDeInfo": { + "serializationLibrary": "testpath.to.LazySimpleSerDe", + "name": "serde_lib_name", + "parameters": {"key": "value"}, + }, + } + + CONNECTIONID = "connection123" + PARAMETERS = {"key": "value"} + STORAGEDESCRIPTOR = schema.StorageDescriptor.from_api_repr(storage_descriptor_repr) + EXTERNALCATALOGTABLEOPTIONS = { + "connectionId": "connection123", + "parameters": {"key": "value"}, + "storageDescriptor": STORAGEDESCRIPTOR.to_api_repr(), + } + + @pytest.mark.parametrize( + "connection_id,parameters,storage_descriptor", + [ + ( + CONNECTIONID, + PARAMETERS, + STORAGEDESCRIPTOR, + ), # set all parameters at once + (CONNECTIONID, None, None), # set only one parameter at a time + (None, PARAMETERS, None), + (None, None, STORAGEDESCRIPTOR), # set storage descriptor using obj + (None, None, storage_descriptor_repr), # set storage descriptor using dict + (None, None, None), # use default parameters + ], + ) + def test_ctor_initialization( + self, + connection_id, + parameters, + storage_descriptor, + ): + instance = self._make_one( + connection_id=connection_id, + parameters=parameters, + storage_descriptor=storage_descriptor, + ) + + assert instance.connection_id == connection_id + assert 
instance.parameters == parameters + + if isinstance(storage_descriptor, schema.StorageDescriptor): + assert ( + instance.storage_descriptor.to_api_repr() + == storage_descriptor.to_api_repr() + ) + elif isinstance(storage_descriptor, dict): + assert instance.storage_descriptor.to_api_repr() == storage_descriptor + else: + assert instance.storage_descriptor is None + + @pytest.mark.parametrize( + "connection_id,parameters,storage_descriptor", + [ + pytest.param( + 123, + PARAMETERS, + STORAGEDESCRIPTOR, + id="connection_id-invalid-type", + ), + pytest.param( + CONNECTIONID, + 123, + STORAGEDESCRIPTOR, + id="parameters-invalid-type", + ), + pytest.param( + CONNECTIONID, + PARAMETERS, + 123, + id="storage_descriptor-invalid-type", + ), + ], + ) + def test_ctor_invalid_input( + self, + connection_id: str, + parameters: Dict[str, Any], + storage_descriptor: Optional[schema.StorageDescriptor], + ): + with pytest.raises(TypeError) as e: + external_config.ExternalCatalogTableOptions( + connection_id=connection_id, + parameters=parameters, + storage_descriptor=storage_descriptor, + ) + + # Looking for the first word from the string "Pass as..." 
+ assert "Pass " in str(e.value) + + def test_to_api_repr(self): + instance = self._make_one( + connection_id=self.CONNECTIONID, + parameters=self.PARAMETERS, + storage_descriptor=self.STORAGEDESCRIPTOR, + ) + + result = instance.to_api_repr() + expected = self.EXTERNALCATALOGTABLEOPTIONS + + assert result == expected + + def test_from_api_repr(self): + result = self._make_one( + connection_id=self.CONNECTIONID, + parameters=self.PARAMETERS, + storage_descriptor=self.STORAGEDESCRIPTOR, + ) + + instance = self._make_one() + api_repr = self.EXTERNALCATALOGTABLEOPTIONS + result = instance.from_api_repr(api_repr) + + assert isinstance(result, external_config.ExternalCatalogTableOptions) + assert result._properties == api_repr diff --git a/tests/unit/test_job_retry.py b/tests/unit/test_job_retry.py index 958986052..fa55e8f6a 100644 --- a/tests/unit/test_job_retry.py +++ b/tests/unit/test_job_retry.py @@ -80,7 +80,7 @@ ), ], ) -def test_retry_failed_jobs(sleep, reason, job_retry, result_retry): +def test_retry_failed_jobs(sleep, reason, job_retry, result_retry, global_time_lock): client = make_client() err = dict(reason=reason) conn = client._connection = make_connection( @@ -138,7 +138,7 @@ def test_retry_failed_jobs(sleep, reason, job_retry, result_retry): def test_retry_connection_error_with_default_retries_and_successful_first_job( - monkeypatch, client + monkeypatch, client, global_time_lock ): """ Make sure ConnectionError can be retried at `is_job_done` level, even if @@ -254,7 +254,7 @@ def make_job_id(*args, **kwargs): def test_query_retry_with_default_retry_and_ambiguous_errors_only_retries_with_failed_job( - client, monkeypatch + client, monkeypatch, global_time_lock ): """ Some errors like 'rateLimitExceeded' can be ambiguous. Make sure we only @@ -419,7 +419,7 @@ def make_job_id(*args, **kwargs): # - Pass None retry to `result`. 
@pytest.mark.parametrize("job_retry_on_query", ["Query", "Result"]) @mock.patch("time.sleep") -def test_disable_retry_failed_jobs(sleep, client, job_retry_on_query): +def test_disable_retry_failed_jobs(sleep, client, job_retry_on_query, global_time_lock): """ Test retry of job failures, as opposed to API-invocation failures. """ @@ -450,7 +450,7 @@ def api_request(method, path, query_params=None, data=None, **kw): @mock.patch("time.sleep") -def test_retry_failed_jobs_after_retry_failed(sleep, client): +def test_retry_failed_jobs_after_retry_failed(sleep, client, global_time_lock): """ If at first you don't succeed, maybe you will later. :) """ @@ -508,33 +508,43 @@ def api_request(method, path, query_params=None, data=None, **kw): assert job.job_id != orig_job_id -def test_raises_on_job_retry_on_query_with_non_retryable_jobs(client): +def test_raises_on_job_retry_on_query_with_non_retryable_jobs(client, global_time_lock): with pytest.raises( TypeError, - match=re.escape( + match=( "`job_retry` was provided, but the returned job is" " not retryable, because a custom `job_id` was" " provided." - ), + ).replace(" ", r"\s"), ): client.query("select 42", job_id=42, job_retry=google.api_core.retry.Retry()) -def test_raises_on_job_retry_on_result_with_non_retryable_jobs(client): +def test_raises_on_job_retry_on_result_with_non_retryable_jobs( + client, global_time_lock +): client._connection = make_connection({}) - job = client.query("select 42", job_id=42) + + with pytest.warns( + FutureWarning, + match=re.escape("job_retry must be explicitly set to None if job_id is set."), + ): + # Implicitly providing a job_retry is a warning and will be an error in the future. + job = client.query("select 42", job_id=42) + with pytest.raises( TypeError, - match=re.escape( + match=( "`job_retry` was provided, but this job is" " not retryable, because a custom `job_id` was" " provided to the query that created this job." 
- ), + ).replace(" ", r"\s"), ): + # Explicitly providing a job_retry is an error. job.result(job_retry=google.api_core.retry.Retry()) -def test_query_and_wait_retries_job_for_DDL_queries(): +def test_query_and_wait_retries_job_for_DDL_queries(global_time_lock): """ Specific test for retrying DDL queries with "jobRateLimitExceeded" error: https://github.com/googleapis/python-bigquery/issues/1790 @@ -605,3 +615,80 @@ def test_query_and_wait_retries_job_for_DDL_queries(): _, kwargs = calls[3] assert kwargs["method"] == "POST" assert kwargs["path"] == query_request_path + + +@pytest.mark.parametrize( + "result_retry_param", + [ + pytest.param( + {}, + id="default retry {}", + ), + pytest.param( + { + "retry": google.cloud.bigquery.retry.DEFAULT_RETRY.with_timeout( + timeout=10.0 + ) + }, + id="custom retry object with timeout 10.0", + ), + ], +) +def test_retry_load_job_result(result_retry_param, PROJECT, DS_ID): + from google.cloud.bigquery.dataset import DatasetReference + from google.cloud.bigquery.job.load import LoadJob + import google.cloud.bigquery.retry + + client = make_client() + conn = client._connection = make_connection( + dict( + status=dict(state="RUNNING"), + jobReference={"jobId": "id_1"}, + ), + google.api_core.exceptions.ServiceUnavailable("retry me"), + dict( + status=dict(state="DONE"), + jobReference={"jobId": "id_1"}, + statistics={"load": {"outputRows": 1}}, + ), + ) + + table_ref = DatasetReference(project=PROJECT, dataset_id=DS_ID).table("new_table") + job = LoadJob("id_1", source_uris=None, destination=table_ref, client=client) + + with mock.patch.object( + client, "_call_api", wraps=client._call_api + ) as wrapped_call_api: + result = job.result(**result_retry_param) + + assert job.state == "DONE" + assert result.output_rows == 1 + + # Check that _call_api was called multiple times due to retry + assert wrapped_call_api.call_count > 1 + + # Verify the retry object used in the calls to _call_api + expected_retry = result_retry_param.get( + 
"retry", google.cloud.bigquery.retry.DEFAULT_RETRY + ) + + for call in wrapped_call_api.mock_calls: + name, args, kwargs = call + # The retry object is the first positional argument to _call_api + called_retry = args[0] + + # We only care about the calls made during the job.result() polling + if kwargs.get("method") == "GET" and "jobs/id_1" in kwargs.get("path", ""): + assert called_retry._predicate == expected_retry._predicate + assert called_retry._initial == expected_retry._initial + assert called_retry._maximum == expected_retry._maximum + assert called_retry._multiplier == expected_retry._multiplier + assert called_retry._deadline == expected_retry._deadline + if "retry" in result_retry_param: + # Specifically check the timeout for the custom retry case + assert called_retry._timeout == 10.0 + else: + assert called_retry._timeout == expected_retry._timeout + + # The number of api_request calls should still be 3 + assert conn.api_request.call_count == 3 diff --git a/tests/unit/test_legacy_types.py b/tests/unit/test_legacy_types.py index 809be1855..75f3e77d7 100644 --- a/tests/unit/test_legacy_types.py +++ b/tests/unit/test_legacy_types.py @@ -18,9 +18,9 @@ import warnings try: - import proto # type: ignore + import proto except ImportError: - proto = None + proto = None # type: ignore @pytest.mark.skipif(proto is None, reason="proto is not installed") diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index 73b29df6b..c79e923f8 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -36,6 +36,7 @@ except ImportError: magics = None + bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") IPython = pytest.importorskip("IPython") interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") @@ -479,6 +480,7 @@ def test_bigquery_magic_without_optional_arguments(monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = 
"unit-test-project" query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -830,6 +832,7 @@ def test_bigquery_magic_w_max_results_query_job_results_fails(monkeypatch): assert close_transports.called +@pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_w_table_id_invalid(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) @@ -860,6 +863,7 @@ def test_bigquery_magic_w_table_id_invalid(monkeypatch): assert "Traceback (most recent call last)" not in output +@pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_w_missing_query(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) @@ -982,6 +986,7 @@ def test_bigquery_magic_dryrun_option_sets_job_config(monkeypatch): google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) @@ -1003,6 +1008,7 @@ def test_bigquery_magic_dryrun_option_returns_query_job(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1031,6 +1037,7 @@ def test_bigquery_magic_dryrun_option_variable_error_message( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" ipython_ns_cleanup.append((ip, "q_job")) run_query_patch = mock.patch( @@ -1060,6 +1067,7 @@ def test_bigquery_magic_dryrun_option_saves_query_job_to_variable( magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1094,6 +1102,7 @@ def 
test_bigquery_magic_saves_query_job_to_variable_on_error( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" ipython_ns_cleanup.append((ip, "result")) client_query_patch = mock.patch( @@ -1272,6 +1281,11 @@ def test_bigquery_magic_with_no_query_cache(monkeypatch): bigquery.load_ipython_extension(ip) conn = make_connection() monkeypatch.setattr(magics.context, "_connection", conn) + monkeypatch.setattr( + magics.context, + "credentials", + mock.create_autospec(google.auth.credentials.Credentials, instance=True), + ) monkeypatch.setattr(magics.context, "project", "project-from-context") # --no_query_cache option should override context. @@ -1353,6 +1367,8 @@ def test_bigquery_magic_w_progress_bar_type_w_context_setter(monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1382,6 +1398,8 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + with run_query_patch as run_query_mock: ip.run_cell_magic( "bigquery", "--progress_bar_type=tqdm_gui", "SELECT 17 as num" @@ -1564,6 +1582,8 @@ def test_bigquery_magic_with_string_params(ipython_ns_cleanup, monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1604,6 +1624,8 @@ def test_bigquery_magic_with_dict_params(ipython_ns_cleanup, monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( 
google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1688,6 +1710,7 @@ def test_bigquery_magic_with_option_value_incorrect(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" sql = "SELECT @foo AS foo" @@ -1718,6 +1741,8 @@ def test_bigquery_magic_with_dict_params_negative_value( run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1759,6 +1784,8 @@ def test_bigquery_magic_with_dict_params_array_value(ipython_ns_cleanup, monkeyp run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1800,6 +1827,8 @@ def test_bigquery_magic_with_dict_params_tuple_value(ipython_ns_cleanup, monkeyp run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1851,6 +1880,7 @@ def test_bigquery_magic_valid_query_in_existing_variable( magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" ipython_ns_cleanup.append((ip, "custom_query")) ipython_ns_cleanup.append((ip, "query_results_df")) @@ -1891,6 +1921,7 @@ def test_bigquery_magic_nonexisting_query_variable(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True @@ -1916,7 +1947,7 @@ def 
test_bigquery_magic_empty_query_variable_name(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - + magics.context.project = "unit-test-project" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) @@ -1939,6 +1970,7 @@ def test_bigquery_magic_query_variable_non_string(ipython_ns_cleanup, monkeypatc magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True @@ -1967,9 +1999,14 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch): google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" cell_body = "$123foo" # 123foo is not valid Python identifier - with io.capture_output() as captured_io: + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + + with run_query_patch, io.capture_output() as captured_io: ip.run_cell_magic("bigquery", "", cell_body) # If "$" prefixes a string that is not a Python identifier, we do not treat such diff --git a/tests/unit/test_opentelemetry_tracing.py b/tests/unit/test_opentelemetry_tracing.py index 546cc02bd..57132a1b9 100644 --- a/tests/unit/test_opentelemetry_tracing.py +++ b/tests/unit/test_opentelemetry_tracing.py @@ -42,7 +42,6 @@ TEST_SPAN_ATTRIBUTES = {"foo": "baz"} -@pytest.mark.skipif(opentelemetry is None, reason="Require `opentelemetry`") @pytest.fixture def setup(): importlib.reload(opentelemetry_tracing) diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 40ef080f7..adb43bcd9 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -637,9 +637,9 @@ def test_to_api_repr_w_timestamp_datetime(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_timestamp_micros(self): - 
from google.cloud._helpers import _microseconds_from_datetime + from google.cloud._helpers import _microseconds_from_datetime, UTC - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) seconds = _microseconds_from_datetime(now) / 1.0e6 EXPECTED = { "parameterType": {"type": "TIMESTAMP"}, @@ -650,9 +650,9 @@ def test_to_api_repr_w_timestamp_micros(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_datetime_datetime(self): - from google.cloud._helpers import _datetime_to_rfc3339 + from google.cloud._helpers import _datetime_to_rfc3339, UTC - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) EXPECTED = { "parameterType": {"type": "DATETIME"}, "parameterValue": { @@ -664,9 +664,9 @@ def test_to_api_repr_w_datetime_datetime(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_datetime_string(self): - from google.cloud._helpers import _datetime_to_rfc3339 + from google.cloud._helpers import _datetime_to_rfc3339, UTC - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) now_str = _datetime_to_rfc3339(now) EXPECTED = { "parameterType": {"type": "DATETIME"}, @@ -1047,9 +1047,10 @@ def test_to_api_repr_w_datetime_str(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_datetime_datetime(self): + from google.cloud._helpers import UTC # type: ignore from google.cloud.bigquery._helpers import _RFC3339_MICROS_NO_ZULU - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) now_str = now.strftime(_RFC3339_MICROS_NO_ZULU) EXPECTED = { "parameterType": { @@ -1089,7 +1090,7 @@ def test_to_api_repr_w_timestamp_str(self): def test_to_api_repr_w_timestamp_timestamp(self): from google.cloud._helpers import UTC # type: ignore - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) now = now.astimezone(UTC) now_str = str(now) EXPECTED = { @@ -1999,6 +2000,70 @@ def test_total_bytes_processed_present_string(self): query = 
self._make_one(resource) self.assertEqual(query.total_bytes_processed, 123456) + def test_slot_millis_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.slot_millis) + + def test_slot_millis_present_integer(self): + resource = self._make_resource() + resource["totalSlotMs"] = 123456 + query = self._make_one(resource) + self.assertEqual(query.slot_millis, 123456) + + def test_slot_millis_present_string(self): + resource = self._make_resource() + resource["totalSlotMs"] = "123456" + query = self._make_one(resource) + self.assertEqual(query.slot_millis, 123456) + + def test_created_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.created) + + def test_created_present_integer(self): + resource = self._make_resource() + resource["creationTime"] = 1437767599006 + query = self._make_one(resource) + self.assertEqual(query.created.timestamp() * 1000, 1437767599006) + + def test_created_present_string(self): + resource = self._make_resource() + resource["creationTime"] = "1437767599006" + query = self._make_one(resource) + self.assertEqual(query.created.timestamp() * 1000, 1437767599006) + + def test_started_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.started) + + def test_started_present_integer(self): + resource = self._make_resource() + resource["startTime"] = 1437767599006 + query = self._make_one(resource) + self.assertEqual(query.started.timestamp() * 1000, 1437767599006) + + def test_started_present_string(self): + resource = self._make_resource() + resource["startTime"] = "1437767599006" + query = self._make_one(resource) + self.assertEqual(query.started.timestamp() * 1000, 1437767599006) + + def test_ended_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.ended) + + def test_ended_present_integer(self): + resource = self._make_resource() + resource["endTime"] = 1437767599006 + query = 
self._make_one(resource) + self.assertEqual(query.ended.timestamp() * 1000, 1437767599006) + + def test_ended_present_string(self): + resource = self._make_resource() + resource["endTime"] = "1437767599006" + query = self._make_one(resource) + self.assertEqual(query.ended.timestamp() * 1000, 1437767599006) + def test_num_dml_affected_rows_missing(self): query = self._make_one(self._make_resource()) self.assertIsNone(query.num_dml_affected_rows) diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index b17cd0281..f61b22035 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from google.cloud import bigquery -from google.cloud.bigquery.standard_sql import StandardSqlStructType -from google.cloud.bigquery.schema import PolicyTagList +import copy import unittest from unittest import mock import pytest +from google.cloud import bigquery +from google.cloud.bigquery import enums +from google.cloud.bigquery.standard_sql import StandardSqlStructType +from google.cloud.bigquery import schema +from google.cloud.bigquery.schema import PolicyTagList + class TestSchemaField(unittest.TestCase): @staticmethod @@ -46,6 +50,11 @@ def test_constructor_defaults(self): self.assertEqual(field.fields, ()) self.assertIsNone(field.policy_tags) self.assertIsNone(field.default_value_expression) + self.assertEqual(field.rounding_mode, None) + self.assertEqual(field.foreign_type_definition, None) + self.assertEqual( + field.timestamp_precision, enums.TimestampPrecision.MICROSECOND + ) def test_constructor_explicit(self): FIELD_DEFAULT_VALUE_EXPRESSION = "This is the default value for this field" @@ -61,6 +70,9 @@ def test_constructor_explicit(self): ) ), default_value_expression=FIELD_DEFAULT_VALUE_EXPRESSION, + rounding_mode=enums.RoundingMode.ROUNDING_MODE_UNSPECIFIED, + foreign_type_definition="INTEGER", + 
timestamp_precision=enums.TimestampPrecision.PICOSECOND, ) self.assertEqual(field.name, "test") self.assertEqual(field.field_type, "STRING") @@ -77,6 +89,12 @@ def test_constructor_explicit(self): ) ), ) + self.assertEqual(field.rounding_mode, "ROUNDING_MODE_UNSPECIFIED") + self.assertEqual(field.foreign_type_definition, "INTEGER") + self.assertEqual( + field.timestamp_precision, + enums.TimestampPrecision.PICOSECOND, + ) def test_constructor_explicit_none(self): field = self._make_one("test", "STRING", description=None, policy_tags=None) @@ -128,16 +146,22 @@ def test_constructor_range_str(self): self.assertEqual(field.range_element_type.element_type, "DATETIME") def test_to_api_repr(self): - from google.cloud.bigquery.schema import PolicyTagList - policy = PolicyTagList(names=("foo", "bar")) self.assertEqual( policy.to_api_repr(), {"names": ["foo", "bar"]}, ) + ROUNDINGMODE = enums.RoundingMode.ROUNDING_MODE_UNSPECIFIED + field = self._make_one( - "foo", "INTEGER", "NULLABLE", description="hello world", policy_tags=policy + "foo", + "INTEGER", + "NULLABLE", + description="hello world", + policy_tags=policy, + rounding_mode=ROUNDINGMODE, + foreign_type_definition=None, ) self.assertEqual( field.to_api_repr(), @@ -147,6 +171,7 @@ def test_to_api_repr(self): "type": "INTEGER", "description": "hello world", "policyTags": {"names": ["foo", "bar"]}, + "roundingMode": "ROUNDING_MODE_UNSPECIFIED", }, ) @@ -172,6 +197,23 @@ def test_to_api_repr_with_subfield(self): }, ) + def test_to_api_repr_w_timestamp_precision(self): + field = self._make_one( + "foo", + "TIMESTAMP", + "NULLABLE", + timestamp_precision=enums.TimestampPrecision.PICOSECOND, + ) + self.assertEqual( + field.to_api_repr(), + { + "mode": "NULLABLE", + "name": "foo", + "type": "TIMESTAMP", + "timestampPrecision": 12, + }, + ) + def test_from_api_repr(self): field = self._get_target_class().from_api_repr( { @@ -180,6 +222,8 @@ def test_from_api_repr(self): "description": "test_description", "name": "foo", 
"type": "record", + "roundingMode": "ROUNDING_MODE_UNSPECIFIED", + "timestampPrecision": 12, } ) self.assertEqual(field.name, "foo") @@ -191,6 +235,11 @@ def test_from_api_repr(self): self.assertEqual(field.fields[0].field_type, "INTEGER") self.assertEqual(field.fields[0].mode, "NULLABLE") self.assertEqual(field.range_element_type, None) + self.assertEqual(field.rounding_mode, "ROUNDING_MODE_UNSPECIFIED") + self.assertEqual( + field.timestamp_precision, + enums.TimestampPrecision.PICOSECOND, + ) def test_from_api_repr_policy(self): field = self._get_target_class().from_api_repr( @@ -245,6 +294,17 @@ def test_from_api_repr_defaults(self): self.assertNotIn("policyTags", field._properties) self.assertNotIn("rangeElementType", field._properties) + def test_from_api_repr_timestamp_precision_str(self): + # The backend would return timestampPrecision field as a string, even + # if we send over an integer. This test verifies we manually converted + # it into integer to ensure resending could succeed. 
+ field = self._get_target_class().from_api_repr( + { + "timestampPrecision": "12", + } + ) + self.assertEqual(field._properties["timestampPrecision"], 12) + def test_name_property(self): name = "lemon-ness" schema_field = self._make_one(name, "INTEGER") @@ -282,6 +342,44 @@ def test_fields_property(self): schema_field = self._make_one("boat", "RECORD", fields=fields) self.assertEqual(schema_field.fields, fields) + def test_roundingmode_property_str(self): + ROUNDINGMODE = "ROUND_HALF_AWAY_FROM_ZERO" + schema_field = self._make_one("test", "STRING", rounding_mode=ROUNDINGMODE) + self.assertEqual(schema_field.rounding_mode, ROUNDINGMODE) + + del schema_field + schema_field = self._make_one("test", "STRING") + schema_field._properties["roundingMode"] = ROUNDINGMODE + self.assertEqual(schema_field.rounding_mode, ROUNDINGMODE) + + def test_foreign_type_definition_property_str(self): + FOREIGN_TYPE_DEFINITION = "INTEGER" + schema_field = self._make_one( + "test", "STRING", foreign_type_definition=FOREIGN_TYPE_DEFINITION + ) + self.assertEqual(schema_field.foreign_type_definition, FOREIGN_TYPE_DEFINITION) + + del schema_field + schema_field = self._make_one("test", "STRING") + schema_field._properties["foreignTypeDefinition"] = FOREIGN_TYPE_DEFINITION + self.assertEqual(schema_field.foreign_type_definition, FOREIGN_TYPE_DEFINITION) + + def test_timestamp_precision_unsupported_type(self): + with pytest.raises(ValueError) as e: + self._make_one("test", "TIMESTAMP", timestamp_precision=12) + + assert "timestamp_precision must be class enums.TimestampPrecision" in str( + e.value + ) + + def test_timestamp_precision_property(self): + TIMESTAMP_PRECISION = enums.TimestampPrecision.PICOSECOND + schema_field = self._make_one("test", "TIMESTAMP") + schema_field._properties[ + "timestampPrecision" + ] = enums.TimestampPrecision.PICOSECOND.value + self.assertEqual(schema_field.timestamp_precision, TIMESTAMP_PRECISION) + def test_to_standard_sql_simple_type(self): examples = ( # a 
few legacy types @@ -456,6 +554,20 @@ def test_to_standard_sql_unknown_type(self): bigquery.StandardSqlTypeNames.TYPE_KIND_UNSPECIFIED, ) + def test_to_standard_sql_foreign_type_valid(self): + legacy_type = "FOREIGN" + standard_type = bigquery.StandardSqlTypeNames.FOREIGN + foreign_type_definition = "INTEGER" + + field = self._make_one( + "some_field", + field_type=legacy_type, + foreign_type_definition=foreign_type_definition, + ) + standard_field = field.to_standard_sql() + self.assertEqual(standard_field.name, "some_field") + self.assertEqual(standard_field.type.type_kind, standard_type) + def test___eq___wrong_type(self): field = self._make_one("test", "STRING") other = object() @@ -582,12 +694,9 @@ def test___hash__not_equals(self): def test___repr__(self): field1 = self._make_one("field1", "STRING") - expected = "SchemaField('field1', 'STRING', 'NULLABLE', None, None, (), None)" - self.assertEqual(repr(field1), expected) - - def test___repr__type_not_set(self): - field1 = self._make_one("field1", field_type=None) - expected = "SchemaField('field1', None, 'NULLABLE', None, None, (), None)" + expected = ( + "SchemaField('field1', 'STRING', 'NULLABLE', None, None, (), None, None)" + ) self.assertEqual(repr(field1), expected) def test___repr__evaluable_no_policy_tags(self): @@ -710,27 +819,62 @@ def test__parse_schema_resource_fields_without_mode(self): self._verifySchema(schema, RESOURCE) -class Test_build_schema_resource(unittest.TestCase, _SchemaBase): +class Test_build_schema_resource: + """Tests for the _build_schema_resource function.""" + def _call_fut(self, resource): - from google.cloud.bigquery.schema import _build_schema_resource + return schema._build_schema_resource(resource) + + FULL_NAME = schema.SchemaField( + name="full_name", field_type="STRING", mode="REQUIRED" + ) + AGE = schema.SchemaField(name="age", field_type="INTEGER", mode="REQUIRED") + LIST_RESOURCE = [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", 
"type": "INTEGER", "mode": "REQUIRED"}, + ] + FOREIGN_TYPE_INFO = schema.ForeignTypeInfo(type_system="TYPE_SYSTEM_UNSPECIFIED") + FOREIGN_TYPE_INFO_RESOURCE = {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"} + + @pytest.mark.parametrize( + "schema,expected", + [ + pytest.param([], [], id="empty list"), + pytest.param([FULL_NAME, AGE], LIST_RESOURCE, id="list"), + ], + ) + def test_ctor_valid_input(self, schema, expected): + result = self._call_fut(schema) + + assert result == expected - return _build_schema_resource(resource) + @pytest.mark.parametrize( + "schema,expected", + [ + pytest.param(123, TypeError, id="invalid type"), + ], + ) + def test_ctor_invalid_input(self, schema, expected): + with pytest.raises(TypeError) as e: + self._call_fut(schema) + + # Looking for the first phrase from the string "Schema must be a ..." + assert "Schema must be a " in str(e.value) def test_defaults(self): from google.cloud.bigquery.schema import SchemaField full_name = SchemaField("full_name", "STRING", mode="REQUIRED") age = SchemaField("age", "INTEGER", mode="REQUIRED") + # test with simple list resource = self._call_fut([full_name, age]) - self.assertEqual(len(resource), 2) - self.assertEqual( - resource[0], - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - ) - self.assertEqual( - resource[1], - {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, - ) + assert len(resource) == 2 + assert resource[0] == { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + } + assert resource[1] == {"name": "age", "type": "INTEGER", "mode": "REQUIRED"} def test_w_description(self): from google.cloud.bigquery.schema import SchemaField @@ -747,25 +891,20 @@ def test_w_description(self): description=None, ) resource = self._call_fut([full_name, age]) - self.assertEqual(len(resource), 2) - self.assertEqual( - resource[0], - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": DESCRIPTION, - }, - ) - self.assertEqual( - resource[1], - { - 
"name": "age", - "type": "INTEGER", - "mode": "REQUIRED", - "description": None, - }, - ) + assert len(resource) == 2 + assert resource[0] == { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + "description": DESCRIPTION, + } + + assert resource[1] == { + "name": "age", + "type": "INTEGER", + "mode": "REQUIRED", + "description": None, + } def test_w_subfields(self): from google.cloud.bigquery.schema import SchemaField @@ -777,57 +916,99 @@ def test_w_subfields(self): "phone", "RECORD", mode="REPEATED", fields=[ph_type, ph_num] ) resource = self._call_fut([full_name, phone]) - self.assertEqual(len(resource), 2) - self.assertEqual( - resource[0], - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - ) - self.assertEqual( - resource[1], - { - "name": "phone", - "type": "RECORD", - "mode": "REPEATED", - "fields": [ - {"name": "type", "type": "STRING", "mode": "REQUIRED"}, - {"name": "number", "type": "STRING", "mode": "REQUIRED"}, - ], - }, - ) + assert len(resource) == 2 + assert resource[0] == { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + } + assert resource[1] == { + "name": "phone", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + {"name": "type", "type": "STRING", "mode": "REQUIRED"}, + {"name": "number", "type": "STRING", "mode": "REQUIRED"}, + ], + } -class Test_to_schema_fields(unittest.TestCase): +class Test_to_schema_fields: + """Tests for the _to_schema_fields function.""" + @staticmethod def _call_fut(schema): from google.cloud.bigquery.schema import _to_schema_fields return _to_schema_fields(schema) - def test_invalid_type(self): - schema = [ - ("full_name", "STRING", "REQUIRED"), - ("address", "STRING", "REQUIRED"), - ] - with self.assertRaises(ValueError): - self._call_fut(schema) + FULL_NAME = schema.SchemaField( + name="full_name", field_type="STRING", mode="REQUIRED" + ) + AGE = schema.SchemaField(name="age", field_type="INTEGER", mode="REQUIRED") + LIST_RESOURCE = [ + {"name": "full_name", 
"type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, + ] + FOREIGN_TYPE_INFO = schema.ForeignTypeInfo(type_system="TYPE_SYSTEM_UNSPECIFIED") + FOREIGN_TYPE_INFO_RESOURCE = {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"} + + @pytest.mark.parametrize( + "schema,expected", + [ + pytest.param([], [], id="empty list"), + pytest.param((), [], id="empty tuple"), + pytest.param(LIST_RESOURCE, [FULL_NAME, AGE], id="list"), + ], + ) + def test_ctor_valid_input(self, schema, expected): + result = self._call_fut(schema) - def test_schema_fields_sequence(self): - from google.cloud.bigquery.schema import SchemaField + assert result == expected + + @pytest.mark.parametrize( + "schema,expected", + [ + pytest.param(123, TypeError, id="invalid schema type"), + pytest.param([123, 123], TypeError, id="invalid SchemaField type"), + pytest.param({"fields": 123}, TypeError, id="invalid type, dict"), + pytest.param( + {"fields": 123, "foreignTypeInfo": 123}, + TypeError, + id="invalid type, dict", + ), + ], + ) + def test_ctor_invalid_input(self, schema, expected): + with pytest.raises(expected): + self._call_fut(schema) + def test_unknown_properties(self): schema = [ - SchemaField("full_name", "STRING", mode="REQUIRED"), - SchemaField("age", "INT64", mode="NULLABLE"), + { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + "someNewProperty": "test-value", + }, + { + "name": "age", + # Note: This type should be included, too. Avoid client-side + # validation, as it could prevent backwards-compatible + # evolution of the server-side behavior. + "typo": "INTEGER", + "mode": "REQUIRED", + "anotherNewProperty": "another-test", + }, ] + + # Make sure the setter doesn't mutate schema. 
+ expected_schema = copy.deepcopy(schema) + result = self._call_fut(schema) - self.assertEqual(result, schema) - def test_invalid_mapping_representation(self): - schema = [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "address", "typeooo": "STRING", "mode": "REQUIRED"}, - ] - with self.assertRaises(Exception): - self._call_fut(schema) + for api_repr, field in zip(expected_schema, result): + assert field.to_api_repr() == api_repr def test_valid_mapping_representation(self): from google.cloud.bigquery.schema import SchemaField @@ -859,14 +1040,12 @@ def test_valid_mapping_representation(self): ] result = self._call_fut(schema) - self.assertEqual(result, expected_schema) + assert result == expected_schema class TestPolicyTags(unittest.TestCase): @staticmethod def _get_target_class(): - from google.cloud.bigquery.schema import PolicyTagList - return PolicyTagList def _make_one(self, *args, **kw): @@ -1108,3 +1287,285 @@ def test_to_api_repr_parameterized(field, api): from google.cloud.bigquery.schema import SchemaField assert SchemaField(**field).to_api_repr() == api + + +class TestForeignTypeInfo: + """Tests for ForeignTypeInfo objects.""" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.schema import ForeignTypeInfo + + return ForeignTypeInfo + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + @pytest.mark.parametrize( + "type_system,expected", + [ + (None, None), + ("TYPE_SYSTEM_UNSPECIFIED", "TYPE_SYSTEM_UNSPECIFIED"), + ("HIVE", "HIVE"), + ], + ) + def test_ctor_valid_input(self, type_system, expected): + result = self._make_one(type_system=type_system) + + assert result.type_system == expected + + def test_ctor_invalid_input(self): + with pytest.raises(TypeError) as e: + self._make_one(type_system=123) + + # Looking for the first word from the string "Pass as..." 
+ assert "Pass " in str(e.value) + + @pytest.mark.parametrize( + "type_system,expected", + [ + ("TYPE_SYSTEM_UNSPECIFIED", {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"}), + ("HIVE", {"typeSystem": "HIVE"}), + (None, {"typeSystem": None}), + ], + ) + def test_to_api_repr(self, type_system, expected): + result = self._make_one(type_system=type_system) + + assert result.to_api_repr() == expected + + def test_from_api_repr(self): + """GIVEN an api representation of a ForeignTypeInfo object (i.e. api_repr) + WHEN converted into a ForeignTypeInfo object using from_api_repr() + THEN it will have the same representation in dict format as a ForeignTypeInfo + object made directly (via _make_one()) and represented in dict format. + """ + api_repr = { + "typeSystem": "TYPE_SYSTEM_UNSPECIFIED", + } + + expected = self._make_one( + type_system="TYPE_SYSTEM_UNSPECIFIED", + ) + + klass = self._get_target_class() + result = klass.from_api_repr(api_repr) + + # We convert both to dict format because these classes do not have a + # __eq__() method to facilitate direct equality comparisons. 
+ assert result.to_api_repr() == expected.to_api_repr() + + +class TestSerDeInfo: + """Tests for the SerDeInfo class.""" + + @staticmethod + def _get_target_class(): + return schema.SerDeInfo + + def _make_one(self, *args, **kwargs): + return self._get_target_class()(*args, **kwargs) + + @pytest.mark.parametrize( + "serialization_library,name,parameters", + [ + ("testpath.to.LazySimpleSerDe", None, None), + ("testpath.to.LazySimpleSerDe", "serde_name", None), + ("testpath.to.LazySimpleSerDe", None, {"key": "value"}), + ("testpath.to.LazySimpleSerDe", "serde_name", {"key": "value"}), + ], + ) + def test_ctor_valid_input(self, serialization_library, name, parameters): + serde_info = self._make_one( + serialization_library=serialization_library, + name=name, + parameters=parameters, + ) + assert serde_info.serialization_library == serialization_library + assert serde_info.name == name + assert serde_info.parameters == parameters + + @pytest.mark.parametrize( + "serialization_library,name,parameters", + [ + (123, None, None), + ("testpath.to.LazySimpleSerDe", 123, None), + ("testpath.to.LazySimpleSerDe", None, ["test", "list"]), + ("testpath.to.LazySimpleSerDe", None, 123), + ], + ) + def test_ctor_invalid_input(self, serialization_library, name, parameters): + with pytest.raises(TypeError) as e: + self._make_one( + serialization_library=serialization_library, + name=name, + parameters=parameters, + ) + # Looking for the first word from the string "Pass as..." + assert "Pass " in str(e.value) + + def test_to_api_repr(self): + serde_info = self._make_one( + serialization_library="testpath.to.LazySimpleSerDe", + name="serde_name", + parameters={"key": "value"}, + ) + expected_repr = { + "serializationLibrary": "testpath.to.LazySimpleSerDe", + "name": "serde_name", + "parameters": {"key": "value"}, + } + assert serde_info.to_api_repr() == expected_repr + + def test_from_api_repr(self): + """GIVEN an api representation of a SerDeInfo object (i.e. 
api_repr) + WHEN converted into a SerDeInfo object using from_api_repr() + THEN it will have the same representation in dict format as a SerDeInfo + object made directly (via _make_one()) and represented in dict format. + """ + api_repr = { + "serializationLibrary": "testpath.to.LazySimpleSerDe", + "name": "serde_name", + "parameters": {"key": "value"}, + } + + expected = self._make_one( + serialization_library="testpath.to.LazySimpleSerDe", + name="serde_name", + parameters={"key": "value"}, + ) + + klass = self._get_target_class() + result = klass.from_api_repr(api_repr) + + # We convert both to dict format because these classes do not have a + # __eq__() method to facilitate direct equality comparisons. + assert result.to_api_repr() == expected.to_api_repr() + + +class TestStorageDescriptor: + """Tests for the StorageDescriptor class.""" + + @staticmethod + def _get_target_class(): + return schema.StorageDescriptor + + def _make_one(self, *args, **kwargs): + return self._get_target_class()(*args, **kwargs) + + serdeinfo_resource = { + "serialization_library": "testpath.to.LazySimpleSerDe", + "name": "serde_lib_name", + "parameters": {"key": "value"}, + } + + SERDEINFO = schema.SerDeInfo("PLACEHOLDER").from_api_repr(serdeinfo_resource) + + STORAGEDESCRIPTOR = { + "inputFormat": "testpath.to.OrcInputFormat", + "locationUri": "gs://test/path/", + "outputFormat": "testpath.to.OrcOutputFormat", + "serDeInfo": SERDEINFO.to_api_repr(), + } + + @pytest.mark.parametrize( + "input_format,location_uri,output_format,serde_info", + [ + (None, None, None, None), + ("testpath.to.OrcInputFormat", None, None, None), + (None, "gs://test/path/", None, None), + (None, None, "testpath.to.OrcOutputFormat", None), + (None, None, None, SERDEINFO), + ( + "testpath.to.OrcInputFormat", + "gs://test/path/", + "testpath.to.OrcOutputFormat", + SERDEINFO, # uses SERDEINFO class format + ), + ( + "testpath.to.OrcInputFormat", + "gs://test/path/", + "testpath.to.OrcOutputFormat", + 
serdeinfo_resource, # uses api resource format (dict) + ), + ], + ) + def test_ctor_valid_input( + self, input_format, location_uri, output_format, serde_info + ): + storage_descriptor = self._make_one( + input_format=input_format, + location_uri=location_uri, + output_format=output_format, + serde_info=serde_info, + ) + assert storage_descriptor.input_format == input_format + assert storage_descriptor.location_uri == location_uri + assert storage_descriptor.output_format == output_format + if isinstance(serde_info, schema.SerDeInfo): + assert ( + storage_descriptor.serde_info.to_api_repr() == serde_info.to_api_repr() + ) + elif isinstance(serde_info, dict): + assert storage_descriptor.serde_info.to_api_repr() == serde_info + else: + assert storage_descriptor.serde_info is None + + @pytest.mark.parametrize( + "input_format,location_uri,output_format,serde_info", + [ + (123, None, None, None), + (None, 123, None, None), + (None, None, 123, None), + (None, None, None, 123), + ], + ) + def test_ctor_invalid_input( + self, input_format, location_uri, output_format, serde_info + ): + with pytest.raises(TypeError) as e: + self._make_one( + input_format=input_format, + location_uri=location_uri, + output_format=output_format, + serde_info=serde_info, + ) + + # Looking for the first word from the string "Pass as..." + assert "Pass " in str(e.value) + + def test_to_api_repr(self): + storage_descriptor = self._make_one( + input_format="input_format", + location_uri="location_uri", + output_format="output_format", + serde_info=self.SERDEINFO, + ) + expected_repr = { + "inputFormat": "input_format", + "locationUri": "location_uri", + "outputFormat": "output_format", + "serDeInfo": self.SERDEINFO.to_api_repr(), + } + assert storage_descriptor.to_api_repr() == expected_repr + + def test_from_api_repr(self): + """GIVEN an api representation of a StorageDescriptor (i.e. 
STORAGEDESCRIPTOR) + WHEN converted into a StorageDescriptor using from_api_repr() and + displayed as a dict + THEN it will have the same representation a StorageDescriptor created + directly (via the _make_one() func) and displayed as a dict. + """ + + # generate via STORAGEDESCRIPTOR + resource = self.STORAGEDESCRIPTOR + result = self._get_target_class().from_api_repr(resource) + # result = klass.from_api_repr(resource) + + expected = self._make_one( + input_format="testpath.to.OrcInputFormat", + location_uri="gs://test/path/", + output_format="testpath.to.OrcOutputFormat", + serde_info=self.SERDEINFO, + ) + assert result.to_api_repr() == expected.to_api_repr() diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index d6febcfb1..a8397247d 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import datetime import logging import re -from sys import version_info import time import types import unittest @@ -29,6 +29,9 @@ from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery import exceptions +from google.cloud.bigquery import external_config +from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import DefaultPandasDTypes from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.dataset import DatasetReference @@ -392,7 +395,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.ETAG = "ETAG" self.TABLE_FULL_ID = "%s:%s.%s" % (self.PROJECT, self.DS_ID, self.TABLE_NAME) self.RESOURCE_URL = "http://example.com/path/to/resource" @@ -433,6 +438,12 @@ def _make_resource(self): "sourceFormat": "CSV", "csvOptions": 
{"allowJaggedRows": True, "encoding": "encoding"}, }, + "biglakeConfiguration": { + "connectionId": "connection", + "storageUri": "uri", + "fileFormat": "PARQUET", + "tableFormat": "ICEBERG", + }, "labels": {"x": "y"}, } @@ -519,6 +530,15 @@ def _verifyResourceProperties(self, table, resource): else: self.assertIsNone(table.encryption_configuration) + if "biglakeConfiguration" in resource: + self.assertIsNotNone(table.biglake_configuration) + self.assertEqual(table.biglake_configuration.connection_id, "connection") + self.assertEqual(table.biglake_configuration.storage_uri, "uri") + self.assertEqual(table.biglake_configuration.file_format, "PARQUET") + self.assertEqual(table.biglake_configuration.table_format, "ICEBERG") + else: + self.assertIsNone(table.biglake_configuration) + def test_ctor(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) table_ref = dataset.table(self.TABLE_NAME) @@ -697,7 +717,7 @@ def test_schema_setter_invalid_field(self): table_ref = dataset.table(self.TABLE_NAME) table = self._make_one(table_ref) full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): table.schema = [full_name, object()] def test_schema_setter_valid_fields(self): @@ -711,14 +731,35 @@ def test_schema_setter_valid_fields(self): table.schema = [full_name, age] self.assertEqual(table.schema, [full_name, age]) - def test_schema_setter_invalid_mapping_representation(self): + def test_schema_setter_allows_unknown_properties(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) table_ref = dataset.table(self.TABLE_NAME) table = self._make_one(table_ref) - full_name = {"name": "full_name", "type": "STRING", "mode": "REQUIRED"} - invalid_field = {"name": "full_name", "typeooo": "STRING", "mode": "REQUIRED"} - with self.assertRaises(Exception): - table.schema = [full_name, invalid_field] + schema = [ + { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + 
"someNewProperty": "test-value", + }, + { + "name": "age", + # Note: This type should be included, too. Avoid client-side + # validation, as it could prevent backwards-compatible + # evolution of the server-side behavior. + "typo": "INTEGER", + "mode": "REQUIRED", + "anotherNewProperty": "another-test", + }, + ] + + # Make sure the setter doesn't mutate schema. + expected_schema = copy.deepcopy(schema) + + table.schema = schema + + # _properties should include all fields, including unknown ones. + assert table._properties["schema"]["fields"] == expected_schema def test_schema_setter_valid_mapping_representation(self): from google.cloud.bigquery.schema import SchemaField @@ -870,6 +911,212 @@ def test_table_constraints_property_getter(self): assert isinstance(table_constraints, TableConstraints) assert table_constraints.primary_key == PrimaryKey(columns=["id"]) + def test_biglake_configuration_not_set(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + assert table.biglake_configuration is None + + def test_biglake_configuration_set(self): + from google.cloud.bigquery.table import BigLakeConfiguration + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + table._properties["biglakeConfiguration"] = { + "connectionId": "connection", + "storageUri": "uri", + "fileFormat": "PARQUET", + "tableFormat": "ICEBERG", + } + + config = table.biglake_configuration + + assert isinstance(config, BigLakeConfiguration) + assert config.connection_id == "connection" + assert config.storage_uri == "uri" + assert config.file_format == "PARQUET" + assert config.table_format == "ICEBERG" + + def test_biglake_configuration_property_setter(self): + from google.cloud.bigquery.table import BigLakeConfiguration + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + 
table = self._make_one(table_ref) + + config = BigLakeConfiguration( + connection_id="connection", + storage_uri="uri", + file_format="PARQUET", + table_format="ICEBERG", + ) + table.biglake_configuration = config + + assert table._properties["biglakeConfiguration"] == { + "connectionId": "connection", + "storageUri": "uri", + "fileFormat": "PARQUET", + "tableFormat": "ICEBERG", + } + + table.biglake_configuration = None + assert table.biglake_configuration is None + + def test_table_constraints_property_setter(self): + from google.cloud.bigquery.table import ( + ColumnReference, + ForeignKey, + PrimaryKey, + TableConstraints, + TableReference, + ) + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + primary_key = PrimaryKey(columns=["id"]) + foreign_keys = [ + ForeignKey( + name="fk_name", + referenced_table=TableReference.from_string( + "my_project.my_dataset.table" + ), + column_references=[ + ColumnReference( + referenced_column="product_id", referencing_column="id" + ) + ], + ) + ] + table_constraints = TableConstraints( + primary_key=primary_key, foreign_keys=foreign_keys + ) + table.table_constraints = table_constraints + + assert table._properties["tableConstraints"] == { + "primaryKey": {"columns": ["id"]}, + "foreignKeys": [ + { + "name": "fk_name", + "referencedTable": { + "projectId": "my_project", + "datasetId": "my_dataset", + "tableId": "table", + }, + "columnReferences": [ + {"referencedColumn": "product_id", "referencingColumn": "id"} + ], + } + ], + } + + def test_table_constraints_property_setter_empty_value(self): + from google.cloud.bigquery.table import TableConstraints + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + table.table_constraints = TableConstraints(primary_key=None, foreign_keys=None) + assert table._properties["tableConstraints"] == {} + + def 
test_table_constraints_property_setter_invalid_value(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + with pytest.raises( + ValueError, + match="value must be google.cloud.bigquery.table.TableConstraints or None", + ): + table.table_constraints = "invalid_value" + + def test_table_constraints_property_setter_none_value(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + table.table_constraints = None + assert table._properties["tableConstraints"] is None + + def test_table_constraints_property_setter_only_primary_key_set(self): + from google.cloud.bigquery.table import PrimaryKey, TableConstraints + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + primary_key = PrimaryKey(columns=["id"]) + + table_constraints = TableConstraints(primary_key=primary_key, foreign_keys=None) + table.table_constraints = table_constraints + + assert table._properties["tableConstraints"] == { + "primaryKey": {"columns": ["id"]} + } + + def test_table_constraints_property_setter_only_foriegn_keys(self): + from google.cloud.bigquery.table import ( + ColumnReference, + ForeignKey, + TableConstraints, + TableReference, + ) + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + foreign_keys = [ + ForeignKey( + name="fk_name", + referenced_table=TableReference.from_string( + "my_project.my_dataset.table" + ), + column_references=[ + ColumnReference( + referenced_column="product_id", referencing_column="id" + ) + ], + ) + ] + table_constraints = TableConstraints( + primary_key=None, foreign_keys=foreign_keys + ) + table.table_constraints = table_constraints + + assert table._properties["tableConstraints"] == { + "foreignKeys": [ + 
{ + "name": "fk_name", + "referencedTable": { + "projectId": "my_project", + "datasetId": "my_dataset", + "tableId": "table", + }, + "columnReferences": [ + {"referencedColumn": "product_id", "referencingColumn": "id"} + ], + } + ] + } + + def test_table_constraints_property_setter_empty_constraints(self): + from google.cloud.bigquery.table import TableConstraints + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + table_constraints = TableConstraints(primary_key=None, foreign_keys=None) + table.table_constraints = table_constraints + + assert table._properties["tableConstraints"] == {} + def test_description_setter_bad_value(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) table_ref = dataset.table(self.TABLE_NAME) @@ -1050,6 +1297,16 @@ def test_mview_refresh_interval(self): table.mview_refresh_interval = None self.assertIsNone(table.mview_refresh_interval) + def test_mview_allow_non_incremental_definition(self): + table = self._make_one() + self.assertIsNone(table.mview_allow_non_incremental_definition) + table.mview_allow_non_incremental_definition = True + self.assertTrue(table.mview_allow_non_incremental_definition) + table.mview_allow_non_incremental_definition = False + self.assertFalse(table.mview_allow_non_incremental_definition) + table.mview_allow_non_incremental_definition = None + self.assertIsNone(table.mview_allow_non_incremental_definition) + def test_from_string(self): cls = self._get_target_class() got = cls.from_string("string-project.string_dataset.string_table") @@ -1180,6 +1437,83 @@ def test_to_api_repr_w_unsetting_expiration(self): } self.assertEqual(resource, exp_resource) + def test_to_api_repr_w_schema_and_foreign_type_info(self): + """Tests to ensure that to_api_repr works correctly with + both schema and foreign_type_info fields + """ + + PROJECT = "test-project" + DATASET_ID = "test_dataset" + TABLE_ID = "coffee_table" + 
FOREIGNTYPEINFO = { + "typeSystem": "TYPE_SYSTEM_UNSPECIFIED", + } + SCHEMA = { + "fields": [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, + ], + "foreignTypeInfo": FOREIGNTYPEINFO, + } + + API_REPR = { + "tableReference": { + "projectId": PROJECT, + "datasetId": DATASET_ID, + "tableId": TABLE_ID, + }, + "schema": SCHEMA, + } + + table = self._get_target_class().from_api_repr(API_REPR) + assert table._properties == table.to_api_repr() + + # update schema (i.e. the fields), ensure foreign_type_info is unchanged + table.schema = [] + expected = { + "fields": [], + "foreignTypeInfo": {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"}, + } + assert table.to_api_repr()["schema"] == expected + + # update foreign_type_info, ensure schema (i.e. the fields), is unchanged + table.foreign_type_info = {"typeSystem": "SCHEMA_SHOULD_NOT_CHANGE"} + expected = { + "fields": [], + "foreignTypeInfo": {"typeSystem": "SCHEMA_SHOULD_NOT_CHANGE"}, + } + assert table.to_api_repr()["schema"] == expected + + def test_from_api_repr_w_schema_and_foreign_type_info(self): + """Tests to ensure that to_api_repr works correctly with + both schema and foreign_type_info fields + """ + + PROJECT = "test-project" + DATASET_ID = "test_dataset" + TABLE_ID = "coffee_table" + FOREIGNTYPEINFO = { + "typeSystem": "TYPE_SYSTEM_UNSPECIFIED", + } + SCHEMA = { + "fields": [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, + ], + "foreignTypeInfo": FOREIGNTYPEINFO, + } + API_REPR = { + "tableReference": { + "projectId": PROJECT, + "datasetId": DATASET_ID, + "tableId": TABLE_ID, + }, + "schema": SCHEMA, + } + + table = self._get_target_class().from_api_repr(API_REPR) + assert table._properties == API_REPR + def test__build_resource_w_custom_field(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) table_ref = dataset.table(self.TABLE_NAME) @@ -1448,6 +1782,33 @@ def 
test_encryption_configuration_setter(self): table.encryption_configuration = None self.assertIsNone(table.encryption_configuration) + def test_resource_tags_getter_empty(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + self.assertEqual(table.resource_tags, {}) + + def test_resource_tags_update_in_place(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + table.resource_tags["123456789012/key"] = "value" + self.assertEqual(table.resource_tags, {"123456789012/key": "value"}) + + def test_resource_tags_setter(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + table.resource_tags = {"123456789012/key": "value"} + self.assertEqual(table.resource_tags, {"123456789012/key": "value"}) + + def test_resource_tags_setter_bad_value(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + with self.assertRaises(ValueError): + table.resource_tags = 12345 + def test___repr__(self): from google.cloud.bigquery.table import TableReference @@ -1465,6 +1826,49 @@ def test___str__(self): table1 = self._make_one(TableReference(dataset, "table1")) self.assertEqual(str(table1), "project1.dataset1.table1") + def test_max_staleness_getter(self): + """Test getting max_staleness property.""" + dataset = DatasetReference("test-project", "test_dataset") + table_ref = dataset.table("test_table") + table = self._make_one(table_ref) + # Initially None + self.assertIsNone(table.max_staleness) + # Set max_staleness using setter + table.max_staleness = "1h" + self.assertEqual(table.max_staleness, "1h") + + def test_max_staleness_setter(self): + """Test setting max_staleness property.""" + dataset = DatasetReference("test-project", 
"test_dataset") + table_ref = dataset.table("test_table") + table = self._make_one(table_ref) + # Set valid max_staleness + table.max_staleness = "30m" + self.assertEqual(table.max_staleness, "30m") + # Set to None + table.max_staleness = None + self.assertIsNone(table.max_staleness) + + def test_max_staleness_setter_invalid_type(self): + """Test setting max_staleness with an invalid type raises ValueError.""" + dataset = DatasetReference("test-project", "test_dataset") + table_ref = dataset.table("test_table") + table = self._make_one(table_ref) + # Try setting invalid type + with self.assertRaises(ValueError): + table.max_staleness = 123 # Not a string + + def test_max_staleness_to_api_repr(self): + """Test max_staleness is correctly represented in API representation.""" + dataset = DatasetReference("test-project", "test_dataset") + table_ref = dataset.table("test_table") + table = self._make_one(table_ref) + # Set max_staleness + table.max_staleness = "1h" + # Convert to API representation + resource = table.to_api_repr() + self.assertEqual(resource.get("maxStaleness"), "1h") + class Test_row_from_mapping(unittest.TestCase, _SchemaBase): PROJECT = "prahj-ekt" @@ -1550,7 +1954,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.125 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.EXP_TIME = datetime.datetime(2015, 8, 1, 23, 59, 59, tzinfo=UTC) def test_ctor(self): @@ -1834,6 +2240,97 @@ def test_ctor_full_resource(self): assert instance.snapshot_time == expected_time +class TestBigLakeConfiguration(unittest.TestCase): + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.table import BigLakeConfiguration + + return BigLakeConfiguration + + @classmethod + def _make_one(cls, *args, **kwargs): + klass = cls._get_target_class() + return klass(*args, **kwargs) + + def 
test_ctor_empty_resource(self): + instance = self._make_one() + self.assertIsNone(instance.connection_id) + self.assertIsNone(instance.storage_uri) + self.assertIsNone(instance.file_format) + self.assertIsNone(instance.table_format) + + def test_ctor_kwargs(self): + instance = self._make_one( + connection_id="conn", + storage_uri="uri", + file_format="FILE", + table_format="TABLE", + ) + self.assertEqual(instance.connection_id, "conn") + self.assertEqual(instance.storage_uri, "uri") + self.assertEqual(instance.file_format, "FILE") + self.assertEqual(instance.table_format, "TABLE") + + def test_ctor_full_resource(self): + resource = { + "connectionId": "conn", + "storageUri": "uri", + "fileFormat": "FILE", + "tableFormat": "TABLE", + } + instance = self._make_one(_properties=resource) + self.assertEqual(instance.connection_id, "conn") + self.assertEqual(instance.storage_uri, "uri") + self.assertEqual(instance.file_format, "FILE") + self.assertEqual(instance.table_format, "TABLE") + + def test_to_api_repr(self): + resource = { + "connectionId": "conn", + "storageUri": "uri", + "fileFormat": "FILE", + "tableFormat": "TABLE", + } + instance = self._make_one(_properties=resource) + self.assertEqual(instance.to_api_repr(), resource) + + def test_from_api_repr_partial(self): + klass = self._get_target_class() + api_repr = {"fileFormat": "FILE"} + instance = klass.from_api_repr(api_repr) + + self.assertIsNone(instance.connection_id) + self.assertIsNone(instance.storage_uri) + self.assertEqual(instance.file_format, "FILE") + self.assertIsNone(instance.table_format) + + def test_comparisons(self): + resource = { + "connectionId": "conn", + "storageUri": "uri", + "fileFormat": "FILE", + "tableFormat": "TABLE", + } + + first = self._make_one(_properties=resource) + second = self._make_one(_properties=copy.deepcopy(resource)) + # Exercise comparator overloads. + # first and second should be equivalent. 
+ self.assertNotEqual(first, resource) + self.assertEqual(first, second) + self.assertEqual(hash(first), hash(second)) + + # Update second to ensure that first and second are no longer equivalent. + second.connection_id = "foo" + self.assertNotEqual(first, second) + self.assertNotEqual(hash(first), hash(second)) + + # Update first with the same change, restoring equivalence. + first.connection_id = "foo" + self.assertEqual(first, second) + self.assertEqual(hash(first), hash(second)) + + class TestCloneDefinition: @staticmethod def _get_target_class(): @@ -1919,6 +2416,7 @@ def test_to_arrow_error_if_pyarrow_is_none(self): row_iterator.to_arrow() def test_to_arrow(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip("pyarrow") row_iterator = self._make_one() tbl = row_iterator.to_arrow() @@ -1926,6 +2424,7 @@ def test_to_arrow(self): self.assertEqual(tbl.num_rows, 0) def test_to_arrow_iterable(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip( "pyarrow", minversion=self.PYARROW_MINIMUM_VERSION ) @@ -1990,10 +2489,25 @@ def test_to_geodataframe(self): df = row_iterator.to_geodataframe(create_bqstorage_client=False) self.assertIsInstance(df, geopandas.GeoDataFrame) self.assertEqual(len(df), 0) # verify the number of rows - if version_info.major == 3 and version_info.minor > 7: - assert not hasattr(df, "crs") # used with Python > 3.7 + + if geopandas.__version__ == "0.9.0": + assert hasattr(df, "crs") else: - self.assertIsNone(df.crs) # used with Python == 3.7 + assert not hasattr(df, "crs") + + def test_methods_w_timeout(self): + pytest.importorskip("pyarrow") + pytest.importorskip("geopandas") + # Ensure that the timeout parameter is accepted by all methods without raising a TypeError, + # even though the _EmptyRowIterator implementations do not use the timeout value. 
+ timeout = 42.0 + + # Call each type to ensure no TypeError is raised + self._make_one().to_arrow(timeout=timeout) + self._make_one().to_arrow_iterable(timeout=timeout) + self._make_one().to_dataframe(timeout=timeout) + self._make_one().to_dataframe_iterable(timeout=timeout) + self._make_one().to_geodataframe(timeout=timeout) class TestRowIterator(unittest.TestCase): @@ -2011,7 +2525,7 @@ def _make_one( path=None, schema=None, table=None, - **kwargs + **kwargs, ): from google.cloud.bigquery.table import TableReference @@ -2591,6 +3105,7 @@ def test_to_arrow_iterable_w_bqstorage(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() def test_to_arrow(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip( "pyarrow", minversion=self.PYARROW_MINIMUM_VERSION ) @@ -2675,6 +3190,7 @@ def test_to_arrow(self): ) def test_to_arrow_w_nulls(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip( "pyarrow", minversion=self.PYARROW_MINIMUM_VERSION ) @@ -2711,6 +3227,7 @@ def test_to_arrow_w_nulls(self): self.assertEqual(ages, [32, 29, None, 111]) def test_to_arrow_w_unknown_type(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip( "pyarrow", minversion=self.PYARROW_MINIMUM_VERSION ) @@ -2756,6 +3273,7 @@ def test_to_arrow_w_unknown_type(self): self.assertTrue(all("sport" in str(warning) for warning in warned)) def test_to_arrow_w_empty_table(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip( "pyarrow", minversion=self.PYARROW_MINIMUM_VERSION ) @@ -2797,6 +3315,7 @@ def test_to_arrow_w_empty_table(self): self.assertEqual(child_field.type.value_type[1].name, "age") def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): + pytest.importorskip("numpy") pytest.importorskip("pyarrow") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery.schema import SchemaField @@ -2839,6 +3358,7 @@ def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): 
mock_client._ensure_bqstorage_client.assert_not_called() def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): + pytest.importorskip("numpy") pytest.importorskip("pyarrow") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery.schema import SchemaField @@ -2877,6 +3397,7 @@ def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): mock_client._ensure_bqstorage_client.assert_not_called() def test_to_arrow_w_bqstorage(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip("pyarrow") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery import schema @@ -2960,6 +3481,7 @@ def test_to_arrow_w_bqstorage(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() def test_to_arrow_w_bqstorage_creates_client(self): + pytest.importorskip("numpy") pytest.importorskip("pyarrow") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery import schema @@ -2993,6 +3515,7 @@ def test_to_arrow_w_bqstorage_creates_client(self): bqstorage_client._transport.grpc_channel.close.assert_called_once() def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip( "pyarrow", minversion=self.PYARROW_MINIMUM_VERSION ) @@ -3026,6 +3549,7 @@ def mock_verify_version(raise_if_error: bool = False): self.assertEqual(tbl.num_rows, 2) def test_to_arrow_w_bqstorage_no_streams(self): + pytest.importorskip("numpy") pyarrow = pytest.importorskip("pyarrow") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery import schema @@ -3065,6 +3589,7 @@ def test_to_arrow_w_bqstorage_no_streams(self): self.assertEqual(actual_table.schema[2].name, "colB") def test_to_arrow_progress_bar(self): + pytest.importorskip("numpy") pytest.importorskip("pyarrow") pytest.importorskip("tqdm") pytest.importorskip("tqdm.notebook") @@ -3198,6 +3723,7 @@ def test_to_dataframe_iterable_with_dtypes(self): 
self.assertEqual(df_2["age"][0], 33) def test_to_dataframe_iterable_w_bqstorage(self): + pytest.importorskip("numpy") pandas = pytest.importorskip("pandas") pyarrow = pytest.importorskip("pyarrow") pytest.importorskip("google.cloud.bigquery_storage") @@ -3272,6 +3798,7 @@ def test_to_dataframe_iterable_w_bqstorage(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() def test_to_dataframe_iterable_w_bqstorage_max_results_warning(self): + pytest.importorskip("numpy") pandas = pytest.importorskip("pandas") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery import schema @@ -3518,10 +4045,10 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - # With Python 3.7 and 3.8, len(user_warnings) = 3. With pandas < 1.5, - # pandas.ArrowDtype is not supported. We raise warnings because - # range columns have to be converted to object. - # With higher Python versions and noextra tests, len(user_warnings) = 0 + # With pandas < 1.5, pandas.ArrowDtype is not supported + # and len(user_warnings) = 3. + # We raise warnings because range columns have to be converted to object. + # With higher pandas versions and noextra tests, len(user_warnings) = 0 self.assertIn(len(user_warnings), [0, 3]) self.assertEqual(len(df), 4) @@ -3553,10 +4080,10 @@ def test_to_dataframe_no_tqdm(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - # With Python 3.7 and 3.8, len(user_warnings) = 4. With pandas < 1.5, - # pandas.ArrowDtype is not supported. We raise warnings because - # range columns have to be converted to object. - # With higher Python versions and noextra tests, len(user_warnings) = 1 + # With pandas < 1.5, pandas.ArrowDtype is not supported + # and len(user_warnings) = 4. + # We raise warnings because range columns have to be converted to object. 
+ # With higher pandas versions and noextra tests, len(user_warnings) = 1 self.assertIn(len(user_warnings), [1, 4]) # Even though the progress bar won't show, downloading the dataframe @@ -3565,7 +4092,7 @@ def test_to_dataframe_no_tqdm(self): def test_to_dataframe_tqdm_error(self): pytest.importorskip("pandas") - pytest.importorskip("tqdm") + tqdm = pytest.importorskip("tqdm") mock.patch("tqdm.tqdm_gui", new=None) mock.patch("tqdm.notebook.tqdm", new=None) mock.patch("tqdm.tqdm", new=None) @@ -3598,9 +4125,13 @@ def test_to_dataframe_tqdm_error(self): # Warn that a progress bar was requested, but creating the tqdm # progress bar failed. for warning in warned: # pragma: NO COVER + # Pyparsing warnings appear to be coming from a transitive + # dependency and are unrelated to the code under test. + if "Pyparsing" in warning.category.__name__: + continue self.assertIn( warning.category, - [UserWarning, DeprecationWarning], + [UserWarning, DeprecationWarning, tqdm.TqdmExperimentalWarning], ) def test_to_dataframe_w_empty_results(self): @@ -3810,12 +4341,8 @@ def test_to_dataframe_w_dtypes_mapper(self): ) self.assertEqual(df.name.dtype.name, "string") - if hasattr(pandas, "Float64Dtype"): - self.assertEqual(list(df.miles), [1.77, 6.66, 2.0]) - self.assertEqual(df.miles.dtype.name, "Float64") - else: - self.assertEqual(list(df.miles), ["1.77", "6.66", "2.0"]) - self.assertEqual(df.miles.dtype.name, "string") + self.assertEqual(list(df.miles), [1.77, 6.66, 2.0]) + self.assertEqual(df.miles.dtype.name, "Float64") if hasattr(pandas, "ArrowDtype"): self.assertEqual( @@ -3904,7 +4431,6 @@ def test_to_dataframe_w_dtypes_mapper(self): {"start": None, "end": None}, ], ) - else: self.assertEqual( list(df.date), @@ -4020,7 +4546,7 @@ def test_to_dataframe_w_none_dtypes_mapper(self): def test_to_dataframe_w_unsupported_dtypes_mapper(self): pytest.importorskip("pandas") - import numpy + numpy = pytest.importorskip("numpy") from google.cloud.bigquery.schema import SchemaField schema 
= [ @@ -4304,6 +4830,7 @@ def test_to_dataframe_max_results_w_create_bqstorage_client_no_warning(self): mock_client._ensure_bqstorage_client.assert_not_called() def test_to_dataframe_w_bqstorage_creates_client(self): + pytest.importorskip("numpy") pytest.importorskip("pandas") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery import schema @@ -4337,6 +4864,7 @@ def test_to_dataframe_w_bqstorage_creates_client(self): bqstorage_client._transport.grpc_channel.close.assert_called_once() def test_to_dataframe_w_bqstorage_no_streams(self): + pytest.importorskip("numpy") pytest.importorskip("pandas") pytest.importorskip("google.cloud.bigquery_storage") from google.cloud.bigquery import schema @@ -4365,6 +4893,7 @@ def test_to_dataframe_w_bqstorage_no_streams(self): self.assertTrue(got.empty) def test_to_dataframe_w_bqstorage_logs_session(self): + pytest.importorskip("numpy") pytest.importorskip("google.cloud.bigquery_storage") pytest.importorskip("pandas") pytest.importorskip("pyarrow") @@ -4389,6 +4918,7 @@ def test_to_dataframe_w_bqstorage_logs_session(self): ) def test_to_dataframe_w_bqstorage_empty_streams(self): + pytest.importorskip("numpy") pytest.importorskip("google.cloud.bigquery_storage") pytest.importorskip("pandas") pyarrow = pytest.importorskip("pyarrow") @@ -4443,6 +4973,7 @@ def test_to_dataframe_w_bqstorage_empty_streams(self): self.assertTrue(got.empty) def test_to_dataframe_w_bqstorage_nonempty(self): + pytest.importorskip("numpy") pytest.importorskip("google.cloud.bigquery_storage") pytest.importorskip("pandas") pyarrow = pytest.importorskip("pyarrow") @@ -4525,6 +5056,7 @@ def test_to_dataframe_w_bqstorage_nonempty(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): + pytest.importorskip("numpy") bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") pytest.importorskip("pandas") pyarrow = 
pytest.importorskip("pyarrow") @@ -4577,6 +5109,7 @@ def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): self.assertTrue(got.index.is_unique) def test_to_dataframe_w_bqstorage_updates_progress_bar(self): + pytest.importorskip("numpy") bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") pytest.importorskip("pandas") pyarrow = pytest.importorskip("pyarrow") @@ -4654,6 +5187,7 @@ def blocking_to_arrow(*args, **kwargs): tqdm_mock().close.assert_called_once() def test_to_dataframe_w_bqstorage_exits_on_keyboardinterrupt(self): + pytest.importorskip("numpy") bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") pytest.importorskip("pandas") pyarrow = pytest.importorskip("pyarrow") @@ -4829,6 +5363,7 @@ def test_to_dataframe_w_bqstorage_snapshot(self): row_iterator.to_dataframe(bqstorage_client) def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): + pytest.importorskip("numpy") pytest.importorskip("google.cloud.bigquery_storage") pandas = pytest.importorskip("pandas") pyarrow = pytest.importorskip("pyarrow") @@ -5111,7 +5646,7 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): """ pandas = pytest.importorskip("pandas") geopandas = pytest.importorskip("geopandas") - import numpy + numpy = pytest.importorskip("numpy") from shapely import wkt row_iterator = self._make_one_from_data( @@ -5144,6 +5679,11 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, + timeout=None, ) self.assertIsInstance(df, geopandas.GeoDataFrame) @@ -5707,6 +6247,48 @@ def test__eq__other_type(self): with self.assertRaises(TypeError): foreign_key == "This is not a Foreign Key" + def test_to_api_repr(self): + from google.cloud.bigquery.table import ColumnReference, TableReference + + name = 
"my_fk" + referenced_table = TableReference.from_string("my-project.mydataset.mytable") + column_references = [ + ColumnReference(referencing_column="product_id", referenced_column="id") + ] + foreign_key = self._make_one(name, referenced_table, column_references) + + expected = { + "name": name, + "referencedTable": { + "projectId": "my-project", + "datasetId": "mydataset", + "tableId": "mytable", + }, + "columnReferences": [ + {"referencingColumn": "product_id", "referencedColumn": "id"} + ], + } + self.assertEqual(foreign_key.to_api_repr(), expected) + + def test_to_api_repr_empty_column_references(self): + from google.cloud.bigquery.table import TableReference + + name = "my_fk" + referenced_table = TableReference.from_string("my-project.mydataset.mytable") + column_references = [] + foreign_key = self._make_one(name, referenced_table, column_references) + + expected = { + "name": name, + "referencedTable": { + "projectId": "my-project", + "datasetId": "mydataset", + "tableId": "mytable", + }, + "columnReferences": [], + } + self.assertEqual(foreign_key.to_api_repr(), expected) + class TestTableConstraint(unittest.TestCase): @staticmethod @@ -5724,6 +6306,68 @@ def test_constructor_defaults(self): self.assertIsNone(instance.primary_key) self.assertIsNone(instance.foreign_keys) + def test_constructor_explicit(self): + from google.cloud.bigquery.table import ( + PrimaryKey, + ForeignKey, + TableReference, + ColumnReference, + ) + + primary_key = PrimaryKey(columns=["my_pk_id"]) + foriegn_keys = [ + ForeignKey( + name="my_fk_id", + referenced_table=TableReference.from_string( + "my-project.my-dataset.my-table" + ), + column_references=[ + ColumnReference(referencing_column="id", referenced_column="id"), + ], + ), + ] + + table_constraint = self._make_one( + primary_key=primary_key, + foreign_keys=foriegn_keys, + ) + + self.assertEqual(table_constraint.primary_key, primary_key) + self.assertEqual(table_constraint.foreign_keys, foriegn_keys) + + def 
test_constructor_explicit_with_none(self): + table_constraint = self._make_one(primary_key=None, foreign_keys=None) + + self.assertIsNone(table_constraint.primary_key) + self.assertIsNone(table_constraint.foreign_keys) + + def test__eq__other_type(self): + from google.cloud.bigquery.table import ( + PrimaryKey, + ForeignKey, + TableReference, + ColumnReference, + ) + + table_constraint = self._make_one( + primary_key=PrimaryKey(columns=["my_pk_id"]), + foreign_keys=[ + ForeignKey( + name="my_fk_id", + referenced_table=TableReference.from_string( + "my-project.my-dataset.my-table" + ), + column_references=[ + ColumnReference( + referencing_column="id", referenced_column="id" + ), + ], + ), + ], + ) + with self.assertRaises(TypeError): + table_constraint == "This is not a Table Constraint" + def test_from_api_repr_full_resource(self): from google.cloud.bigquery.table import ( ColumnReference, @@ -5803,6 +6447,363 @@ def test_from_api_repr_only_foreign_keys_resource(self): self.assertIsNone(instance.primary_key) self.assertIsNotNone(instance.foreign_keys) + def test_to_api_repr(self): + from google.cloud.bigquery.table import ColumnReference, ForeignKey, PrimaryKey + + primary_key = PrimaryKey(columns=["id", "product_id"]) + foreign_keys = [ + ForeignKey( + name="my_fk_name", + referenced_table=TableReference.from_string( + "my-project.my-dataset.products" + ), + column_references=[ + ColumnReference( + referencing_column="product_id", referenced_column="id" + ), + ], + ) + ] + instance = self._make_one(primary_key=primary_key, foreign_keys=foreign_keys) + + expected = { + "primaryKey": { + "columns": ["id", "product_id"], + }, + "foreignKeys": [ + { + "name": "my_fk_name", + "referencedTable": { + "projectId": "my-project", + "datasetId": "my-dataset", + "tableId": "products", + }, + "columnReferences": [ + {"referencingColumn": "product_id", "referencedColumn": "id"}, + ], + } + ], + } + self.assertEqual(instance.to_api_repr(), expected) + + def 
test_to_api_repr_only_primary_key(self): + from google.cloud.bigquery.table import PrimaryKey + + primary_key = PrimaryKey(columns=["id", "product_id"]) + instance = self._make_one(primary_key=primary_key, foreign_keys=None) + expected = { + "primaryKey": { + "columns": ["id", "product_id"], + }, + } + self.assertEqual(instance.to_api_repr(), expected) + + def test_to_api_repr_empty_primary_key(self): + from google.cloud.bigquery.table import PrimaryKey + + primary_key = PrimaryKey(columns=[]) + instance = self._make_one(primary_key=primary_key, foreign_keys=None) + + expected = { + "primaryKey": { + "columns": [], + }, + } + self.assertEqual(instance.to_api_repr(), expected) + + def test_to_api_repr_only_foreign_keys(self): + from google.cloud.bigquery.table import ColumnReference, ForeignKey + + foreign_keys = [ + ForeignKey( + name="my_fk_name", + referenced_table=TableReference.from_string( + "my-project.my-dataset.products" + ), + column_references=[ + ColumnReference( + referencing_column="product_id", referenced_column="id" + ), + ], + ) + ] + instance = self._make_one(primary_key=None, foreign_keys=foreign_keys) + expected = { + "foreignKeys": [ + { + "name": "my_fk_name", + "referencedTable": { + "projectId": "my-project", + "datasetId": "my-dataset", + "tableId": "products", + }, + "columnReferences": [ + {"referencingColumn": "product_id", "referencedColumn": "id"}, + ], + } + ], + } + self.assertEqual(instance.to_api_repr(), expected) + + def test_to_api_repr_empty_foreign_keys(self): + foreign_keys = [] + instance = self._make_one(primary_key=None, foreign_keys=foreign_keys) + + expected = {} + self.assertEqual(instance.to_api_repr(), expected) + + def test_to_api_repr_empty_constraints(self): + instance = self._make_one(primary_key=None, foreign_keys=None) + expected = {} + self.assertEqual(instance.to_api_repr(), expected) + + +@pytest.mark.parametrize( + "self_pk_name,self_fk_name,other_pk_name,other_fk_name,expected_equal", + [ + (None, None, None, 
None, True), + ("pkey", None, "pkey", None, True), + ("pkey", "fkey", "pkey", "fkey", True), + (None, "fkey", None, "fkey", True), + ("pkey", None, "pkey_no_match", None, False), + ("pkey", "fkey", "pkey_no_match", "fkey_no_match", False), + (None, "fkey", None, "fkey_no_match", False), + ("pkey", "fkey", "pkey_no_match", "fkey", False), + ("pkey", "fkey", "pkey", "fkey_no_match", False), + ], +) +def test_table_constraint_eq_parametrized( + self_pk_name, self_fk_name, other_pk_name, other_fk_name, expected_equal +): + # Imports are placed here to ensure they are self-contained for this example. + # In a real test file, they would likely be at the top of the file. + from google.cloud.bigquery.table import ( + ColumnReference, + ForeignKey, + PrimaryKey, + TableReference, + TableConstraints, + ) + + # Helper function to create a PrimaryKey object or None + def _create_primary_key(name): + if name is None: + return None + return PrimaryKey(columns=[name]) + + # Helper function to create a list of ForeignKey objects or None + def _create_foreign_keys(name): + if name is None: + return None + # Using a generic referenced_table and column_references for simplicity + # The 'name' parameter ensures different ForeignKey objects for different names + return [ + ForeignKey( + name=name, + referenced_table=TableReference.from_string( + f"my-project.my-dataset.{name}_referenced_table" + ), + column_references=[ + ColumnReference( + referencing_column=f"{name}_ref_col", + referenced_column=f"{name}_pk_col", + ) + ], + ) + ] + + # Create the two TableConstraints instances for comparison + tc1 = TableConstraints( + primary_key=_create_primary_key(self_pk_name), + foreign_keys=_create_foreign_keys(self_fk_name), + ) + tc2 = TableConstraints( + primary_key=_create_primary_key(other_pk_name), + foreign_keys=_create_foreign_keys(other_fk_name), + ) + + # Assert the equality based on the expected outcome + assert (tc1 == tc2) == expected_equal + + +class 
TestExternalCatalogTableOptions: + PROJECT = "test-project" + DATASET_ID = "test_dataset" + TABLE_ID = "coffee_table" + DATASET = DatasetReference(PROJECT, DATASET_ID) + TABLEREF = DATASET.table(TABLE_ID) + + @staticmethod + def _get_target_class(self): + from google.cloud.bigquery.table import Table + + return Table + + def _make_one(self, *args, **kw): + return self._get_target_class(self)(*args, **kw) + + EXTERNALCATALOGTABLEOPTIONS = { + "connection_id": "connection123", + "parameters": {"key": "value"}, + "storage_descriptor": { + "input_format": "testpath.to.OrcInputFormat", + "location_uri": "gs://test/path/", + "output_format": "testpath.to.OrcOutputFormat", + "serde_info": { + "serialization_library": "testpath.to.LazySimpleSerDe", + "name": "serde_lib_name", + "parameters": {"key": "value"}, + }, + }, + } + + def test_external_catalog_table_options_default_initialization(self): + table = self._make_one(self.TABLEREF) + + assert table.external_catalog_table_options is None + + def test_external_catalog_table_options_valid_inputs(self): + table = self._make_one(self.TABLEREF) + + # supplied in api_repr format + table.external_catalog_table_options = self.EXTERNALCATALOGTABLEOPTIONS + result = table.external_catalog_table_options.to_api_repr() + expected = self.EXTERNALCATALOGTABLEOPTIONS + assert result == expected + + # supplied in obj format + ecto = external_config.ExternalCatalogTableOptions.from_api_repr( + self.EXTERNALCATALOGTABLEOPTIONS + ) + assert isinstance(ecto, external_config.ExternalCatalogTableOptions) + + table.external_catalog_table_options = ecto + result = table.external_catalog_table_options.to_api_repr() + expected = self.EXTERNALCATALOGTABLEOPTIONS + assert result == expected + + def test_external_catalog_table_options_invalid_input(self): + table = self._make_one(self.TABLEREF) + + # invalid on the whole + with pytest.raises(TypeError) as e: + table.external_catalog_table_options = 123 + + # Looking for the first word from the string 
"Pass as..." + assert "Pass " in str(e.value) + + def test_external_catalog_table_options_to_api_repr(self): + table = self._make_one(self.TABLEREF) + + table.external_catalog_table_options = self.EXTERNALCATALOGTABLEOPTIONS + result = table.external_catalog_table_options.to_api_repr() + expected = self.EXTERNALCATALOGTABLEOPTIONS + assert result == expected + + def test_external_catalog_table_options_from_api_repr(self): + table = self._make_one(self.TABLEREF) + + table.external_catalog_table_options = self.EXTERNALCATALOGTABLEOPTIONS + ecto = external_config.ExternalCatalogTableOptions.from_api_repr( + self.EXTERNALCATALOGTABLEOPTIONS + ) + result = ecto.to_api_repr() + expected = self.EXTERNALCATALOGTABLEOPTIONS + assert result == expected + + +class TestForeignTypeInfo: + PROJECT = "test-project" + DATASET_ID = "test_dataset" + TABLE_ID = "coffee_table" + DATASET = DatasetReference(PROJECT, DATASET_ID) + TABLEREF = DATASET.table(TABLE_ID) + FOREIGNTYPEINFO = { + "typeSystem": "TYPE_SYSTEM_UNSPECIFIED", + } + API_REPR = { + "tableReference": { + "projectId": PROJECT, + "datasetId": DATASET_ID, + "tableId": TABLE_ID, + }, + "schema": { + "fields": [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, + ], + "foreign_info_type": FOREIGNTYPEINFO, + }, + } + + from google.cloud.bigquery.schema import ForeignTypeInfo + + @staticmethod + def _get_target_class(self): + from google.cloud.bigquery.table import Table + + return Table + + def _make_one(self, *args, **kw): + return self._get_target_class(self)(*args, **kw) + + def test_foreign_type_info_default_initialization(self): + table = self._make_one(self.TABLEREF) + assert table.foreign_type_info is None + + @pytest.mark.parametrize( + "foreign_type_info, expected", + [ + ( + {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"}, + "TYPE_SYSTEM_UNSPECIFIED", + ), + (None, None), + ( + ForeignTypeInfo(type_system="TYPE_SYSTEM_UNSPECIFIED"), + 
"TYPE_SYSTEM_UNSPECIFIED", + ), + ], + ) + def test_foreign_type_info_valid_inputs(self, foreign_type_info, expected): + table = self._make_one(self.TABLEREF) + + table.foreign_type_info = foreign_type_info + + if foreign_type_info is None: + result = table.foreign_type_info + else: + result = table.foreign_type_info.type_system + assert result == expected + + def test_foreign_type_info_invalid_inputs(self): + table = self._make_one(self.TABLEREF) + + # invalid on the whole + with pytest.raises(TypeError, match="Pass .*"): + table.foreign_type_info = 123 + + def test_foreign_type_info_to_api_repr(self): + table = self._make_one(self.TABLEREF) + + table.foreign_type_info = self.ForeignTypeInfo( + type_system="TYPE_SYSTEM_UNSPECIFIED", + ) + + result = table.to_api_repr()["schema"]["foreignTypeInfo"] + expected = self.FOREIGNTYPEINFO + assert result == expected + + def test_foreign_type_info_from_api_repr(self): + table = self._make_one(self.TABLEREF) + table.foreign_type_info = self.FOREIGNTYPEINFO + + fti = schema.ForeignTypeInfo.from_api_repr(self.FOREIGNTYPEINFO) + + result = fti.to_api_repr() + expected = self.FOREIGNTYPEINFO + assert result == expected + @pytest.mark.parametrize( "table_path", @@ -5822,3 +6823,77 @@ def test_table_reference_to_bqstorage_v1_stable(table_path): for klass in (mut.TableReference, mut.Table, mut.TableListItem): got = klass.from_string(table_path).to_bqstorage() assert got == expected + + +@pytest.mark.parametrize("preserve_order", [True, False]) +def test_to_arrow_iterable_w_bqstorage_max_stream_count(preserve_order): + pytest.importorskip("pandas") + pytest.importorskip("google.cloud.bigquery_storage") + from google.cloud.bigquery import schema + from google.cloud.bigquery import table as mut + from google.cloud import bigquery_storage + + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() + bqstorage_client.create_read_session.return_value = session + + 
row_iterator = mut.RowIterator( + _mock_client(), + api_request=None, + path=None, + schema=[ + schema.SchemaField("colA", "INTEGER"), + ], + table=mut.TableReference.from_string("proj.dset.tbl"), + ) + row_iterator._preserve_order = preserve_order + + max_stream_count = 132 + result_iterable = row_iterator.to_arrow_iterable( + bqstorage_client=bqstorage_client, max_stream_count=max_stream_count + ) + list(result_iterable) + bqstorage_client.create_read_session.assert_called_once_with( + parent=mock.ANY, + read_session=mock.ANY, + max_stream_count=max_stream_count if not preserve_order else 1, + retry=None, + timeout=None, + ) + + +@pytest.mark.parametrize("preserve_order", [True, False]) +def test_to_dataframe_iterable_w_bqstorage_max_stream_count(preserve_order): + pytest.importorskip("pandas") + pytest.importorskip("google.cloud.bigquery_storage") + from google.cloud.bigquery import schema + from google.cloud.bigquery import table as mut + from google.cloud import bigquery_storage + + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() + bqstorage_client.create_read_session.return_value = session + + row_iterator = mut.RowIterator( + _mock_client(), + api_request=None, + path=None, + schema=[ + schema.SchemaField("colA", "INTEGER"), + ], + table=mut.TableReference.from_string("proj.dset.tbl"), + ) + row_iterator._preserve_order = preserve_order + + max_stream_count = 132 + result_iterable = row_iterator.to_dataframe_iterable( + bqstorage_client=bqstorage_client, max_stream_count=max_stream_count + ) + list(result_iterable) + bqstorage_client.create_read_session.assert_called_once_with( + parent=mock.ANY, + read_session=mock.ANY, + max_stream_count=max_stream_count if not preserve_order else 1, + retry=None, + timeout=None, + ) diff --git a/tests/unit/test_table_arrow.py b/tests/unit/test_table_arrow.py index 6f1e6f76a..fdd1b7b78 100644 --- a/tests/unit/test_table_arrow.py +++ 
b/tests/unit/test_table_arrow.py @@ -18,7 +18,8 @@ import google.cloud.bigquery.table -pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0") +pytest.importorskip("numpy") +pytest.importorskip("pyarrow", minversion="3.0.0") def test_to_arrow_with_jobs_query_response(): @@ -28,6 +29,7 @@ def test_to_arrow_with_jobs_query_response(): "fields": [ {"name": "name", "type": "STRING", "mode": "NULLABLE"}, {"name": "number", "type": "INTEGER", "mode": "NULLABLE"}, + {"name": "json", "type": "JSON", "mode": "NULLABLE"}, ] }, "jobReference": { @@ -37,15 +39,21 @@ def test_to_arrow_with_jobs_query_response(): }, "totalRows": "9", "rows": [ - {"f": [{"v": "Tiarra"}, {"v": "6"}]}, - {"f": [{"v": "Timothy"}, {"v": "325"}]}, - {"f": [{"v": "Tina"}, {"v": "26"}]}, - {"f": [{"v": "Tierra"}, {"v": "10"}]}, - {"f": [{"v": "Tia"}, {"v": "17"}]}, - {"f": [{"v": "Tiara"}, {"v": "22"}]}, - {"f": [{"v": "Tiana"}, {"v": "6"}]}, - {"f": [{"v": "Tiffany"}, {"v": "229"}]}, - {"f": [{"v": "Tiffani"}, {"v": "8"}]}, + {"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]}, + {"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]}, + {"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]}, + { + "f": [ + {"v": "Tierra"}, + {"v": "10"}, + {"v": '{"aKey": {"bKey": {"cKey": -123}}}'}, + ] + }, + {"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]}, + {"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]}, + {"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]}, + {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]}, + {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]}, ], "totalBytesProcessed": "154775150", "jobComplete": True, @@ -65,7 +73,7 @@ def test_to_arrow_with_jobs_query_response(): ) records = rows.to_arrow() - assert records.column_names == ["name", "number"] + assert records.column_names == ["name", "number", "json"] assert records["name"].to_pylist() == [ "Tiarra", "Timothy", @@ -78,6 +86,17 @@ def test_to_arrow_with_jobs_query_response(): "Tiffani", ] assert 
records["number"].to_pylist() == [6, 325, 26, 10, 17, 22, 6, 229, 8] + assert records["json"].to_pylist() == [ + "123", + '{"key":"value"}', + "[1,2,3]", + '{"aKey": {"bKey": {"cKey": -123}}}', + None, + '"some-json-string"', + '{"nullKey":null}', + '""', + "[]", + ] def test_to_arrow_with_jobs_query_response_and_max_results(): @@ -87,6 +106,7 @@ def test_to_arrow_with_jobs_query_response_and_max_results(): "fields": [ {"name": "name", "type": "STRING", "mode": "NULLABLE"}, {"name": "number", "type": "INTEGER", "mode": "NULLABLE"}, + {"name": "json", "type": "JSON", "mode": "NULLABLE"}, ] }, "jobReference": { @@ -96,15 +116,21 @@ def test_to_arrow_with_jobs_query_response_and_max_results(): }, "totalRows": "9", "rows": [ - {"f": [{"v": "Tiarra"}, {"v": "6"}]}, - {"f": [{"v": "Timothy"}, {"v": "325"}]}, - {"f": [{"v": "Tina"}, {"v": "26"}]}, - {"f": [{"v": "Tierra"}, {"v": "10"}]}, - {"f": [{"v": "Tia"}, {"v": "17"}]}, - {"f": [{"v": "Tiara"}, {"v": "22"}]}, - {"f": [{"v": "Tiana"}, {"v": "6"}]}, - {"f": [{"v": "Tiffany"}, {"v": "229"}]}, - {"f": [{"v": "Tiffani"}, {"v": "8"}]}, + {"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]}, + {"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]}, + {"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]}, + { + "f": [ + {"v": "Tierra"}, + {"v": "10"}, + {"v": '{"aKey": {"bKey": {"cKey": -123}}}'}, + ] + }, + {"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]}, + {"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]}, + {"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]}, + {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]}, + {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]}, ], "totalBytesProcessed": "154775150", "jobComplete": True, @@ -125,10 +151,11 @@ def test_to_arrow_with_jobs_query_response_and_max_results(): ) records = rows.to_arrow() - assert records.column_names == ["name", "number"] + assert records.column_names == ["name", "number", "json"] assert records["name"].to_pylist() == [ 
"Tiarra", "Timothy", "Tina", ] assert records["number"].to_pylist() == [6, 325, 26] + assert records["json"].to_pylist() == ["123", '{"key":"value"}', "[1,2,3]"] diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 02a7a6a79..64d8b1451 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock import datetime import decimal -from unittest import mock import pytest @@ -34,12 +34,10 @@ def class_under_test(): return RowIterator -@pytest.mark.skipif( - pandas.__version__.startswith("2."), - reason="pandas 2.0 changes some default dtypes and we haven't update the test to account for those", -) -def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): - # See tests/system/test_arrow.py for the actual types we get from the API. +def test_to_dataframe_nullable_scalars( + monkeypatch, class_under_test +): # pragma: NO COVER + """See tests/system/test_arrow.py for the actual types we get from the API.""" arrow_schema = pyarrow.schema( [ pyarrow.field("bignumeric_col", pyarrow.decimal256(76, scale=38)), @@ -55,6 +53,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): pyarrow.field( "timestamp_col", pyarrow.timestamp("us", tz=datetime.timezone.utc) ), + pyarrow.field("json_col", pyarrow.string()), ] ) arrow_table = pyarrow.Table.from_pydict( @@ -74,6 +73,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): 2021, 8, 9, 13, 30, 44, 123456, tzinfo=datetime.timezone.utc ) ], + "json_col": ["{}"], }, schema=arrow_schema, ) @@ -90,6 +90,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): bigquery.SchemaField("string_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("json_col", "JSON"), ] mock_client = 
mock.create_autospec(bigquery.Client) mock_client.project = "test-proj" @@ -106,13 +107,18 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): assert df.dtypes["bool_col"].name == "boolean" assert df.dtypes["bytes_col"].name == "object" assert df.dtypes["date_col"].name == "dbdate" - assert df.dtypes["datetime_col"].name == "datetime64[ns]" assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" assert df.dtypes["numeric_col"].name == "object" assert df.dtypes["string_col"].name == "object" assert df.dtypes["time_col"].name == "dbtime" - assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + assert df.dtypes["json_col"].name == "object" + if pandas.__version__.startswith("2."): + assert df.dtypes["datetime_col"].name == "datetime64[us]" + assert df.dtypes["timestamp_col"].name == "datetime64[us, UTC]" + else: + assert df.dtypes["datetime_col"].name == "datetime64[ns]" + assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" # Check for expected values. assert df["bignumeric_col"][0] == decimal.Decimal("123.456789101112131415") @@ -129,12 +135,10 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): assert df["int64_col"][0] == -7 assert df["numeric_col"][0] == decimal.Decimal("-123.456789") assert df["string_col"][0] == "abcdefg" - # Pandas timedelta64 might be a better choice for pandas time columns. Then # they can more easily be combined with date columns to form datetimes. 
# https://github.com/googleapis/python-bigquery/issues/862 assert df["time_col"][0] == datetime.time(14, 21, 17, 123456) - assert df["timestamp_col"][0] == pandas.to_datetime("2021-08-09 13:30:44.123456Z") @@ -255,3 +259,108 @@ def test_to_dataframe_with_jobs_query_response(class_under_test): "Tiffani", ] assert list(df["number"]) == [6, 325, 26, 10, 17, 22, 6, 229, 8] + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_default_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (1 2)"], + "bool_col": [True], + "int_col": [123], + "float_col": [1.23], + "string_col": ["abc"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + rows.to_geodataframe(geography_column="geo_col") + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=bigquery.enums.DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=bigquery.enums.DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, + timeout=None, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + ) + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_custom_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = 
mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (3 4)"], + "bool_col": [False], + "int_col": [456], + "float_col": [4.56], + "string_col": ["def"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + custom_bool_dtype = "bool" + custom_int_dtype = "int32" + custom_float_dtype = "float32" + custom_string_dtype = "string" + + rows.to_geodataframe( + geography_column="geo_col", + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + timeout=None, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + )