diff --git a/.coveragerc b/.coveragerc index dd39c8546..23861a8eb 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,35 +1,18 @@ -# -*- coding: utf-8 -*- -# -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Generated by synthtool. DO NOT EDIT! [run] branch = True [report] fail_under = 100 show_missing = True +omit = + google/cloud/bigquery/__init__.py exclude_lines = # Re-enable the standard pragma pragma: NO COVER # Ignore debug-only repr def __repr__ - # Ignore abstract methods - raise NotImplementedError -omit = - */gapic/*.py - */proto/*.py - */core/*.py - */site-packages/*.py \ No newline at end of file + # Ignore pkg_resources exceptions. + # This is added at the module level as a safeguard for if someone + # generates the code and tries to run it without pip installing. This + # makes it virtually impossible to test properly. + except pkg_resources.DistributionNotFound diff --git a/.flake8 b/.flake8 index ed9316381..29227d4cf 100644 --- a/.flake8 +++ b/.flake8 @@ -26,6 +26,7 @@ exclude = *_pb2.py # Standard linting exemptions. 
+ **/.nox/** __pycache__, .git, *.pyc, diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml new file mode 100644 index 000000000..a9fcd07cc --- /dev/null +++ b/.github/.OwlBot.lock.yaml @@ -0,0 +1,3 @@ +docker: + image: gcr.io/repo-automation-bots/owlbot-python:latest + digest: sha256:9743664022bd63a8084be67f144898314c7ca12f0a03e422ac17c733c129d803 diff --git a/.github/.OwlBot.yaml b/.github/.OwlBot.yaml new file mode 100644 index 000000000..2b6451c19 --- /dev/null +++ b/.github/.OwlBot.yaml @@ -0,0 +1,26 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +docker: + image: gcr.io/repo-automation-bots/owlbot-python:latest + +deep-remove-regex: + - /owl-bot-staging + +deep-copy-regex: + - source: /google/cloud/bigquery/(v.*)/.*-py/(.*) + dest: /owl-bot-staging/$1/$2 + +begin-after-commit-hash: f2de93abafa306b2ebadf1d10d947db8bcf2bf15 + diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 10f4ee7c0..6763f258c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,8 +5,7 @@ # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-syntax # The @googleapis/api-bigquery is the default owner for changes in this repo -* @googleapis/api-bigquery +* @googleapis/api-bigquery @googleapis/yoshi-python # The python-samples-reviewers team is the default owner for samples changes -/samples/ @googleapis/python-samples-owners - +/samples/ @googleapis/api-bigquery @googleapis/python-samples-owners @googleapis/yoshi-python diff --git a/.github/header-checker-lint.yml b/.github/header-checker-lint.yml new file mode 100644 index 000000000..6fe78aa79 --- /dev/null +++ b/.github/header-checker-lint.yml @@ -0,0 +1,15 @@ +{"allowedCopyrightHolders": ["Google LLC"], + "allowedLicenses": ["Apache-2.0", "MIT", "BSD-3"], + "ignoreFiles": ["**/requirements.txt", "**/requirements-test.txt", "**/__init__.py", "samples/**/constraints.txt", "samples/**/constraints-test.txt"], + "sourceFileExtensions": [ + "ts", + "js", + "java", + "sh", + "Dockerfile", + "yaml", + "py", + "html", + "txt" + ] +} \ No newline at end of file diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml new file mode 100644 index 000000000..8634a3043 --- /dev/null +++ b/.github/sync-repo-settings.yaml @@ -0,0 +1,27 @@ +# https://github.com/googleapis/repo-automation-bots/tree/master/packages/sync-repo-settings +# Rules for master branch protection +branchProtectionRules: +# Identifies the protection rule pattern. Name of the branch to be protected. 
+# Defaults to `master` +- pattern: master + requiresCodeOwnerReviews: true + requiresStrictStatusChecks: true + requiredStatusCheckContexts: + - 'Kokoro' + - 'Kokoro snippets-3.8' + - 'cla/google' + - 'Samples - Lint' + - 'Samples - Python 3.6' + - 'Samples - Python 3.7' + - 'Samples - Python 3.8' +- pattern: v3 + requiresCodeOwnerReviews: true + requiresStrictStatusChecks: true + requiredStatusCheckContexts: + - 'Kokoro' + - 'Kokoro snippets-3.8' + - 'cla/google' + - 'Samples - Lint' + - 'Samples - Python 3.6' + - 'Samples - Python 3.7' + - 'Samples - Python 3.8' diff --git a/.gitignore b/.gitignore index b9daa52f1..99c3a1444 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ pip-log.txt .nox .cache .pytest_cache +.pytype # Mac @@ -50,8 +51,10 @@ docs.metadata # Virtual environment env/ + +# Test logs coverage.xml -sponge_log.xml +*sponge_log.xml # System test environment variables. system_tests/local_test_setup diff --git a/.kokoro/build.sh b/.kokoro/build.sh index 0e71e2aca..302cc1e1a 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -15,7 +15,11 @@ set -eo pipefail -cd github/python-bigquery +if [[ -z "${PROJECT_ROOT:-}" ]]; then + PROJECT_ROOT="github/python-bigquery" +fi + +cd "${PROJECT_ROOT}" # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 @@ -30,16 +34,26 @@ export GOOGLE_APPLICATION_CREDENTIALS=${KOKORO_GFILE_DIR}/service-account.json export PROJECT_ID=$(cat "${KOKORO_GFILE_DIR}/project-id.json") # Remove old nox -python3.6 -m pip uninstall --yes --quiet nox-automation +python3 -m pip uninstall --yes --quiet nox-automation # Install nox -python3.6 -m pip install --upgrade --quiet nox -python3.6 -m nox --version +python3 -m pip install --upgrade --quiet nox +python3 -m nox --version + +# If this is a continuous build, send the test log to the FlakyBot. +# See https://github.com/googleapis/repo-automation-bots/tree/master/packages/flakybot. 
+if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"continuous"* ]]; then + cleanup() { + chmod +x $KOKORO_GFILE_DIR/linux_amd64/flakybot + $KOKORO_GFILE_DIR/linux_amd64/flakybot + } + trap cleanup EXIT HUP +fi # If NOX_SESSION is set, it only runs the specified session, # otherwise run all the sessions. if [[ -n "${NOX_SESSION:-}" ]]; then - python3.6 -m nox -s "${NOX_SESSION:-}" + python3 -m nox -s ${NOX_SESSION:-} else - python3.6 -m nox + python3 -m nox fi diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile index 412b0b56a..4e1b1fb8b 100644 --- a/.kokoro/docker/docs/Dockerfile +++ b/.kokoro/docker/docs/Dockerfile @@ -40,6 +40,7 @@ RUN apt-get update \ libssl-dev \ libsqlite3-dev \ portaudio19-dev \ + python3-distutils \ redis-server \ software-properties-common \ ssh \ @@ -59,40 +60,8 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && rm -f /var/cache/apt/archives/*.deb - -COPY fetch_gpg_keys.sh /tmp -# Install the desired versions of Python. -RUN set -ex \ - && export GNUPGHOME="$(mktemp -d)" \ - && echo "disable-ipv6" >> "${GNUPGHOME}/dirmngr.conf" \ - && /tmp/fetch_gpg_keys.sh \ - && for PYTHON_VERSION in 3.7.8 3.8.5; do \ - wget --no-check-certificate -O python-${PYTHON_VERSION}.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz" \ - && wget --no-check-certificate -O python-${PYTHON_VERSION}.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc" \ - && gpg --batch --verify python-${PYTHON_VERSION}.tar.xz.asc python-${PYTHON_VERSION}.tar.xz \ - && rm -r python-${PYTHON_VERSION}.tar.xz.asc \ - && mkdir -p /usr/src/python-${PYTHON_VERSION} \ - && tar -xJC /usr/src/python-${PYTHON_VERSION} --strip-components=1 -f python-${PYTHON_VERSION}.tar.xz \ - && rm python-${PYTHON_VERSION}.tar.xz \ - && cd /usr/src/python-${PYTHON_VERSION} \ - && ./configure \ - --enable-shared \ - # This works only on Python 2.7 and throws a warning on every other - 
# version, but seems otherwise harmless. - --enable-unicode=ucs4 \ - --with-system-ffi \ - --without-ensurepip \ - && make -j$(nproc) \ - && make install \ - && ldconfig \ - ; done \ - && rm -rf "${GNUPGHOME}" \ - && rm -rf /usr/src/python* \ - && rm -rf ~/.cache/ - RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \ - && python3.7 /tmp/get-pip.py \ && python3.8 /tmp/get-pip.py \ && rm /tmp/get-pip.py -CMD ["python3.7"] +CMD ["python3.8"] diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index 8f9807f72..0c99ae611 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -30,7 +30,7 @@ env_vars: { env_vars: { key: "V2_STAGING_BUCKET" - value: "docs-staging-v2-staging" + value: "docs-staging-v2" } # It will upload the docker image after successful builds. diff --git a/.kokoro/docs/docs-presubmit.cfg b/.kokoro/docs/docs-presubmit.cfg index 111810782..08adb2e28 100644 --- a/.kokoro/docs/docs-presubmit.cfg +++ b/.kokoro/docs/docs-presubmit.cfg @@ -15,3 +15,14 @@ env_vars: { key: "TRAMPOLINE_IMAGE_UPLOAD" value: "false" } + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/build.sh" +} + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "docs docfx" +} diff --git a/.kokoro/populate-secrets.sh b/.kokoro/populate-secrets.sh new file mode 100755 index 000000000..f52514257 --- /dev/null +++ b/.kokoro/populate-secrets.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright 2020 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -eo pipefail + +function now { date +"%Y-%m-%d %H:%M:%S" | tr -d '\n' ;} +function msg { println "$*" >&2 ;} +function println { printf '%s\n' "$(now) $*" ;} + + +# Populates requested secrets set in SECRET_MANAGER_KEYS from service account: +# kokoro-trampoline@cloud-devrel-kokoro-resources.iam.gserviceaccount.com +SECRET_LOCATION="${KOKORO_GFILE_DIR}/secret_manager" +msg "Creating folder on disk for secrets: ${SECRET_LOCATION}" +mkdir -p ${SECRET_LOCATION} +for key in $(echo ${SECRET_MANAGER_KEYS} | sed "s/,/ /g") +do + msg "Retrieving secret ${key}" + docker run --entrypoint=gcloud \ + --volume=${KOKORO_GFILE_DIR}:${KOKORO_GFILE_DIR} \ + gcr.io/google.com/cloudsdktool/cloud-sdk \ + secrets versions access latest \ + --project cloud-devrel-kokoro-resources \ + --secret ${key} > \ + "${SECRET_LOCATION}/${key}" + if [[ $? == 0 ]]; then + msg "Secret written to ${SECRET_LOCATION}/${key}" + else + msg "Error retrieving secret ${key}" + fi +done diff --git a/.kokoro/presubmit/prerelease-deps-3.8.cfg b/.kokoro/presubmit/prerelease-deps-3.8.cfg new file mode 100644 index 000000000..f06806baf --- /dev/null +++ b/.kokoro/presubmit/prerelease-deps-3.8.cfg @@ -0,0 +1,7 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. 
+env_vars: { + key: "NOX_SESSION" + value: "prerelease_deps" +} \ No newline at end of file diff --git a/.kokoro/presubmit/presubmit.cfg b/.kokoro/presubmit/presubmit.cfg index b158096f0..17d071cae 100644 --- a/.kokoro/presubmit/presubmit.cfg +++ b/.kokoro/presubmit/presubmit.cfg @@ -5,3 +5,7 @@ env_vars: { key: "RUN_SYSTEM_TESTS" value: "false" } +env_vars: { + key: "RUN_SNIPPETS_TESTS" + value: "false" +} diff --git a/.kokoro/presubmit/system-2.7.cfg b/.kokoro/presubmit/snippets-2.7.cfg similarity index 80% rename from .kokoro/presubmit/system-2.7.cfg rename to .kokoro/presubmit/snippets-2.7.cfg index 3b6523a19..3bd6134d2 100644 --- a/.kokoro/presubmit/system-2.7.cfg +++ b/.kokoro/presubmit/snippets-2.7.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system-2.7" -} \ No newline at end of file + value: "snippets-2.7" +} diff --git a/.kokoro/presubmit/snippets-3.8.cfg b/.kokoro/presubmit/snippets-3.8.cfg new file mode 100644 index 000000000..840d9e716 --- /dev/null +++ b/.kokoro/presubmit/snippets-3.8.cfg @@ -0,0 +1,7 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "snippets-3.8" +} diff --git a/.kokoro/release.sh b/.kokoro/release.sh index 0e58f0640..3abba6e06 100755 --- a/.kokoro/release.sh +++ b/.kokoro/release.sh @@ -26,7 +26,7 @@ python3 -m pip install --upgrade twine wheel setuptools export PYTHONUNBUFFERED=1 # Move into the package, build the distribution and upload. 
-TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google_cloud_pypi_password") +TWINE_PASSWORD=$(cat "${KOKORO_GFILE_DIR}/secret_manager/google-cloud-pypi-token") cd github/python-bigquery python3 setup.py sdist bdist_wheel -twine upload --username gcloudpypi --password "${TWINE_PASSWORD}" dist/* +twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/* diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg index 661a04481..922d7fe50 100644 --- a/.kokoro/release/common.cfg +++ b/.kokoro/release/common.cfg @@ -23,42 +23,8 @@ env_vars: { value: "github/python-bigquery/.kokoro/release.sh" } -# Fetch the token needed for reporting release status to GitHub -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "yoshi-automation-github-key" - } - } -} - -# Fetch PyPI password -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "google_cloud_pypi_password" - } - } -} - -# Fetch magictoken to use with Magic Github Proxy -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "releasetool-magictoken" - } - } -} - -# Fetch api key to use with Magic Github Proxy -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "magic-github-proxy-api-key" - } - } +# Tokens needed to report release status back to GitHub +env_vars: { + key: "SECRET_MANAGER_KEYS" + value: "releasetool-publish-reporter-app,releasetool-publish-reporter-googleapis-installation,releasetool-publish-reporter-pem,google-cloud-pypi-token" } diff --git a/.kokoro/samples/python3.6/common.cfg b/.kokoro/samples/python3.6/common.cfg index a56768eae..f3b930960 100644 --- a/.kokoro/samples/python3.6/common.cfg +++ b/.kokoro/samples/python3.6/common.cfg @@ -13,6 +13,12 @@ env_vars: { value: "py-3.6" } +# Declare build specific Cloud project. 
+env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-py36" +} + env_vars: { key: "TRAMPOLINE_BUILD_FILE" value: "github/python-bigquery/.kokoro/test-samples.sh" diff --git a/.kokoro/samples/python3.6/periodic-head.cfg b/.kokoro/samples/python3.6/periodic-head.cfg new file mode 100644 index 000000000..5aa01bab5 --- /dev/null +++ b/.kokoro/samples/python3.6/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.7/common.cfg index c93747180..fc0654565 100644 --- a/.kokoro/samples/python3.7/common.cfg +++ b/.kokoro/samples/python3.7/common.cfg @@ -13,6 +13,12 @@ env_vars: { value: "py-3.7" } +# Declare build specific Cloud project. +env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-py37" +} + env_vars: { key: "TRAMPOLINE_BUILD_FILE" value: "github/python-bigquery/.kokoro/test-samples.sh" diff --git a/.kokoro/samples/python3.7/periodic-head.cfg b/.kokoro/samples/python3.7/periodic-head.cfg new file mode 100644 index 000000000..5aa01bab5 --- /dev/null +++ b/.kokoro/samples/python3.7/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.8/common.cfg index 9808f15e3..2b0bf59b3 100644 --- a/.kokoro/samples/python3.8/common.cfg +++ b/.kokoro/samples/python3.8/common.cfg @@ -13,6 +13,12 @@ env_vars: { value: "py-3.8" } +# Declare build specific Cloud project. 
+env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-py38" +} + env_vars: { key: "TRAMPOLINE_BUILD_FILE" value: "github/python-bigquery/.kokoro/test-samples.sh" diff --git a/.kokoro/samples/python3.8/periodic-head.cfg b/.kokoro/samples/python3.8/periodic-head.cfg new file mode 100644 index 000000000..5aa01bab5 --- /dev/null +++ b/.kokoro/samples/python3.8/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.9/common.cfg b/.kokoro/samples/python3.9/common.cfg new file mode 100644 index 000000000..f179577a5 --- /dev/null +++ b/.kokoro/samples/python3.9/common.cfg @@ -0,0 +1,40 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Build logs will be here +action { + define_artifacts { + regex: "**/*sponge_log.xml" + } +} + +# Specify which tests to run +env_vars: { + key: "RUN_TESTS_SESSION" + value: "py-3.9" +} + +# Declare build specific Cloud project. +env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-py39" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples.sh" +} + +# Configure the docker image for kokoro-trampoline. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" +} + +# Download secrets for samples +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" + +# Download trampoline resources. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" + +# Use the trampoline script to run in docker. 
+build_file: "python-bigquery/.kokoro/trampoline.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.9/continuous.cfg b/.kokoro/samples/python3.9/continuous.cfg new file mode 100644 index 000000000..a1c8d9759 --- /dev/null +++ b/.kokoro/samples/python3.9/continuous.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/.kokoro/samples/python3.9/periodic-head.cfg b/.kokoro/samples/python3.9/periodic-head.cfg new file mode 100644 index 000000000..5aa01bab5 --- /dev/null +++ b/.kokoro/samples/python3.9/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.9/periodic.cfg b/.kokoro/samples/python3.9/periodic.cfg new file mode 100644 index 000000000..50fec9649 --- /dev/null +++ b/.kokoro/samples/python3.9/periodic.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "False" +} \ No newline at end of file diff --git a/.kokoro/samples/python3.9/presubmit.cfg b/.kokoro/samples/python3.9/presubmit.cfg new file mode 100644 index 000000000..a1c8d9759 --- /dev/null +++ b/.kokoro/samples/python3.9/presubmit.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/.kokoro/test-samples-against-head.sh b/.kokoro/test-samples-against-head.sh new file mode 100755 index 000000000..689948a23 --- /dev/null +++ b/.kokoro/test-samples-against-head.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# A customized test runner for samples. +# +# For periodic builds, you can specify this file for testing against head. + +# `-e` enables the script to automatically fail when a command fails +# `-o pipefail` sets the exit code to the rightmost command to exit with a non-zero +set -eo pipefail +# Enables `**` to include files nested inside sub-folders +shopt -s globstar + +cd github/python-bigquery + +exec .kokoro/test-samples-impl.sh diff --git a/.kokoro/test-samples-impl.sh b/.kokoro/test-samples-impl.sh new file mode 100755 index 000000000..311a8d54b --- /dev/null +++ b/.kokoro/test-samples-impl.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +# `-e` enables the script to automatically fail when a command fails +# `-o pipefail` sets the exit code to the rightmost command to exit with a non-zero +set -eo pipefail +# Enables `**` to include files nested inside sub-folders +shopt -s globstar + +# Exit early if samples don't exist +if ! find samples -name 'requirements.txt' | grep -q .; then + echo "No tests run. './samples/**/requirements.txt' not found" + exit 0 +fi + +# Disable buffering, so that the logs stream through. +export PYTHONUNBUFFERED=1 + +# Debug: show build environment +env | grep KOKORO + +# Install nox +python3.6 -m pip install --upgrade --quiet nox + +# Use secrets accessor service account to get secrets +if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then + gcloud auth activate-service-account \ + --key-file="${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" \ + --project="cloud-devrel-kokoro-resources" +fi + +# This script will create 3 files: +# - testing/test-env.sh +# - testing/service-account.json +# - testing/client-secrets.json +./scripts/decrypt-secrets.sh + +source ./testing/test-env.sh +export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/testing/service-account.json + +# For cloud-run session, we activate the service account for gcloud sdk. +gcloud auth activate-service-account \ + --key-file "${GOOGLE_APPLICATION_CREDENTIALS}" + +export GOOGLE_CLIENT_SECRETS=$(pwd)/testing/client-secrets.json + +echo -e "\n******************** TESTING PROJECTS ********************" + +# Switch to 'fail at end' to allow all tests to complete before exiting. +set +e +# Use RTN to return a non-zero value if the test fails. +RTN=0 +ROOT=$(pwd) +# Find all requirements.txt in the samples directory (may break on whitespace). +for file in samples/**/requirements.txt; do + cd "$ROOT" + # Navigate to the project folder.
+ file=$(dirname "$file") + cd "$file" + + echo "------------------------------------------------------------" + echo "- testing $file" + echo "------------------------------------------------------------" + + # Use nox to execute the tests for the project. + python3.6 -m nox -s "$RUN_TESTS_SESSION" + EXIT=$? + + # If this is a periodic build, send the test log to the FlakyBot. + # See https://github.com/googleapis/repo-automation-bots/tree/master/packages/flakybot. + if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"periodic"* ]]; then + chmod +x $KOKORO_GFILE_DIR/linux_amd64/flakybot + $KOKORO_GFILE_DIR/linux_amd64/flakybot + fi + + if [[ $EXIT -ne 0 ]]; then + RTN=1 + echo -e "\n Testing failed: Nox returned a non-zero exit code. \n" + else + echo -e "\n Testing completed.\n" + fi + +done +cd "$ROOT" + +# Workaround for Kokoro permissions issue: delete secrets +rm testing/{test-env.sh,client-secrets.json,service-account.json} + +exit "$RTN" diff --git a/.kokoro/test-samples.sh b/.kokoro/test-samples.sh index 905732a40..62ef534cd 100755 --- a/.kokoro/test-samples.sh +++ b/.kokoro/test-samples.sh @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +# The default test runner for samples. +# +# For periodic builds, we rewind the repo to the latest release, and +# run test-samples-impl.sh. # `-e` enables the script to automatically fail when a command fails # `-o pipefail` sets the exit code to the rightmost comment to exit with a non-zero @@ -24,81 +28,19 @@ cd github/python-bigquery # Run periodic samples tests at latest release if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"periodic"* ]]; then + # preserving the test runner implementation. + cp .kokoro/test-samples-impl.sh "${TMPDIR}/test-samples-impl.sh" + echo "--- IMPORTANT IMPORTANT IMPORTANT ---" + echo "Now we rewind the repo back to the latest release..."
LATEST_RELEASE=$(git describe --abbrev=0 --tags) git checkout $LATEST_RELEASE -fi - -# Disable buffering, so that the logs stream through. -export PYTHONUNBUFFERED=1 - -# Debug: show build environment -env | grep KOKORO - -# Install nox -python3.6 -m pip install --upgrade --quiet nox - -# Use secrets acessor service account to get secrets -if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then - gcloud auth activate-service-account \ - --key-file="${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" \ - --project="cloud-devrel-kokoro-resources" -fi - -# This script will create 3 files: -# - testing/test-env.sh -# - testing/service-account.json -# - testing/client-secrets.json -./scripts/decrypt-secrets.sh - -source ./testing/test-env.sh -export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/testing/service-account.json - -# For cloud-run session, we activate the service account for gcloud sdk. -gcloud auth activate-service-account \ - --key-file "${GOOGLE_APPLICATION_CREDENTIALS}" - -export GOOGLE_CLIENT_SECRETS=$(pwd)/testing/client-secrets.json - -echo -e "\n******************** TESTING PROJECTS ********************" - -# Switch to 'fail at end' to allow all tests to complete before exiting. -set +e -# Use RTN to return a non-zero value if the test fails. -RTN=0 -ROOT=$(pwd) -# Find all requirements.txt in the samples directory (may break on whitespace). -for file in samples/**/requirements.txt; do - cd "$ROOT" - # Navigate to the project folder. - file=$(dirname "$file") - cd "$file" - - echo "------------------------------------------------------------" - echo "- testing $file" - echo "------------------------------------------------------------" - - # Use nox to execute the tests for the project. - python3.6 -m nox -s "$RUN_TESTS_SESSION" - EXIT=$? - - # If this is a periodic build, send the test log to the Build Cop Bot. - # See https://github.com/googleapis/repo-automation-bots/tree/master/packages/buildcop. 
- if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"periodic"* ]]; then - chmod +x $KOKORO_GFILE_DIR/linux_amd64/buildcop - $KOKORO_GFILE_DIR/linux_amd64/buildcop + echo "The current head is: " + echo $(git rev-parse --verify HEAD) + echo "--- IMPORTANT IMPORTANT IMPORTANT ---" + # move back the test runner implementation if there's no file. + if [ ! -f .kokoro/test-samples-impl.sh ]; then + cp "${TMPDIR}/test-samples-impl.sh" .kokoro/test-samples-impl.sh fi +fi - if [[ $EXIT -ne 0 ]]; then - RTN=1 - echo -e "\n Testing failed: Nox returned a non-zero exit code. \n" - else - echo -e "\n Testing completed.\n" - fi - -done -cd "$ROOT" - -# Workaround for Kokoro permissions issue: delete secrets -rm testing/{test-env.sh,client-secrets.json,service-account.json} - -exit "$RTN" \ No newline at end of file +exec .kokoro/test-samples-impl.sh diff --git a/.kokoro/trampoline.sh b/.kokoro/trampoline.sh index e8c4251f3..f39236e94 100755 --- a/.kokoro/trampoline.sh +++ b/.kokoro/trampoline.sh @@ -15,9 +15,14 @@ set -eo pipefail -python3 "${KOKORO_GFILE_DIR}/trampoline_v1.py" || ret_code=$? +# Always run the cleanup script, regardless of the success of bouncing into +# the container. +function cleanup() { + chmod +x ${KOKORO_GFILE_DIR}/trampoline_cleanup.sh + ${KOKORO_GFILE_DIR}/trampoline_cleanup.sh + echo "cleanup"; +} +trap cleanup EXIT -chmod +x ${KOKORO_GFILE_DIR}/trampoline_cleanup.sh -${KOKORO_GFILE_DIR}/trampoline_cleanup.sh || true - -exit ${ret_code} +$(dirname $0)/populate-secrets.sh # Secret Manager secrets. 
+python3 "${KOKORO_GFILE_DIR}/trampoline_v1.py" \ No newline at end of file diff --git a/.kokoro/trampoline_v2.sh b/.kokoro/trampoline_v2.sh index 719bcd5ba..4af6cdc26 100755 --- a/.kokoro/trampoline_v2.sh +++ b/.kokoro/trampoline_v2.sh @@ -159,7 +159,7 @@ if [[ -n "${KOKORO_BUILD_ID:-}" ]]; then "KOKORO_GITHUB_COMMIT" "KOKORO_GITHUB_PULL_REQUEST_NUMBER" "KOKORO_GITHUB_PULL_REQUEST_COMMIT" - # For Build Cop Bot + # For FlakyBot "KOKORO_GITHUB_COMMIT_URL" "KOKORO_GITHUB_PULL_REQUEST_URL" ) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..62eb5a77d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,31 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml +- repo: https://github.com/psf/black + rev: 19.10b0 + hooks: + - id: black +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 diff --git a/.repo-metadata.json b/.repo-metadata.json index f50dbbeb2..f132056d5 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -6,6 +6,7 @@ "issue_tracker": "https://issuetracker.google.com/savedsearches/559654", "release_level": "ga", "language": "python", + "library_type": "GAPIC_COMBO", "repo": "googleapis/python-bigquery", "distribution_name": "google-cloud-bigquery", "api_id": "bigquery.googleapis.com", diff --git a/.trampolinerc b/.trampolinerc index 995ee2911..383b6ec89 100644 --- a/.trampolinerc +++ b/.trampolinerc @@ -24,6 +24,7 @@ required_envvars+=( pass_down_envvars+=( "STAGING_BUCKET" "V2_STAGING_BUCKET" + "NOX_SESSION" ) # Prevent unintentional override on the default image. 
diff --git a/CHANGELOG.md b/CHANGELOG.md index e8d367f73..8a21df6fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,544 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history + +### [2.25.1](https://www.github.com/googleapis/python-bigquery/compare/v2.25.0...v2.25.1) (2021-08-25) + + +### Bug Fixes + +* populate default `timeout` and retry after client-side timeout ([#896](https://www.github.com/googleapis/python-bigquery/issues/896)) ([b508809](https://www.github.com/googleapis/python-bigquery/commit/b508809c0f887575274309a463e763c56ddd017d)) +* use REST API in cell magic when requested ([#892](https://www.github.com/googleapis/python-bigquery/issues/892)) ([1cb3e55](https://www.github.com/googleapis/python-bigquery/commit/1cb3e55253e824e3a1da5201f6ec09065fb6b627)) + +## [2.25.0](https://www.github.com/googleapis/python-bigquery/compare/v2.24.1...v2.25.0) (2021-08-24) + + +### Features + +* Support using GeoPandas for GEOGRAPHY columns ([#848](https://www.github.com/googleapis/python-bigquery/issues/848)) ([16f65e6](https://www.github.com/googleapis/python-bigquery/commit/16f65e6ae15979217ceea6c6d398c9057a363a13)) + +### [2.24.1](https://www.github.com/googleapis/python-bigquery/compare/v2.24.0...v2.24.1) (2021-08-13) + + +### Bug Fixes + +* remove pytz dependency and require pyarrow>=3.0.0 ([#875](https://www.github.com/googleapis/python-bigquery/issues/875)) ([2cb3563](https://www.github.com/googleapis/python-bigquery/commit/2cb3563ee863edef7eaf5d04d739bcfe7bc6438e)) + +## [2.24.0](https://www.github.com/googleapis/python-bigquery/compare/v2.23.3...v2.24.0) (2021-08-11) + + +### Features + +* add support for transaction statistics ([#849](https://www.github.com/googleapis/python-bigquery/issues/849)) ([7f7b1a8](https://www.github.com/googleapis/python-bigquery/commit/7f7b1a808d50558772a0deb534ca654da65d629e)) +* make the same `Table*` instances equal to each other ([#867](https://www.github.com/googleapis/python-bigquery/issues/867)) 
([c1a3d44](https://www.github.com/googleapis/python-bigquery/commit/c1a3d4435739a21d25aa154145e36d3a7c42eeb6)) +* retry failed query jobs in `result()` ([#837](https://www.github.com/googleapis/python-bigquery/issues/837)) ([519d99c](https://www.github.com/googleapis/python-bigquery/commit/519d99c20e7d1101f76981f3de036fdf3c7a4ecc)) +* support `ScalarQueryParameterType` for `type_` argument in `ScalarQueryParameter` constructor ([#850](https://www.github.com/googleapis/python-bigquery/issues/850)) ([93d15e2](https://www.github.com/googleapis/python-bigquery/commit/93d15e2e5405c2cc6d158c4e5737361344193dbc)) + + +### Bug Fixes + +* make unicode characters work well in load_table_from_json ([#865](https://www.github.com/googleapis/python-bigquery/issues/865)) ([ad9c802](https://www.github.com/googleapis/python-bigquery/commit/ad9c8026f0e667f13dd754279f9dc40d06f4fa78)) + +### [2.23.3](https://www.github.com/googleapis/python-bigquery/compare/v2.23.2...v2.23.3) (2021-08-06) + + +### Bug Fixes + +* increase default retry deadline to 10 minutes ([#859](https://www.github.com/googleapis/python-bigquery/issues/859)) ([30770fd](https://www.github.com/googleapis/python-bigquery/commit/30770fd0575fbd5aaa70c14196a4cc54627aecd2)) + +### [2.23.2](https://www.github.com/googleapis/python-bigquery/compare/v2.23.1...v2.23.2) (2021-07-29) + + +### Dependencies + +* expand pyarrow pins to support 5.x releases ([#833](https://www.github.com/googleapis/python-bigquery/issues/833)) ([80e3a61](https://www.github.com/googleapis/python-bigquery/commit/80e3a61c60419fb19b70b664c6415cd01ba82f5b)) + +### [2.23.1](https://www.github.com/googleapis/python-bigquery/compare/v2.23.0...v2.23.1) (2021-07-28) + + +### Bug Fixes + +* `insert_rows()` accepts float column values as strings again ([#824](https://www.github.com/googleapis/python-bigquery/issues/824)) ([d9378af](https://www.github.com/googleapis/python-bigquery/commit/d9378af13add879118a1d004529b811f72c325d6)) + +##
[2.23.0](https://www.github.com/googleapis/python-bigquery/compare/v2.22.1...v2.23.0) (2021-07-27) + + +### Features + +* Update proto definitions for bigquery/v2 to support new proto fields for BQML. ([#817](https://www.github.com/googleapis/python-bigquery/issues/817)) ([fe7a902](https://www.github.com/googleapis/python-bigquery/commit/fe7a902e8b3e723ace335c9b499aea6d180a025b)) + + +### Bug Fixes + +* no longer raise a warning in `to_dataframe` if `max_results` set ([#815](https://www.github.com/googleapis/python-bigquery/issues/815)) ([3c1be14](https://www.github.com/googleapis/python-bigquery/commit/3c1be149e76b1d1d8879fdcf0924ddb1c1839e94)) +* retry ChunkedEncodingError by default ([#802](https://www.github.com/googleapis/python-bigquery/issues/802)) ([419d36d](https://www.github.com/googleapis/python-bigquery/commit/419d36d6b1887041e5795dbc8fc808890e91ab11)) + + +### Documentation + +* correct docs for `LoadJobConfig.destination_table_description` ([#810](https://www.github.com/googleapis/python-bigquery/issues/810)) ([da87fd9](https://www.github.com/googleapis/python-bigquery/commit/da87fd921cc8067b187d7985c978aac8eb58d107)) + +### [2.22.1](https://www.github.com/googleapis/python-bigquery/compare/v2.22.0...v2.22.1) (2021-07-22) + + +### Bug Fixes + +* issue a warning if buggy pyarrow is detected ([#787](https://www.github.com/googleapis/python-bigquery/issues/787)) ([e403721](https://www.github.com/googleapis/python-bigquery/commit/e403721af1373eb1f1a1c7be5b2182e3819ed1f9)) +* use a larger chunk size when loading data ([#799](https://www.github.com/googleapis/python-bigquery/issues/799)) ([b804373](https://www.github.com/googleapis/python-bigquery/commit/b804373277c1c1baa3370ebfb4783503b7ff360f)) + + +### Documentation + +* add Samples section to CONTRIBUTING.rst ([#785](https://www.github.com/googleapis/python-bigquery/issues/785)) ([e587029](https://www.github.com/googleapis/python-bigquery/commit/e58702967d572e83b4c774278818302594a511b7)) +* add sample 
to delete job metadata ([#798](https://www.github.com/googleapis/python-bigquery/issues/798)) ([be9b242](https://www.github.com/googleapis/python-bigquery/commit/be9b242f2180f5b795dfb3a168a97af1682999fd)) + +## [2.22.0](https://www.github.com/googleapis/python-bigquery/compare/v2.21.0...v2.22.0) (2021-07-19) + + +### Features + +* add `LoadJobConfig.projection_fields` to select DATASTORE_BACKUP fields ([#736](https://www.github.com/googleapis/python-bigquery/issues/736)) ([c45a738](https://www.github.com/googleapis/python-bigquery/commit/c45a7380871af3dfbd3c45524cb606c60e1a01d1)) +* add standard sql table type, update scalar type enums ([#777](https://www.github.com/googleapis/python-bigquery/issues/777)) ([b8b5433](https://www.github.com/googleapis/python-bigquery/commit/b8b5433898ec881f8da1303614780a660d94733a)) +* add support for more detailed DML stats ([#758](https://www.github.com/googleapis/python-bigquery/issues/758)) ([36fe86f](https://www.github.com/googleapis/python-bigquery/commit/36fe86f41c1a8f46167284f752a6d6bbf886a04b)) +* add support for user defined Table View Functions ([#724](https://www.github.com/googleapis/python-bigquery/issues/724)) ([8c7b839](https://www.github.com/googleapis/python-bigquery/commit/8c7b839a6ac1491c1c3b6b0e8755f4b70ed72ee3)) + + +### Bug Fixes + +* avoid possible job already exists error ([#751](https://www.github.com/googleapis/python-bigquery/issues/751)) ([45b9308](https://www.github.com/googleapis/python-bigquery/commit/45b93089f5398740413104285cc8acfd5ebc9c08)) + + +### Dependencies + +* allow 2.x versions of `google-api-core`, `google-cloud-core`, `google-resumable-media` ([#770](https://www.github.com/googleapis/python-bigquery/issues/770)) ([87a09fa](https://www.github.com/googleapis/python-bigquery/commit/87a09fa3f2a9ab35728a1ac925f9d5f2e6616c65)) + + +### Documentation + +* add loading data from Firestore backup sample ([#737](https://www.github.com/googleapis/python-bigquery/issues/737)) 
([22fd848](https://www.github.com/googleapis/python-bigquery/commit/22fd848cae4af1148040e1faa31dd15a4d674687)) + +## [2.21.0](https://www.github.com/googleapis/python-bigquery/compare/v2.20.0...v2.21.0) (2021-07-12) + + +### Features + +* Add max_results parameter to some of the `QueryJob` methods. ([#698](https://www.github.com/googleapis/python-bigquery/issues/698)) ([2a9618f](https://www.github.com/googleapis/python-bigquery/commit/2a9618f4daaa4a014161e1a2f7376844eec9e8da)) +* Add support for decimal target types. ([#735](https://www.github.com/googleapis/python-bigquery/issues/735)) ([7d2d3e9](https://www.github.com/googleapis/python-bigquery/commit/7d2d3e906a9eb161911a198fb925ad79de5df934)) +* Add support for table snapshots. ([#740](https://www.github.com/googleapis/python-bigquery/issues/740)) ([ba86b2a](https://www.github.com/googleapis/python-bigquery/commit/ba86b2a6300ae5a9f3c803beeb42bda4c522e34c)) +* Enable unsetting policy tags on schema fields. ([#703](https://www.github.com/googleapis/python-bigquery/issues/703)) ([18bb443](https://www.github.com/googleapis/python-bigquery/commit/18bb443c7acd0a75dcb57d9aebe38b2d734ff8c7)) +* Make it easier to disable best-effort deduplication with streaming inserts. ([#734](https://www.github.com/googleapis/python-bigquery/issues/734)) ([1246da8](https://www.github.com/googleapis/python-bigquery/commit/1246da86b78b03ca1aa2c45ec71649e294cfb2f1)) +* Support passing struct data to the DB API. ([#718](https://www.github.com/googleapis/python-bigquery/issues/718)) ([38b3ef9](https://www.github.com/googleapis/python-bigquery/commit/38b3ef96c3dedc139b84f0ff06885141ae7ce78c)) + + +### Bug Fixes + +* Inserting non-finite floats with `insert_rows()`. ([#728](https://www.github.com/googleapis/python-bigquery/issues/728)) ([d047419](https://www.github.com/googleapis/python-bigquery/commit/d047419879e807e123296da2eee89a5253050166)) +* Use `pandas` function to check for `NaN`. 
([#750](https://www.github.com/googleapis/python-bigquery/issues/750)) ([67bc5fb](https://www.github.com/googleapis/python-bigquery/commit/67bc5fbd306be7cdffd216f3791d4024acfa95b3)) + + +### Documentation + +* Add docs for all enums in module. ([#745](https://www.github.com/googleapis/python-bigquery/issues/745)) ([145944f](https://www.github.com/googleapis/python-bigquery/commit/145944f24fedc4d739687399a8309f9d51d43dfd)) +* Omit mention of Python 2.7 in `CONTRIBUTING.rst`. ([#706](https://www.github.com/googleapis/python-bigquery/issues/706)) ([27d6839](https://www.github.com/googleapis/python-bigquery/commit/27d6839ee8a40909e4199cfa0da8b6b64705b2e9)) + +## [2.20.0](https://www.github.com/googleapis/python-bigquery/compare/v2.19.0...v2.20.0) (2021-06-07) + + +### Features + +* support script options in query job config ([#690](https://www.github.com/googleapis/python-bigquery/issues/690)) ([1259e16](https://www.github.com/googleapis/python-bigquery/commit/1259e16394784315368e8be959c1ac097782b62e)) + +## [2.19.0](https://www.github.com/googleapis/python-bigquery/compare/v2.18.0...v2.19.0) (2021-06-06) + + +### Features + +* list_tables, list_projects, list_datasets, list_models, list_routines, and list_jobs now accept a page_size parameter to control page size ([#686](https://www.github.com/googleapis/python-bigquery/issues/686)) ([1f1c4b7](https://www.github.com/googleapis/python-bigquery/commit/1f1c4b7ba4390fc4c5c8186bc22b83b45304ca06)) + +## [2.18.0](https://www.github.com/googleapis/python-bigquery/compare/v2.17.0...v2.18.0) (2021-06-02) + + +### Features + +* add support for Parquet options ([#679](https://www.github.com/googleapis/python-bigquery/issues/679)) ([d792ce0](https://www.github.com/googleapis/python-bigquery/commit/d792ce09388a6ee3706777915dd2818d4c854f79)) + +## [2.17.0](https://www.github.com/googleapis/python-bigquery/compare/v2.16.1...v2.17.0) (2021-05-21) + + +### Features + +* detect obsolete BQ Storage extra at runtime 
([#666](https://www.github.com/googleapis/python-bigquery/issues/666)) ([bd7dbda](https://www.github.com/googleapis/python-bigquery/commit/bd7dbdae5c972b16bafc53c67911eeaa3255a880)) +* Support parameterized NUMERIC, BIGNUMERIC, STRING, and BYTES types ([#673](https://www.github.com/googleapis/python-bigquery/issues/673)) ([45421e7](https://www.github.com/googleapis/python-bigquery/commit/45421e73bfcddb244822e6a5cd43be6bd1ca2256)) + + +### Bug Fixes + +* **tests:** invalid path to strptime() ([#672](https://www.github.com/googleapis/python-bigquery/issues/672)) ([591cdd8](https://www.github.com/googleapis/python-bigquery/commit/591cdd851bb1321b048a05a378a0ef48d3ade462)) + +### [2.16.1](https://www.github.com/googleapis/python-bigquery/compare/v2.16.0...v2.16.1) (2021-05-12) + + +### Bug Fixes + +* executemany rowcount only reflected the last execution ([#660](https://www.github.com/googleapis/python-bigquery/issues/660)) ([aeadc8c](https://www.github.com/googleapis/python-bigquery/commit/aeadc8c2d614bb9f0883ec901fca48930f3aaf19)) + +## [2.16.0](https://www.github.com/googleapis/python-bigquery/compare/v2.15.0...v2.16.0) (2021-05-05) + + +### Features + +* add with_name() to ScalarQueryParameterType ([#644](https://www.github.com/googleapis/python-bigquery/issues/644)) ([6cc6876](https://www.github.com/googleapis/python-bigquery/commit/6cc6876eb0e5bf49fdc047256a945dcf1b289576)) + + +### Dependencies + +* expand supported pyarrow versions to v4 ([#643](https://www.github.com/googleapis/python-bigquery/issues/643)) ([9e1d386](https://www.github.com/googleapis/python-bigquery/commit/9e1d3869c2024fe7a8af57ff59838d904ca5db03)) + +## [2.15.0](https://www.github.com/googleapis/python-bigquery/compare/v2.14.0...v2.15.0) (2021-04-29) + + +### Features + +* Extended DB API parameter syntax to optionally provide parameter types ([#626](https://www.github.com/googleapis/python-bigquery/issues/626)) 
([8bcf397](https://www.github.com/googleapis/python-bigquery/commit/8bcf397fbe2527e06317741875a059b109cfcd9c)) + + +### Bug Fixes + +* add DECIMAL and BIGDECIMAL as aliases for NUMERIC and BIGNUMERIC ([#638](https://www.github.com/googleapis/python-bigquery/issues/638)) ([aa59023](https://www.github.com/googleapis/python-bigquery/commit/aa59023317b1c63720fb717b3544f755652da58d)) +* The DB API Binary function accepts bytes data ([#630](https://www.github.com/googleapis/python-bigquery/issues/630)) ([4396e70](https://www.github.com/googleapis/python-bigquery/commit/4396e70771af6889d3242c37c5ff2e80241023a2)) + +## [2.14.0](https://www.github.com/googleapis/python-bigquery/compare/v2.13.1...v2.14.0) (2021-04-26) + + +### Features + +* accept DatasetListItem where DatasetReference is accepted ([#597](https://www.github.com/googleapis/python-bigquery/issues/597)) ([c8b5581](https://www.github.com/googleapis/python-bigquery/commit/c8b5581ea3c94005d69755c4a3b5a0d8900f3fe2)) +* accept job object as argument to `get_job` and `cancel_job` ([#617](https://www.github.com/googleapis/python-bigquery/issues/617)) ([f75dcdf](https://www.github.com/googleapis/python-bigquery/commit/f75dcdf3943b87daba60011c9a3b42e34ff81910)) +* add `Client.delete_job_metadata` method to remove job metadata ([#610](https://www.github.com/googleapis/python-bigquery/issues/610)) ([0abb566](https://www.github.com/googleapis/python-bigquery/commit/0abb56669c097c59fbffce007c702e7a55f2d9c1)) +* add `max_queue_size` argument to `RowIterator.to_dataframe_iterable` ([#575](https://www.github.com/googleapis/python-bigquery/issues/575)) ([f95f415](https://www.github.com/googleapis/python-bigquery/commit/f95f415d3441b3928f6cc705cb8a75603d790fd6)) +* add type hints for public methods ([#613](https://www.github.com/googleapis/python-bigquery/issues/613)) ([f8d4aaa](https://www.github.com/googleapis/python-bigquery/commit/f8d4aaa335a0eef915e73596fc9b43b11d11be9f)) +* DB API cursors are now iterable 
([#618](https://www.github.com/googleapis/python-bigquery/issues/618)) ([e0b373d](https://www.github.com/googleapis/python-bigquery/commit/e0b373d0e721a70656ed8faceb7f5c70f642d144)) +* retry google.auth TransportError by default ([#624](https://www.github.com/googleapis/python-bigquery/issues/624)) ([34ecc3f](https://www.github.com/googleapis/python-bigquery/commit/34ecc3f1ca0ff073330c0c605673d89b43af7ed9)) +* use pyarrow stream compression, if available ([#593](https://www.github.com/googleapis/python-bigquery/issues/593)) ([dde9dc5](https://www.github.com/googleapis/python-bigquery/commit/dde9dc5114c2311fb76fafc5b222fff561e8abf1)) + + +### Bug Fixes + +* consistent percents handling in DB API query ([#619](https://www.github.com/googleapis/python-bigquery/issues/619)) ([6502a60](https://www.github.com/googleapis/python-bigquery/commit/6502a602337ae562652a20b20270949f2c9d5073)) +* missing license headers in new test files ([#604](https://www.github.com/googleapis/python-bigquery/issues/604)) ([df48cc5](https://www.github.com/googleapis/python-bigquery/commit/df48cc5a0be99ad39d5835652d1b7422209afc5d)) +* unsetting clustering fields on Table is now possible ([#622](https://www.github.com/googleapis/python-bigquery/issues/622)) ([33a871f](https://www.github.com/googleapis/python-bigquery/commit/33a871f06329f9bf5a6a92fab9ead65bf2bee75d)) + + +### Documentation + +* add sample to run DML query ([#591](https://www.github.com/googleapis/python-bigquery/issues/591)) ([ff2ec3a](https://www.github.com/googleapis/python-bigquery/commit/ff2ec3abe418a443cd07751c08e654f94e8b3155)) +* update the description of the return value of `_QueryResults.rows()` ([#594](https://www.github.com/googleapis/python-bigquery/issues/594)) ([8f4c0b8](https://www.github.com/googleapis/python-bigquery/commit/8f4c0b84dac3840532d7865247b8ad94b625b897)) + +### [2.13.1](https://www.github.com/googleapis/python-bigquery/compare/v2.13.0...v2.13.1) (2021-03-23) + + +### Bug Fixes + +* add ConnectionError 
to default retry ([#571](https://www.github.com/googleapis/python-bigquery/issues/571)) ([a3edb8b](https://www.github.com/googleapis/python-bigquery/commit/a3edb8b921e029e2c03d33302d408ad5d4e9d4ad)) + +## [2.13.0](https://www.github.com/googleapis/python-bigquery/compare/v2.12.0...v2.13.0) (2021-03-22) + + +### Features + +* add `ExternalConfig.connection_id` property to connect to external sources ([#560](https://www.github.com/googleapis/python-bigquery/issues/560)) ([d93986e](https://www.github.com/googleapis/python-bigquery/commit/d93986e0259952257f2571f60719b52099c29c0c)) + + +### Bug Fixes + +* avoid overly strict dependency on pyarrow 3.x ([#564](https://www.github.com/googleapis/python-bigquery/issues/564)) ([97ee6ec](https://www.github.com/googleapis/python-bigquery/commit/97ee6ec6cd4bc9f833cd506dc6d244d103654cfd)) +* avoid policy tags 403 error in `load_table_from_dataframe` ([#557](https://www.github.com/googleapis/python-bigquery/issues/557)) ([84e646e](https://www.github.com/googleapis/python-bigquery/commit/84e646e6b7087a1626e56ad51eeb130f4ddfa2fb)) + +## [2.12.0](https://www.github.com/googleapis/python-bigquery/compare/v2.11.0...v2.12.0) (2021-03-16) + + +### Features + +* make QueryJob.done() method more performant ([#544](https://www.github.com/googleapis/python-bigquery/issues/544)) ([a3ab9ef](https://www.github.com/googleapis/python-bigquery/commit/a3ab9efdd0758829845cfcb6ca0ac1f03ab44f64)) + + +### Bug Fixes + +* remove DB-API dependency on pyarrow with decimal query parameters ([#551](https://www.github.com/googleapis/python-bigquery/issues/551)) ([1b946ba](https://www.github.com/googleapis/python-bigquery/commit/1b946ba23ee7df86114c6acb338ec34e6c92af6d)) + +## [2.11.0](https://www.github.com/googleapis/python-bigquery/compare/v2.10.0...v2.11.0) (2021-03-09) + + +### Features + +* add context manager support to client ([#540](https://www.github.com/googleapis/python-bigquery/issues/540)) 
([d5c7e11](https://www.github.com/googleapis/python-bigquery/commit/d5c7e11a1dc2a149d74294bfadbae62d70573e69)) + +## [2.10.0](https://www.github.com/googleapis/python-bigquery/compare/v2.9.0...v2.10.0) (2021-02-25) + + +### Features + +* add BIGNUMERIC support ([#527](https://www.github.com/googleapis/python-bigquery/issues/527)) ([cc3394f](https://www.github.com/googleapis/python-bigquery/commit/cc3394f80934419eb00c2029bb81c92a696e7d88)) + + +### Bug Fixes + +* error using empty array of structs parameter ([#474](https://www.github.com/googleapis/python-bigquery/issues/474)) ([c1d15f4](https://www.github.com/googleapis/python-bigquery/commit/c1d15f4e5da4b7e10c00afffd59a5c7f3ded027a)) +* QueryJob.exception() *returns* the errors, not raises them ([#467](https://www.github.com/googleapis/python-bigquery/issues/467)) ([d763279](https://www.github.com/googleapis/python-bigquery/commit/d7632799769248b09a8558ba18f5025ebdd9675a)) + + +### Documentation + +* **bigquery:** Add alternative approach to setting credentials ([#517](https://www.github.com/googleapis/python-bigquery/issues/517)) ([60fbf28](https://www.github.com/googleapis/python-bigquery/commit/60fbf287b0d34d5db2e61cce7a5b42735ed43d0e)) +* explain retry behavior for DONE jobs ([#532](https://www.github.com/googleapis/python-bigquery/issues/532)) ([696c443](https://www.github.com/googleapis/python-bigquery/commit/696c443f0a6740be0767e12b706a7771bc1460c3)) + +## [2.9.0](https://www.github.com/googleapis/python-bigquery/compare/v2.8.0...v2.9.0) (2021-02-18) + + +### Features + +* add determinism level for javascript UDFs ([#522](https://www.github.com/googleapis/python-bigquery/issues/522)) ([edd3328](https://www.github.com/googleapis/python-bigquery/commit/edd3328fffa3040b2cd3a3c668c90a0e43e4c94c)) +* expose reservation usage stats on jobs ([#524](https://www.github.com/googleapis/python-bigquery/issues/524)) 
([4ffb4e0](https://www.github.com/googleapis/python-bigquery/commit/4ffb4e067abdaa54dad6eff49a7fbdb0fa358637)) + + +### Documentation + +* clarify `%%bigquery` magics and fix broken link ([#508](https://www.github.com/googleapis/python-bigquery/issues/508)) ([eedf93b](https://www.github.com/googleapis/python-bigquery/commit/eedf93b6636c5ff1bd810c6038cfeaea8ccb64d8)) +* update python contributing guide ([#514](https://www.github.com/googleapis/python-bigquery/issues/514)) ([01e851d](https://www.github.com/googleapis/python-bigquery/commit/01e851d00fc17a780375580776753d78f6d74174)) + +## [2.8.0](https://www.github.com/googleapis/python-bigquery/compare/v2.7.0...v2.8.0) (2021-02-08) + + +### Features + +* Add mTLS support to client. ([#492](https://www.github.com/googleapis/python-bigquery/issues/492)) ([1823cad](https://www.github.com/googleapis/python-bigquery/commit/1823cadee3acf95c516d0479400e4175349ea199)) + + +### Bug Fixes + +* Don't try to close closed cursors. ([#498](https://www.github.com/googleapis/python-bigquery/issues/498)) ([bf44e7b](https://www.github.com/googleapis/python-bigquery/commit/bf44e7b67d2de41c13053a4550484b9ea049db3e)) + +## [2.7.0](https://www.github.com/googleapis/python-bigquery/compare/v2.6.2...v2.7.0) (2021-01-27) + + +### Bug Fixes + +* invalid conversion of timezone-aware datetime values to JSON ([#480](https://www.github.com/googleapis/python-bigquery/issues/480)) ([61b4385](https://www.github.com/googleapis/python-bigquery/commit/61b438523d305ce66a68fde7cb49e9abbf0a8d1d)) +* reading the labels attribute on Job instances ([#471](https://www.github.com/googleapis/python-bigquery/issues/471)) ([80944f0](https://www.github.com/googleapis/python-bigquery/commit/80944f080bcc4fda870a6daf1d884de616d39ae7)) +* use explicitly given project over the client's default project for load jobs ([#482](https://www.github.com/googleapis/python-bigquery/issues/482))
([530e1e8](https://www.github.com/googleapis/python-bigquery/commit/530e1e8d8fe8939e914a78ff1b220907c1b87af7)) + + +### Dependencies + +* declare support for Python 3.9 ([#488](https://www.github.com/googleapis/python-bigquery/issues/488)) ([55daa7d](https://www.github.com/googleapis/python-bigquery/commit/55daa7da9857a8a2fb14a80a4efa3f466386a85f)) + +### [2.6.2](https://www.github.com/googleapis/python-bigquery/compare/v2.6.1...v2.6.2) (2021-01-11) + + +### Bug Fixes + +* add minimum timeout to getQueryResults API requests ([#444](https://www.github.com/googleapis/python-bigquery/issues/444)) ([015a73e](https://www.github.com/googleapis/python-bigquery/commit/015a73e1839e3427408ef6e0f879717d9ddbdb61)) +* use debug logging level for OpenTelemetry message ([#442](https://www.github.com/googleapis/python-bigquery/issues/442)) ([7ea6b7c](https://www.github.com/googleapis/python-bigquery/commit/7ea6b7c2469d2415192cfdacc379e38e49d24775)) + + +### Documentation + +* add GEOGRAPHY data type code samples ([#428](https://www.github.com/googleapis/python-bigquery/issues/428)) ([dbc68b3](https://www.github.com/googleapis/python-bigquery/commit/dbc68b3d1f325f80d24a2da5f028b0f653fb0317)) +* fix Shapely import in GEOGRAPHY sample ([#431](https://www.github.com/googleapis/python-bigquery/issues/431)) ([96a1c5b](https://www.github.com/googleapis/python-bigquery/commit/96a1c5b3c72855ba6ae8c88dfd0cdb02d2faf909)) +* move and refresh view samples ([#420](https://www.github.com/googleapis/python-bigquery/issues/420)) ([079b6a1](https://www.github.com/googleapis/python-bigquery/commit/079b6a162f6929bf801366d92f8daeb3318426c4)) + +### [2.6.1](https://www.github.com/googleapis/python-bigquery/compare/v2.6.0...v2.6.1) (2020-12-09) + + +### Bug Fixes + +* handle null values in array query parameters ([#426](https://www.github.com/googleapis/python-bigquery/issues/426)) ([78fde4a](https://www.github.com/googleapis/python-bigquery/commit/78fde4a92e61a89d0b490b93acc90fff9635d1bf)) + + +### 
Documentation + +* add examples of `fields` argument to update methods ([#418](https://www.github.com/googleapis/python-bigquery/issues/418)) ([8c7e02b](https://www.github.com/googleapis/python-bigquery/commit/8c7e02b0de2c92ee965414e7c430eb57d1877326)) + +## [2.6.0](https://www.github.com/googleapis/python-bigquery/compare/v2.5.0...v2.6.0) (2020-12-07) + + +### Features + +* add support for materialized views ([#408](https://www.github.com/googleapis/python-bigquery/issues/408)) ([57ffc66](https://www.github.com/googleapis/python-bigquery/commit/57ffc665319331e0a00583d5d652fd14a510cf2a)), closes [#407](https://www.github.com/googleapis/python-bigquery/issues/407) +* convert `BIGNUMERIC` values to decimal objects ([#414](https://www.github.com/googleapis/python-bigquery/issues/414)) ([d472d2d](https://www.github.com/googleapis/python-bigquery/commit/d472d2d2b33e40b954652d31476dea8c90e6a2dc)), closes [#367](https://www.github.com/googleapis/python-bigquery/issues/367) +* support CSV format in `load_table_from_dataframe` pandas connector ([#399](https://www.github.com/googleapis/python-bigquery/issues/399)) ([0046742](https://www.github.com/googleapis/python-bigquery/commit/0046742abdd2b5eab3c3e935316f91e7eef44d44)) + + +### Bug Fixes + +* preserve timestamp microsecond precision with rows from REST API ([#402](https://www.github.com/googleapis/python-bigquery/issues/402)) ([04510a7](https://www.github.com/googleapis/python-bigquery/commit/04510a7dc7570466550bbdf500d7020bef2af44d)) + + +### Documentation + +* update intersphinx links ([#404](https://www.github.com/googleapis/python-bigquery/issues/404)) ([a9d8ae8](https://www.github.com/googleapis/python-bigquery/commit/a9d8ae8a920dec655b77dca9d9128e569f1d07a7)) + +## [2.5.0](https://www.github.com/googleapis/python-bigquery/compare/v2.4.0...v2.5.0) (2020-12-02) + + +### Features + +* add `TableReference.__str__` to get table ID in standard SQL ([#405](https://www.github.com/googleapis/python-bigquery/issues/405)) 
([53dff2a](https://www.github.com/googleapis/python-bigquery/commit/53dff2ad3889af04369a22437e6ab9b92c5755b6)), closes [#354](https://www.github.com/googleapis/python-bigquery/issues/354) +* add progress bar for magics ([#396](https://www.github.com/googleapis/python-bigquery/issues/396)) ([04d0273](https://www.github.com/googleapis/python-bigquery/commit/04d027317a99e3f353e0b7a18076da9b6ba4d8d3)) +* add support for unrecognized model types ([#401](https://www.github.com/googleapis/python-bigquery/issues/401)) ([168f035](https://www.github.com/googleapis/python-bigquery/commit/168f0354c4815bd1aeadbd4e388dcc9b32f97d6b)) + + +### Bug Fixes + +* avoid floating point for timestamp in `insert_rows` ([#393](https://www.github.com/googleapis/python-bigquery/issues/393)) ([a1949ae](https://www.github.com/googleapis/python-bigquery/commit/a1949ae20ec4f9c771b0cffbcd70792dd6a30dbf)) + + +### Performance Improvements + +* don't fetch rows when waiting for query to finish ([#400](https://www.github.com/googleapis/python-bigquery/issues/400)) ([730df17](https://www.github.com/googleapis/python-bigquery/commit/730df17ae1ab0b0bb2454f3c134c8f62665bc51b)), closes [#374](https://www.github.com/googleapis/python-bigquery/issues/374) [#394](https://www.github.com/googleapis/python-bigquery/issues/394) + + +### Documentation + +* **samples:** add more clustering code snippets ([#330](https://www.github.com/googleapis/python-bigquery/issues/330)) ([809e4a2](https://www.github.com/googleapis/python-bigquery/commit/809e4a27b94ba30c10e0c9a7e89576a9de9fda2b)), closes [#329](https://www.github.com/googleapis/python-bigquery/issues/329) + + +### Dependencies + +* update required version of opentelemetry for opentelemetry-exporter-google-cloud ([#398](https://www.github.com/googleapis/python-bigquery/issues/398)) ([673a9cb](https://www.github.com/googleapis/python-bigquery/commit/673a9cb51c577c1dd016e76f3634b1e9e21482c5)) + +##
[2.4.0](https://www.github.com/googleapis/python-bigquery/compare/v2.3.1...v2.4.0) (2020-11-16) + + +### Features + +* add progress bar to `QueryJob.to_dataframe` and `to_arrow` ([#352](https://www.github.com/googleapis/python-bigquery/issues/352)) ([dc78edd](https://www.github.com/googleapis/python-bigquery/commit/dc78eddde7a6a312c8fed7bace7d64036837ab1a)) +* allow routine references ([#378](https://www.github.com/googleapis/python-bigquery/issues/378)) ([f9480dc](https://www.github.com/googleapis/python-bigquery/commit/f9480dc2a1bc58367083176bd74725aa8b903301)) + + +### Bug Fixes + +* **dbapi:** allow rows to be fetched from scripts ([#387](https://www.github.com/googleapis/python-bigquery/issues/387)) ([b899ad1](https://www.github.com/googleapis/python-bigquery/commit/b899ad12e17cb87c58d3ae46b4388d917c5743f2)), closes [#377](https://www.github.com/googleapis/python-bigquery/issues/377) + + +### Performance Improvements + +* avoid extra API calls from `to_dataframe` if all rows are cached ([#384](https://www.github.com/googleapis/python-bigquery/issues/384)) ([c52b317](https://www.github.com/googleapis/python-bigquery/commit/c52b31789998fc0dfde07c3296650c85104d719d)) +* cache first page of `jobs.getQueryResults` rows ([#374](https://www.github.com/googleapis/python-bigquery/issues/374)) ([86f6a51](https://www.github.com/googleapis/python-bigquery/commit/86f6a516d1c7c5dc204ab085ea2578793e6561ff)) +* use `getQueryResults` from DB-API ([#375](https://www.github.com/googleapis/python-bigquery/issues/375)) ([30de15f](https://www.github.com/googleapis/python-bigquery/commit/30de15f7255de5ea221df4e8db7991d279e0ea28)) + + +### Dependencies + +* expand pyarrow dependencies to include version 2 ([#368](https://www.github.com/googleapis/python-bigquery/issues/368)) ([cd9febd](https://www.github.com/googleapis/python-bigquery/commit/cd9febd20c34983781386c3bf603e5fca7135695)) + +## 2.3.1 + +11-05-2020 09:27 PST + +### Internal / Testing Changes + +- update 
`google.cloud.bigquery.__version__` + +## [2.3.0](https://www.github.com/googleapis/python-bigquery/compare/v2.2.0...v2.3.0) (2020-11-04) + + +### Features + +* add `reload` argument to `*Job.done()` functions ([#341](https://www.github.com/googleapis/python-bigquery/issues/341)) ([e51fd45](https://www.github.com/googleapis/python-bigquery/commit/e51fd45fdb0481ac5d59cc0edbfa0750928b2596)) +* pass retry from Job.result() to Job.done() ([#41](https://www.github.com/googleapis/python-bigquery/issues/41)) ([284e17a](https://www.github.com/googleapis/python-bigquery/commit/284e17a17adf6844a17db2c6fed54a649b1f997e)) + + +### Bug Fixes + +* add missing spaces in opentelemetry log message ([#360](https://www.github.com/googleapis/python-bigquery/issues/360)) ([4f326b1](https://www.github.com/googleapis/python-bigquery/commit/4f326b1ca4411cfbf5ded86955a963d3e05a409f)) +* **dbapi:** avoid running % format with no query parameters ([#348](https://www.github.com/googleapis/python-bigquery/issues/348)) ([5dd1a5e](https://www.github.com/googleapis/python-bigquery/commit/5dd1a5e77f13b8e576e917069e247c5390a81900)) +* create_job method accepts dictionary arguments ([#300](https://www.github.com/googleapis/python-bigquery/issues/300)) ([155bacc](https://www.github.com/googleapis/python-bigquery/commit/155bacc156f181384ca6dba699ab83d0398176d1)) + + +### Performance Improvements + +* use `jobs.getQueryResults` to download result sets ([#363](https://www.github.com/googleapis/python-bigquery/issues/363)) ([0c3476d](https://www.github.com/googleapis/python-bigquery/commit/0c3476d56380d70115f6fd765bf5c5261967052f)) + + +### Documentation + +* add documents for QueryPlanEntry and QueryPlanEntryStep ([#344](https://www.github.com/googleapis/python-bigquery/issues/344)) ([dca2e4c](https://www.github.com/googleapis/python-bigquery/commit/dca2e4ca7c2ae183ac4bb60f653d425a43a86bea)) + +## [2.2.0](https://www.github.com/googleapis/python-bigquery/compare/v2.1.0...v2.2.0) (2020-10-19) + + +### 
Features + +* add method api_repr for table list item ([#299](https://www.github.com/googleapis/python-bigquery/issues/299)) ([07c70f0](https://www.github.com/googleapis/python-bigquery/commit/07c70f0292f9212f0c968cd5c9206e8b0409c0da)) +* add support for listing arima, automl, boosted tree, DNN, and matrix factorization models ([#328](https://www.github.com/googleapis/python-bigquery/issues/328)) ([502a092](https://www.github.com/googleapis/python-bigquery/commit/502a0926018abf058cb84bd18043c25eba15a2cc)) +* add timeout parameter to load_table_from_file and its dependent methods ([#327](https://www.github.com/googleapis/python-bigquery/issues/327)) ([b0dd892](https://www.github.com/googleapis/python-bigquery/commit/b0dd892176e31ac25fddd15554b5bfa054299d4d)) +* add to_api_repr method to Model ([#326](https://www.github.com/googleapis/python-bigquery/issues/326)) ([fb401bd](https://www.github.com/googleapis/python-bigquery/commit/fb401bd94477323bba68cf252dd88166495daf54)) +* allow client options to be set in magics context ([#322](https://www.github.com/googleapis/python-bigquery/issues/322)) ([5178b55](https://www.github.com/googleapis/python-bigquery/commit/5178b55682f5e264bfc082cde26acb1fdc953a18)) + + +### Bug Fixes + +* make TimePartitioning repr evaluable ([#110](https://www.github.com/googleapis/python-bigquery/issues/110)) ([20f473b](https://www.github.com/googleapis/python-bigquery/commit/20f473bfff5ae98377f5d9cdf18bfe5554d86ff4)), closes [#109](https://www.github.com/googleapis/python-bigquery/issues/109) +* use version.py instead of pkg_resources.get_distribution ([#307](https://www.github.com/googleapis/python-bigquery/issues/307)) ([b8f502b](https://www.github.com/googleapis/python-bigquery/commit/b8f502b14f21d1815697e4d57cf1225dfb4a7c5e)) + + +### Performance Improvements + +* add size parameter for load table from dataframe and json methods ([#280](https://www.github.com/googleapis/python-bigquery/issues/280)) 
([3be78b7](https://www.github.com/googleapis/python-bigquery/commit/3be78b737add7111e24e912cd02fc6df75a07de6)) + + +### Documentation + +* update clustering field docstrings ([#286](https://www.github.com/googleapis/python-bigquery/issues/286)) ([5ea1ece](https://www.github.com/googleapis/python-bigquery/commit/5ea1ece2d911cdd1f3d9549ee01559ce8ed8269a)), closes [#285](https://www.github.com/googleapis/python-bigquery/issues/285) +* update snippets samples to support version 2.0 ([#309](https://www.github.com/googleapis/python-bigquery/issues/309)) ([61634be](https://www.github.com/googleapis/python-bigquery/commit/61634be9bf9e3df7589fc1bfdbda87288859bb13)) + + +### Dependencies + +* add protobuf dependency ([#306](https://www.github.com/googleapis/python-bigquery/issues/306)) ([cebb5e0](https://www.github.com/googleapis/python-bigquery/commit/cebb5e0e911e8c9059bc8c9e7fce4440e518bff3)), closes [#305](https://www.github.com/googleapis/python-bigquery/issues/305) +* require pyarrow for pandas support ([#314](https://www.github.com/googleapis/python-bigquery/issues/314)) ([801e4c0](https://www.github.com/googleapis/python-bigquery/commit/801e4c0574b7e421aa3a28cafec6fd6bcce940dd)), closes [#265](https://www.github.com/googleapis/python-bigquery/issues/265) + +## [2.1.0](https://www.github.com/googleapis/python-bigquery/compare/v2.0.0...v2.1.0) (2020-10-08) + + +### Features + +* add constants for MONTH and YEAR time partitioning types ([#283](https://www.github.com/googleapis/python-bigquery/issues/283)) ([9090e1c](https://www.github.com/googleapis/python-bigquery/commit/9090e1ccd8825a97835325b4829f6e7ecfd9ea88)) + + +### Bug Fixes + +* remove unnecessary dependency on libcst ([#308](https://www.github.com/googleapis/python-bigquery/issues/308)) ([c055930](https://www.github.com/googleapis/python-bigquery/commit/c05593094c1405f752b2c51b15202a6dbb5cb83f)) + + +### Performance Improvements + +* remove redundant array deepcopy 
([#26](https://www.github.com/googleapis/python-bigquery/issues/26)) ([b54f867](https://www.github.com/googleapis/python-bigquery/commit/b54f86769c982ce5c8fcbf3889f82450428bb40c)) + + +### Documentation + +* **samples:** add create_table_clustered code snippet ([#291](https://www.github.com/googleapis/python-bigquery/issues/291)) ([d1eb8b3](https://www.github.com/googleapis/python-bigquery/commit/d1eb8b3dcc789916c5d3ba8464f62b1f8bef35ff)) + +## 2.0.0 + +09-30-2020 14:51 PDT + + +### Implementation Changes + +- Transition the library to microgenerator. ([#278](https://github.com/googleapis/python-bigquery/pull/278)) + This is a **breaking change** that **drops support for Python 2.7 and 3.5** and brings a few other changes. + See [migration guide](https://googleapis.dev/python/bigquery/latest/UPGRADING.html) for more info. + + + +### Internal / Testing Changes + +- Update protoc-generated comments (via synth). ([#270](https://github.com/googleapis/python-bigquery/pull/270)) +- Add CI secrets manager (via synth). ([#271](https://github.com/googleapis/python-bigquery/pull/271)) + ## [1.28.0](https://www.github.com/googleapis/python-bigquery/compare/v1.27.2...v1.28.0) (2020-09-22) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index b3d1f6029..039f43681 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,44 +1,95 @@ -# Contributor Code of Conduct +# Code of Conduct -As contributors and maintainers of this project, -and in the interest of fostering an open and welcoming community, -we pledge to respect all people who contribute through reporting issues, -posting feature requests, updating documentation, -submitting pull requests or patches, and other activities. 
+## Our Pledge -We are committed to making participation in this project -a harassment-free experience for everyone, -regardless of level of experience, gender, gender identity and expression, -sexual orientation, disability, personal appearance, -body size, race, ethnicity, age, religion, or nationality. +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of +experience, education, socio-economic status, nationality, personal appearance, +race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members Examples of unacceptable behavior by participants include: -* The use of sexualized language or imagery -* Personal attacks -* Trolling or insulting/derogatory comments -* Public or private harassment -* Publishing other's private information, -such as physical or electronic -addresses, without explicit permission -* Other unethical or unprofessional conduct. 
+* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct. -By adopting this Code of Conduct, -project maintainers commit themselves to fairly and consistently -applying these principles to every aspect of managing this project. -Project maintainers who do not follow or enforce the Code of Conduct -may be permanently removed from the project team. - -This code of conduct applies both within project spaces and in public spaces -when an individual is representing the project or its community. - -Instances of abusive, harassing, or otherwise unacceptable behavior -may be reported by opening an issue -or contacting one or more of the project maintainers. - -This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.2.0, -available at [http://contributor-covenant.org/version/1/2/0/](http://contributor-covenant.org/version/1/2/0/) +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, or to ban temporarily or permanently any +contributor for other behaviors that they deem inappropriate, threatening, +offensive, or harmful. 
+ +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when the Project +Steward has a reasonable belief that an individual's behavior may have a +negative impact on the project or its community. + +## Conflict Resolution + +We do not believe that all conflict is bad; healthy debate and disagreement +often yield positive results. However, it is never okay to be disrespectful or +to engage in behavior that violates the project’s code of conduct. + +If you see someone violating the code of conduct, you are encouraged to address +the behavior directly with those involved. Many issues can be resolved quickly +and easily, and this gives people more control over the outcome of their +dispute. If you are unable to resolve the matter for any reason, or if the +behavior is threatening or harassing, report it. We are dedicated to providing +an environment where participants feel welcome and safe. + + +Reports should be directed to *googleapis-stewards@google.com*, the +Project Steward(s) for *Google Cloud Client Libraries*. It is the Project Steward’s duty to +receive and address reported violations of the code of conduct. They will then +work with a committee consisting of representatives from the Open Source +Programs Office and the Google Open Source Strategy team. If for any reason you +are uncomfortable reaching out to the Project Steward, please email +opensource@google.com. + +We will investigate every complaint, but you may not receive a direct response. 
+We will use our discretion in determining when and how to follow up on reported +incidents, which may range from not taking action to permanent expulsion from +the project and project-sponsored spaces. We will notify the accused of the +report and provide them an opportunity to discuss it before any action is taken. +The identity of the reporter will be omitted from the details of the report +supplied to the accused. In potentially harmful situations, such as ongoing +harassment or threats to anyone's safety, we may take action without notice. + +## Attribution + +This Code of Conduct is adapted from the Contributor Covenant, version 1.4, +available at +https://www.contributor-covenant.org/version/1/4/code-of-conduct.html \ No newline at end of file diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 3366287d6..2faf5aed3 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -21,8 +21,8 @@ In order to add a feature: - The feature must be documented in both the API and narrative documentation. -- The feature must work fully on the following CPython versions: 2.7, - 3.5, 3.6, 3.7 and 3.8 on both UNIX and Windows. +- The feature must work fully on the following CPython versions: + 3.6, 3.7, 3.8 and 3.9 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -68,10 +68,12 @@ Using ``nox`` We use `nox `__ to instrument our tests. - To test your changes, run unit tests with ``nox``:: + $ nox -s unit + +- To run a single unit test:: + + $ nox -s unit-3.9 -- -k - $ nox -s unit-2.7 - $ nox -s unit-3.7 - $ ... .. note:: @@ -80,25 +82,6 @@ We use `nox `__ to instrument our tests. .. nox: https://pypi.org/project/nox/ -Note on Editable Installs / Develop Mode -======================================== - -- As mentioned previously, using ``setuptools`` in `develop mode`_ - or a ``pip`` `editable install`_ is not possible with this - library. 
This is because this library uses `namespace packages`_. - For context see `Issue #2316`_ and the relevant `PyPA issue`_. - - Since ``editable`` / ``develop`` mode can't be used, packages - need to be installed directly. Hence your changes to the source - tree don't get incorporated into the **already installed** - package. - -.. _namespace packages: https://www.python.org/dev/peps/pep-0420/ -.. _Issue #2316: https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2316 -.. _PyPA issue: https://github.com/pypa/packaging-problems/issues/12 -.. _develop mode: https://setuptools.readthedocs.io/en/latest/setuptools.html#development-mode -.. _editable install: https://pip.pypa.io/en/stable/reference/pip_install/#editable-installs - ***************************************** I'm getting weird errors... Can you help? ***************************************** @@ -112,8 +95,12 @@ On Debian/Ubuntu:: ************ Coding Style ************ +- We use the automatic code formatter ``black``. You can run it using + the nox session ``blacken``. This will eliminate many lint errors. Run via:: -- PEP8 compliance, with exceptions defined in the linter configuration. + $ nox -s blacken + +- PEP8 compliance is required, with exceptions defined in the linter configuration. If you have ``nox`` installed, you can test that you have not introduced any non-compliant code via:: @@ -130,6 +117,16 @@ Coding Style should point to the official ``googleapis`` checkout and the the branch should be the main branch on that remote (``master``). +- This repository contains configuration for the + `pre-commit `__ tool, which automates checking + our linters during a commit. If you have it installed on your ``$PATH``, + you can enable enforcing those checks via: + +.. 
code-block:: bash + + $ pre-commit install + pre-commit installed at .git/hooks/pre-commit + Exceptions to PEP8: - Many unit tests use a helper method, ``_call_fut`` ("FUT" is short for @@ -142,34 +139,23 @@ Running System Tests - To run system tests, you can execute:: - $ nox -s system-3.7 - $ nox -s system-2.7 + # Run all system tests + $ nox -s system + + # Run a single system test + $ nox -s system-3.8 -- -k + .. note:: - System tests are only configured to run under Python 2.7 and - Python 3.7. For expediency, we do not run them in older versions - of Python 3. + System tests are only configured to run under Python 3.8. + For expediency, we do not run them in older versions of Python 3. This alone will not run the tests. You'll need to change some local auth settings and change some configuration in your project to run all the tests. -- System tests will be run against an actual project and - so you'll need to provide some environment variables to facilitate - authentication to your project: - - - ``GOOGLE_APPLICATION_CREDENTIALS``: The path to a JSON key file; - Such a file can be downloaded directly from the developer's console by clicking - "Generate new JSON key". See private key - `docs `__ - for more details. - -- Once you have downloaded your json keys, set the environment variable - ``GOOGLE_APPLICATION_CREDENTIALS`` to the absolute path of the json file:: - - $ export GOOGLE_APPLICATION_CREDENTIALS="/Users//path/to/app_credentials.json" - +- System tests will be run against an actual project. You should use local credentials from gcloud when possible. See `Best practices for application authentication `__. Some tests require a service account. For those tests see `Authenticating as a service account `__. ************* Test Coverage @@ -191,6 +177,30 @@ Build the docs via: $ nox -s docs +************************* +Samples and code snippets +************************* + +Code samples and snippets live in the `samples/` catalogue. 
Feel free to +provide more examples, but make sure to write tests for those examples. +Each folder containing example code requires its own `noxfile.py` script +which automates testing. If you decide to create a new folder, you can +base it on the `samples/snippets` folder (providing `noxfile.py` and +the requirements files). + +The tests will run against a real Google Cloud Project, so you should +configure them just like the System Tests. + +- To run sample tests, you can execute:: + + # Run all tests in a folder + $ cd samples/snippets + $ nox -s py-3.8 + + # Run a single sample test + $ cd samples/snippets + $ nox -s py-3.8 -- -k + ******************************************** Note About ``README`` as it pertains to PyPI ******************************************** @@ -211,25 +221,24 @@ Supported Python Versions We support: -- `Python 3.5`_ - `Python 3.6`_ - `Python 3.7`_ - `Python 3.8`_ +- `Python 3.9`_ -.. _Python 3.5: https://docs.python.org/3.5/ .. _Python 3.6: https://docs.python.org/3.6/ .. _Python 3.7: https://docs.python.org/3.7/ .. _Python 3.8: https://docs.python.org/3.8/ +.. _Python 3.9: https://docs.python.org/3.9/ Supported versions can be found in our ``noxfile.py`` `config`_. .. _config: https://github.com/googleapis/python-bigquery/blob/master/noxfile.py -Python 2.7 support is deprecated. All code changes should maintain Python 2.7 compatibility until January 1, 2020. -We also explicitly decided to support Python 3 beginning with version -3.5. Reasons for this include: +We also explicitly decided to support Python 3 beginning with version 3.6. 
+Reasons for this include: - Encouraging use of newest versions of Python 3 - Taking the lead of `prominent`_ open-source `projects`_ diff --git a/LICENSE b/LICENSE index a8ee855de..d64569567 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ - Apache License + + Apache License Version 2.0, January 2004 - https://www.apache.org/licenses/ + http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION @@ -192,7 +193,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - https://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MANIFEST.in b/MANIFEST.in index e9e29d120..e783f4c62 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -16,10 +16,10 @@ # Generated by synthtool. DO NOT EDIT! include README.rst LICENSE -recursive-include google *.json *.proto +recursive-include google *.json *.proto py.typed recursive-include tests * global-exclude *.py[co] global-exclude __pycache__ # Exclude scripts for samples readmegen -prune scripts/readme-gen \ No newline at end of file +prune scripts/readme-gen diff --git a/README.rst b/README.rst index c6bc17834..8454cf9c0 100644 --- a/README.rst +++ b/README.rst @@ -52,11 +52,14 @@ dependencies. Supported Python Versions ^^^^^^^^^^^^^^^^^^^^^^^^^ -Python >= 3.5 +Python >= 3.6, < 3.10 -Deprecated Python Versions -^^^^^^^^^^^^^^^^^^^^^^^^^^ -Python == 2.7. Python 2.7 support will be removed on January 1, 2020. +Unsupported Python Versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Python == 2.7, Python == 3.5. + +The last version of this library compatible with Python 2.7 and 3.5 is +`google-cloud-bigquery==1.28.0`. 
Mac/Linux diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..8b58ae9c0 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,7 @@ +# Security Policy + +To report a security issue, please use [g.co/vulnz](https://g.co/vulnz). + +The Google Security Team will respond within 5 working days of your report on g.co/vulnz. + +We use g.co/vulnz for our intake, and do coordination and disclosure here using GitHub Security Advisory to privately discuss and fix the issue. diff --git a/UPGRADING.md b/UPGRADING.md new file mode 100644 index 000000000..a4ba0efd2 --- /dev/null +++ b/UPGRADING.md @@ -0,0 +1,59 @@ + + + +# 2.0.0 Migration Guide + +The 2.0 release of the `google-cloud-bigquery` client drops support for Python +versions below 3.6. The client surface itself has not changed, but the 1.x series +will not be receiving any more feature updates or bug fixes. You are thus +encouraged to upgrade to the 2.x series. + +If you experience issues or have questions, please file an +[issue](https://github.com/googleapis/python-bigquery/issues). + + +## Supported Python Versions + +> **WARNING**: Breaking change + +The 2.0.0 release requires Python 3.6+. + + +## Supported BigQuery Storage Clients + +The 2.0.0 release requires BigQuery Storage `>= 2.0.0`, which dropped support +for `v1beta1` and `v1beta2` versions of the BigQuery Storage API. If you want to +use a BigQuery Storage client, it must be the one supporting the `v1` API version. + + +## Changed GAPIC Enums Path + +> **WARNING**: Breaking change + +Generated GAPIC enum types have been moved under `types`. Import paths need to be +adjusted. 
+ +**Before:** +```py +from google.cloud.bigquery_v2.gapic import enums + +distance_type = enums.Model.DistanceType.COSINE +``` + +**After:** +```py +from google.cloud.bigquery_v2 import types + +distance_type = types.Model.DistanceType.COSINE +``` \ No newline at end of file diff --git a/docs/UPGRADING.md b/docs/UPGRADING.md new file mode 120000 index 000000000..01097c8c0 --- /dev/null +++ b/docs/UPGRADING.md @@ -0,0 +1 @@ +../UPGRADING.md \ No newline at end of file diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 0abaf229f..b0a295464 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -1,4 +1,20 @@ div#python2-eol { border-color: red; border-width: medium; -} \ No newline at end of file +} + +/* Ensure minimum width for 'Parameters' / 'Returns' column */ +dl.field-list > dt { + min-width: 100px +} + +/* Insert space between methods for readability */ +dl.method { + padding-top: 10px; + padding-bottom: 10px +} + +/* Insert empty space between classes */ +dl.class { + padding-bottom: 50px +} diff --git a/docs/bigquery_v2/types.rst b/docs/bigquery_v2/types.rst new file mode 100644 index 000000000..c36a83e0b --- /dev/null +++ b/docs/bigquery_v2/types.rst @@ -0,0 +1,7 @@ +Types for Google Cloud Bigquery v2 API +====================================== + +.. automodule:: google.cloud.bigquery_v2.types + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index 155606c97..59a2d8fb3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,17 @@ # -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # # google-cloud-bigquery documentation build configuration file # @@ -29,7 +42,7 @@ # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -needs_sphinx = "1.6.3" +needs_sphinx = "1.5.5" # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -39,6 +52,7 @@ "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.coverage", + "sphinx.ext.doctest", "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.viewcode", @@ -66,9 +80,9 @@ master_doc = "index" # General information about the project. -project = u"google-cloud-bigquery" -copyright = u"2019, Google" -author = u"Google APIs" +project = "google-cloud-bigquery" +copyright = "2019, Google" +author = "Google APIs" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -96,9 +110,11 @@ # directories to ignore when looking for source files. 
exclude_patterns = [ "_build", + "**/.nox/**/*", "samples/AUTHORING_GUIDE.md", "samples/CONTRIBUTING.md", "samples/snippets/README.rst", + "bigquery_v2/services.rst", # generated by the code generator ] # The reST default role (used for this markup: `text`) to use for all @@ -267,7 +283,7 @@ ( master_doc, "google-cloud-bigquery.tex", - u"google-cloud-bigquery Documentation", + "google-cloud-bigquery Documentation", author, "manual", ) @@ -302,7 +318,7 @@ ( master_doc, "google-cloud-bigquery", - u"google-cloud-bigquery Documentation", + "google-cloud-bigquery Documentation", [author], 1, ) @@ -321,7 +337,7 @@ ( master_doc, "google-cloud-bigquery", - u"google-cloud-bigquery Documentation", + "google-cloud-bigquery Documentation", author, "google-cloud-bigquery", "google-cloud-bigquery Library", @@ -344,10 +360,14 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - "python": ("http://python.readthedocs.org/en/latest/", None), - "google-auth": ("https://google-auth.readthedocs.io/en/stable", None), + "python": ("https://python.readthedocs.org/en/latest/", None), + "google-auth": ("https://googleapis.dev/python/google-auth/latest/", None), "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None,), - "grpc": ("https://grpc.io/grpc/python/", None), + "grpc": ("https://grpc.github.io/grpc/python/", None), + "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), + "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), + "pandas": ("http://pandas.pydata.org/pandas-docs/dev", None), + "geopandas": ("https://geopandas.org/", None), } diff --git a/docs/dbapi.rst b/docs/dbapi.rst index ca0256d3c..81f000bc7 100644 --- a/docs/dbapi.rst +++ b/docs/dbapi.rst @@ -4,3 +4,47 @@ DB-API Reference .. 
automodule:: google.cloud.bigquery.dbapi :members: :show-inheritance: + + +DB-API Query-Parameter Syntax +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The BigQuery DB-API uses the `qmark` `parameter style +`_ for +unnamed/positional parameters and the `pyformat` parameter style for +named parameters. + +An example of a query using unnamed parameters:: + + insert into people (name, income) values (?, ?) + +and using named parameters:: + + insert into people (name, income) values (%(name)s, %(income)s) + +Providing explicit type information +----------------------------------- + +BigQuery requires type information for parameters. The BigQuery +DB-API can usually determine parameter types for parameters based on +provided values. Sometimes, however, types can't be determined (for +example when `None` is passed) or are determined incorrectly (for +example when passing a floating-point value to a numeric column). + +The BigQuery DB-API provides an extended parameter syntax. For named +parameters, a BigQuery type is provided after the name separated by a +colon, as in:: + + insert into people (name, income) values (%(name:string)s, %(income:numeric)s) + +For unnamed parameters, use the named syntax with a type, but no +name, as in:: + + insert into people (name, income) values (%(:string)s, %(:numeric)s) + +Providing type information is the *only* way to pass `struct` data:: + + cursor.execute( + "insert into points (point) values (%(:struct)s)", + [{"x": 10, "y": 20}], + ) diff --git a/docs/enums.rst b/docs/enums.rst new file mode 100644 index 000000000..57608968a --- /dev/null +++ b/docs/enums.rst @@ -0,0 +1,6 @@ +BigQuery Enums +============== + +.. automodule:: google.cloud.bigquery.enums + :members: + :undoc-members: diff --git a/docs/gapic/v2/enums.rst b/docs/gapic/v2/enums.rst deleted file mode 100644 index 0e0f05ada..000000000 --- a/docs/gapic/v2/enums.rst +++ /dev/null @@ -1,8 +0,0 @@ -Enums for BigQuery API Client -============================= - -.. 
autoclass:: google.cloud.bigquery_v2.gapic.enums.Model - :members: - -.. autoclass:: google.cloud.bigquery_v2.gapic.enums.StandardSqlDataType - :members: diff --git a/docs/gapic/v2/types.rst b/docs/gapic/v2/types.rst deleted file mode 100644 index 99b954eca..000000000 --- a/docs/gapic/v2/types.rst +++ /dev/null @@ -1,6 +0,0 @@ -Types for BigQuery API Client -============================= - -.. automodule:: google.cloud.bigquery_v2.types - :members: - :noindex: \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 62a82e0e9..3f8ba2304 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,6 +27,16 @@ API Reference reference dbapi +Migration Guide +--------------- + +See the guide below for instructions on migrating to the 2.x release of this library. + +.. toctree:: + :maxdepth: 2 + + UPGRADING + Changelog --------- diff --git a/docs/reference.rst b/docs/reference.rst index 981059de5..d8738e67b 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -58,11 +58,17 @@ Job-Related Types job.Compression job.CreateDisposition job.DestinationFormat + job.DmlStats job.Encoding + job.OperationType + job.QueryPlanEntry + job.QueryPlanEntryStep job.QueryPriority + job.ReservationUsage job.SourceFormat job.WriteDisposition job.SchemaUpdateOption + job.TransactionInfo Dataset @@ -87,6 +93,7 @@ Table table.RangePartitioning table.Row table.RowIterator + table.SnapshotDefinition table.Table table.TableListItem table.TableReference @@ -108,9 +115,11 @@ Routine .. autosummary:: :toctree: generated + routine.DeterminismLevel routine.Routine routine.RoutineArgument routine.RoutineReference + routine.RoutineType Schema ====== @@ -129,6 +138,7 @@ Query query.ArrayQueryParameter query.ScalarQueryParameter + query.ScalarQueryParameterType query.StructQueryParameter query.UDFResource @@ -169,10 +179,11 @@ Magics Enums ===== -.. autosummary:: - :toctree: generated +.. 
toctree:: + :maxdepth: 2 + + enums - enums.StandardSqlDataTypes Encryption Configuration ======================== @@ -182,6 +193,7 @@ Encryption Configuration encryption_configuration.EncryptionConfiguration + Additional Types ================ @@ -190,5 +202,4 @@ Protocol buffer classes for working with the Models API. .. toctree:: :maxdepth: 2 - gapic/v2/enums - gapic/v2/types + bigquery_v2/types diff --git a/docs/snippets.py b/docs/snippets.py index bc6b58020..c62001fc0 100644 --- a/docs/snippets.py +++ b/docs/snippets.py @@ -26,10 +26,6 @@ import pytest -try: - import fastparquet -except (ImportError, AttributeError): - fastparquet = None try: import pandas except (ImportError, AttributeError): @@ -367,7 +363,6 @@ def test_update_table_expiration(client, to_delete): # [START bigquery_update_table_expiration] import datetime - import pytz # from google.cloud import bigquery # client = bigquery.Client() @@ -379,7 +374,9 @@ def test_update_table_expiration(client, to_delete): assert table.expires is None # set table to expire 5 days from now - expiration = datetime.datetime.now(pytz.utc) + datetime.timedelta(days=5) + expiration = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta( + days=5 + ) table.expires = expiration table = client.update_table(table, ["expires"]) # API request @@ -482,132 +479,6 @@ def test_update_table_cmek(client, to_delete): # [END bigquery_update_table_cmek] -@pytest.mark.skip( - reason=( - "update_table() is flaky " - "https://github.com/GoogleCloudPlatform/google-cloud-python/issues/5589" - ) -) -def test_manage_views(client, to_delete): - project = client.project - source_dataset_id = "source_dataset_{}".format(_millis()) - source_dataset_ref = bigquery.DatasetReference(project, source_dataset_id) - source_dataset = bigquery.Dataset(source_dataset_ref) - source_dataset = client.create_dataset(source_dataset) - to_delete.append(source_dataset) - - job_config = bigquery.LoadJobConfig() - job_config.schema = [ - 
bigquery.SchemaField("name", "STRING"), - bigquery.SchemaField("post_abbr", "STRING"), - ] - job_config.skip_leading_rows = 1 - uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" - source_table_id = "us_states" - load_job = client.load_table_from_uri( - uri, source_dataset.table(source_table_id), job_config=job_config - ) - load_job.result() - - shared_dataset_id = "shared_dataset_{}".format(_millis()) - shared_dataset_ref = bigquery.DatasetReference(project, shared_dataset_id) - shared_dataset = bigquery.Dataset(shared_dataset_ref) - shared_dataset = client.create_dataset(shared_dataset) - to_delete.append(shared_dataset) - - # [START bigquery_create_view] - # from google.cloud import bigquery - # client = bigquery.Client() - # project = 'my-project' - # source_dataset_id = 'my_source_dataset' - # source_table_id = 'us_states' - # shared_dataset_ref = bigquery.DatasetReference(project, 'my_shared_dataset') - - # This example shows how to create a shared view of a source table of - # US States. The source table contains all 50 states, while the view will - # contain only states with names starting with 'W'. - view_ref = shared_dataset_ref.table("my_shared_view") - view = bigquery.Table(view_ref) - sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "W%"' - view.view_query = sql_template.format(project, source_dataset_id, source_table_id) - view = client.create_table(view) # API request - - print("Successfully created view at {}".format(view.full_table_id)) - # [END bigquery_create_view] - - # [START bigquery_update_view_query] - # from google.cloud import bigquery - # client = bigquery.Client() - # project = 'my-project' - # source_dataset_id = 'my_source_dataset' - # source_table_id = 'us_states' - # shared_dataset_ref = bigquery.DatasetReference(project, 'my_shared_dataset') - - # This example shows how to update a shared view of a source table of - # US States. 
The view's query will be updated to contain only states with - # names starting with 'M'. - view_ref = shared_dataset_ref.table("my_shared_view") - view = bigquery.Table(view_ref) - sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "M%"' - view.view_query = sql_template.format(project, source_dataset_id, source_table_id) - view = client.update_table(view, ["view_query"]) # API request - # [END bigquery_update_view_query] - - # [START bigquery_get_view] - # from google.cloud import bigquery - # client = bigquery.Client() - # shared_dataset_id = 'my_shared_dataset' - project = client.project - shared_dataset_ref = bigquery.DatasetReference(project, shared_dataset_id) - view_ref = shared_dataset_ref.table("my_shared_view") - view = client.get_table(view_ref) # API Request - - # Display view properties - print("View at {}".format(view.full_table_id)) - print("View Query:\n{}".format(view.view_query)) - # [END bigquery_get_view] - assert view.view_query is not None - - analyst_group_email = "example-analyst-group@google.com" - # [START bigquery_grant_view_access] - # from google.cloud import bigquery - # client = bigquery.Client() - - # Assign access controls to the dataset containing the view - # shared_dataset_id = 'my_shared_dataset' - # analyst_group_email = 'data_analysts@example.com' - project = client.project - shared_dataset_ref = bigquery.DatasetReference(project, shared_dataset_id) - shared_dataset = client.get_dataset(shared_dataset_ref) # API request - access_entries = shared_dataset.access_entries - access_entries.append( - bigquery.AccessEntry("READER", "groupByEmail", analyst_group_email) - ) - shared_dataset.access_entries = access_entries - shared_dataset = client.update_dataset( - shared_dataset, ["access_entries"] - ) # API request - - # Authorize the view to access the source dataset - # project = 'my-project' - # source_dataset_id = 'my_source_dataset' - project = client.project - source_dataset_ref = 
bigquery.DatasetReference(project, source_dataset_id) - source_dataset = client.get_dataset(source_dataset_ref) # API request - view_reference = { - "projectId": project, - "datasetId": shared_dataset_id, - "tableId": "my_shared_view", - } - access_entries = source_dataset.access_entries - access_entries.append(bigquery.AccessEntry(None, "view", view_reference)) - source_dataset.access_entries = access_entries - source_dataset = client.update_dataset( - source_dataset, ["access_entries"] - ) # API request - # [END bigquery_grant_view_access] - - def test_load_table_add_column(client, to_delete): dataset_id = "load_table_add_column_{}".format(_millis()) project = client.project diff --git a/docs/usage/index.rst b/docs/usage/index.rst index ff4c9d7f1..1d3cc9f64 100644 --- a/docs/usage/index.rst +++ b/docs/usage/index.rst @@ -29,7 +29,7 @@ Integrations with Other Libraries pandas -See also, the :mod:`google.cloud.bigquery.magics` module for integrations -with Jupyter. +See also, the :mod:`google.cloud.bigquery.magics.magics` module for +integrations with Jupyter. diff --git a/docs/usage/pandas.rst b/docs/usage/pandas.rst index 9db98dfbb..92eee67cf 100644 --- a/docs/usage/pandas.rst +++ b/docs/usage/pandas.rst @@ -37,6 +37,21 @@ To retrieve table rows as a :class:`pandas.DataFrame`: :start-after: [START bigquery_list_rows_dataframe] :end-before: [END bigquery_list_rows_dataframe] + +Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame +------------------------------------------------------------ + +`GeoPandas <https://geopandas.org/>`_ adds geospatial analytics +capabilities to Pandas. To retrieve query results containing +GEOGRAPHY data as a :class:`geopandas.GeoDataFrame`: + +.. 
literalinclude:: ../samples/geography/to_geodataframe.py + :language: python + :dedent: 4 + :start-after: [START bigquery_query_results_geodataframe] + :end-before: [END bigquery_query_results_geodataframe] + + Load a Pandas DataFrame to a BigQuery Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/usage/tables.rst b/docs/usage/tables.rst index 27af7c7df..d924fe214 100644 --- a/docs/usage/tables.rst +++ b/docs/usage/tables.rst @@ -58,6 +58,15 @@ Create an empty table with the :start-after: [START bigquery_create_table] :end-before: [END bigquery_create_table] +Create a clustered table with the +:func:`~google.cloud.bigquery.client.Client.create_table` method: + +.. literalinclude:: ../samples/create_table_clustered.py + :language: python + :dedent: 4 + :start-after: [START bigquery_create_table_clustered] + :end-before: [END bigquery_create_table_clustered] + Create an integer range partitioned table with the :func:`~google.cloud.bigquery.client.Client.create_table` method: @@ -76,6 +85,23 @@ Load table data from a file with the :start-after: [START bigquery_load_from_file] :end-before: [END bigquery_load_from_file] +Creating a clustered table from a query result: + +.. literalinclude:: ../samples/client_query_destination_table_clustered.py + :language: python + :dedent: 4 + :start-after: [START bigquery_query_clustered_table] + :end-before: [END bigquery_query_clustered_table] + +Creating a clustered table when you load data with the +:func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method: + +.. 
literalinclude:: ../samples/load_table_clustered.py + :language: python + :dedent: 4 + :start-after: [START bigquery_load_table_clustered] + :end-before: [END bigquery_load_table_clustered] + Load a CSV file from Cloud Storage with the :func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method: diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index 89c5a3624..a7a0da3dd 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -28,17 +28,21 @@ """ -from pkg_resources import get_distribution +from google.cloud.bigquery import version as bigquery_version -__version__ = get_distribution("google-cloud-bigquery").version +__version__ = bigquery_version.__version__ from google.cloud.bigquery.client import Client from google.cloud.bigquery.dataset import AccessEntry from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums +from google.cloud.bigquery.enums import AutoRowIDs +from google.cloud.bigquery.enums import DecimalTargetType +from google.cloud.bigquery.enums import KeyResultStatementKind +from google.cloud.bigquery.enums import SqlTypeNames from google.cloud.bigquery.enums import StandardSqlDataTypes -from google.cloud.bigquery.exceptions import PyarrowMissingWarning +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions from google.cloud.bigquery.external_config import BigtableColumnFamily @@ -46,37 +50,48 @@ from google.cloud.bigquery.external_config import CSVOptions from google.cloud.bigquery.external_config import GoogleSheetsOptions from google.cloud.bigquery.external_config import ExternalSourceFormat +from google.cloud.bigquery.format_options import ParquetOptions from google.cloud.bigquery.job import Compression from google.cloud.bigquery.job 
import CopyJob from google.cloud.bigquery.job import CopyJobConfig from google.cloud.bigquery.job import CreateDisposition from google.cloud.bigquery.job import DestinationFormat +from google.cloud.bigquery.job import DmlStats from google.cloud.bigquery.job import Encoding from google.cloud.bigquery.job import ExtractJob from google.cloud.bigquery.job import ExtractJobConfig from google.cloud.bigquery.job import LoadJob from google.cloud.bigquery.job import LoadJobConfig +from google.cloud.bigquery.job import OperationType from google.cloud.bigquery.job import QueryJob from google.cloud.bigquery.job import QueryJobConfig from google.cloud.bigquery.job import QueryPriority from google.cloud.bigquery.job import SchemaUpdateOption +from google.cloud.bigquery.job import ScriptOptions from google.cloud.bigquery.job import SourceFormat from google.cloud.bigquery.job import UnknownJob +from google.cloud.bigquery.job import TransactionInfo from google.cloud.bigquery.job import WriteDisposition from google.cloud.bigquery.model import Model from google.cloud.bigquery.model import ModelReference from google.cloud.bigquery.query import ArrayQueryParameter +from google.cloud.bigquery.query import ArrayQueryParameterType from google.cloud.bigquery.query import ScalarQueryParameter +from google.cloud.bigquery.query import ScalarQueryParameterType from google.cloud.bigquery.query import StructQueryParameter +from google.cloud.bigquery.query import StructQueryParameterType from google.cloud.bigquery.query import UDFResource from google.cloud.bigquery.retry import DEFAULT_RETRY +from google.cloud.bigquery.routine import DeterminismLevel from google.cloud.bigquery.routine import Routine from google.cloud.bigquery.routine import RoutineArgument from google.cloud.bigquery.routine import RoutineReference +from google.cloud.bigquery.routine import RoutineType from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import PartitionRange from 
google.cloud.bigquery.table import RangePartitioning from google.cloud.bigquery.table import Row +from google.cloud.bigquery.table import SnapshotDefinition from google.cloud.bigquery.table import Table from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import TimePartitioningType @@ -92,6 +107,9 @@ "ArrayQueryParameter", "ScalarQueryParameter", "StructQueryParameter", + "ArrayQueryParameterType", + "ScalarQueryParameterType", + "StructQueryParameterType", # Datasets "Dataset", "DatasetReference", @@ -102,6 +120,7 @@ "PartitionRange", "RangePartitioning", "Row", + "SnapshotDefinition", "TimePartitioning", "TimePartitioningType", # Jobs @@ -126,25 +145,36 @@ "BigtableOptions", "BigtableColumnFamily", "BigtableColumn", + "DmlStats", "CSVOptions", "GoogleSheetsOptions", + "ParquetOptions", + "ScriptOptions", + "TransactionInfo", "DEFAULT_RETRY", # Enum Constants "enums", + "AutoRowIDs", "Compression", "CreateDisposition", + "DecimalTargetType", "DestinationFormat", + "DeterminismLevel", "ExternalSourceFormat", "Encoding", + "KeyResultStatementKind", + "OperationType", "QueryPriority", + "RoutineType", "SchemaUpdateOption", - "StandardSqlDataTypes", "SourceFormat", + "SqlTypeNames", + "StandardSqlDataTypes", "WriteDisposition", # EncryptionConfiguration "EncryptionConfiguration", - # Errors and warnings - "PyarrowMissingWarning", + # Custom exceptions + "LegacyBigQueryStorageError", ] diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 47851d42c..0a1f71444 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -15,18 +15,22 @@ """Shared helper functions for BigQuery API classes.""" import base64 -import copy import datetime import decimal +import math import re -import six +from typing import Union from google.cloud._helpers import UTC from google.cloud._helpers import _date_from_iso8601_date from google.cloud._helpers import _datetime_from_microseconds -from 
google.cloud._helpers import _microseconds_from_datetime +from google.cloud._helpers import _RFC3339_MICROS from google.cloud._helpers import _RFC3339_NO_FRACTION from google.cloud._helpers import _to_bytes +import packaging.version + +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f" _TIMEONLY_WO_MICROS = "%H:%M:%S" @@ -38,10 +42,70 @@ re.VERBOSE, ) +_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") +_BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") + + +class BQStorageVersions: + """Version comparisons for google-cloud-bigquery-storage package.""" + + def __init__(self): + self._installed_version = None + + @property + def installed_version(self) -> packaging.version.Version: + """Return the parsed version of google-cloud-bigquery-storage.""" + if self._installed_version is None: + from google.cloud import bigquery_storage + + self._installed_version = packaging.version.parse( + # Use 0.0.0, since it is earlier than any released version. + # Legacy versions also have the same property, but + # creating a LegacyVersion has been deprecated. + # https://github.com/pypa/packaging/issues/321 + getattr(bigquery_storage, "__version__", "0.0.0") + ) + + return self._installed_version + + @property + def is_read_session_optional(self) -> bool: + """True if read_session is optional to rows(). + + See: https://github.com/googleapis/python-bigquery-storage/pull/228 + """ + return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION + + def verify_version(self): + """Verify that a recent enough version of BigQuery Storage extra is + installed. + + The function assumes that google-cloud-bigquery-storage extra is + installed, and should thus be used in places where this assumption + holds. 
+ + Because `pip` can install an outdated version of this extra despite the + constraints in `setup.py`, the calling code can use this helper to + verify the version compatibility at runtime. + + Raises: + LegacyBigQueryStorageError: + If the google-cloud-bigquery-storage package is outdated. + """ + if self.installed_version < _MIN_BQ_STORAGE_VERSION: + msg = ( + "Dependency google-cloud-bigquery-storage is outdated, please upgrade " + f"it to version >= 2.0.0 (version found: {self.installed_version})." + ) + raise LegacyBigQueryStorageError(msg) + + +BQ_STORAGE_VERSIONS = BQStorageVersions() + def _not_null(value, field): """Check whether 'value' should be coerced to 'field' type.""" - return value is not None or field.mode != "NULLABLE" + return value is not None or (field is not None and field.mode != "NULLABLE") def _int_from_json(value, field): @@ -82,8 +146,8 @@ def _bytes_from_json(value, field): def _timestamp_from_json(value, field): """Coerce 'value' to a datetime, if set or not nullable.""" if _not_null(value, field): - # value will be a float in seconds, to microsecond precision, in UTC. - return _datetime_from_microseconds(1e6 * float(value)) + # value will be an integer number of microseconds, in UTC. 
+ return _datetime_from_microseconds(int(value)) def _timestamp_query_param_from_json(value, field): @@ -189,6 +253,7 @@ def _record_from_json(value, field): "FLOAT": _float_from_json, "FLOAT64": _float_from_json, "NUMERIC": _decimal_from_json, + "BIGNUMERIC": _decimal_from_json, "BOOLEAN": _bool_from_json, "BOOL": _bool_from_json, "STRING": _string_from_json, @@ -274,9 +339,15 @@ def _int_to_json(value): return value -def _float_to_json(value): +def _float_to_json(value) -> Union[None, str, float]: """Coerce 'value' to an JSON-compatible representation.""" - return value + if value is None: + return None + + if isinstance(value, str): + value = float(value) + + return str(value) if (math.isnan(value) or math.isinf(value)) else float(value) def _decimal_to_json(value): @@ -314,18 +385,23 @@ def _timestamp_to_json_parameter(value): def _timestamp_to_json_row(value): - """Coerce 'value' to an JSON-compatible representation. - - This version returns floating-point seconds value used in row data. - """ + """Coerce 'value' to an JSON-compatible representation.""" if isinstance(value, datetime.datetime): - value = _microseconds_from_datetime(value) * 1e-6 + # For naive datetime objects UTC timezone is assumed, thus we format + # those to string directly without conversion. + if value.tzinfo is not None: + value = value.astimezone(UTC) + value = value.strftime(_RFC3339_MICROS) return value def _datetime_to_json(value): """Coerce 'value' to an JSON-compatible representation.""" if isinstance(value, datetime.datetime): + # For naive datetime objects UTC timezone is assumed, thus we format + # those to string directly without conversion. 
+ if value.tzinfo is not None: + value = value.astimezone(UTC) value = value.strftime(_RFC3339_MICROS_NO_ZULU) return value @@ -351,6 +427,7 @@ def _time_to_json(value): "FLOAT": _float_to_json, "FLOAT64": _float_to_json, "NUMERIC": _decimal_to_json, + "BIGNUMERIC": _decimal_to_json, "BOOLEAN": _bool_to_json, "BOOL": _bool_to_json, "BYTES": _bytes_to_json, @@ -358,6 +435,11 @@ def _time_to_json(value): "DATETIME": _datetime_to_json, "DATE": _date_to_json, "TIME": _time_to_json, + # Make sure DECIMAL and BIGDECIMAL are handled, even though + # requests for them should be converted to NUMERIC. Better safe + # than sorry. + "DECIMAL": _decimal_to_json, + "BIGDECIMAL": _decimal_to_json, } @@ -397,13 +479,9 @@ def _repeated_field_to_json(field, row_value): Returns: List[Any]: A list of JSON-serializable objects. """ - # Remove the REPEATED, but keep the other fields. This allows us to process - # each item as if it were a top-level field. - item_field = copy.deepcopy(field) - item_field._mode = "NULLABLE" values = [] for item in row_value: - values.append(_field_to_json(item_field, item)) + values.append(_single_field_to_json(field, item)) return values @@ -457,11 +535,38 @@ def _record_field_to_json(fields, row_value): for field_name in not_processed: value = row_value[field_name] if value is not None: - record[field_name] = six.text_type(value) + record[field_name] = str(value) return record +def _single_field_to_json(field, row_value): + """Convert a single field into JSON-serializable values. + + Ignores mode so that this can function for ARRAY / REPEATING fields + without requiring a deepcopy of the field. See: + https://github.com/googleapis/python-bigquery/issues/6 + + Args: + field (google.cloud.bigquery.schema.SchemaField): + The SchemaField to use for type conversion and field name. + + row_value (Any): + Scalar or Struct to be inserted. The type + is inferred from the SchemaField's field_type. + + Returns: + Any: A JSON-serializable object. 
+ """ + if row_value is None: + return None + + if field.field_type == "RECORD": + return _record_field_to_json(field.fields, row_value) + + return _scalar_field_to_json(field, row_value) + + def _field_to_json(field, row_value): """Convert a field into JSON-serializable values. @@ -483,10 +588,7 @@ def _field_to_json(field, row_value): if field.mode == "REPEATED": return _repeated_field_to_json(field, row_value) - if field.field_type == "RECORD": - return _record_field_to_json(field.fields, row_value) - - return _scalar_field_to_json(field, row_value) + return _single_field_to_json(field, row_value) def _snake_to_camel_case(value): diff --git a/google/cloud/bigquery/_http.py b/google/cloud/bigquery/_http.py index 8ee633e64..81e7922e6 100644 --- a/google/cloud/bigquery/_http.py +++ b/google/cloud/bigquery/_http.py @@ -14,11 +14,22 @@ """Create / interact with Google BigQuery connections.""" -from google.cloud import _http +import os +import pkg_resources +from google.cloud import _http # pytype: disable=import-error from google.cloud.bigquery import __version__ +# TODO: Increase the minimum version of google-cloud-core to 1.6.0 +# and remove this logic. See: +# https://github.com/googleapis/python-bigquery/issues/509 +if os.getenv("GOOGLE_API_USE_CLIENT_CERTIFICATE") == "true": # pragma: NO COVER + release = pkg_resources.get_distribution("google-cloud-core").parsed_version + if release < pkg_resources.parse_version("1.6.0"): + raise ImportError("google-cloud-core >= 1.6.0 is required to use mTLS feature") + + class Connection(_http.JSONConnection): """A connection to Google BigQuery via the JSON REST API. @@ -26,13 +37,18 @@ class Connection(_http.JSONConnection): client (google.cloud.bigquery.client.Client): The client that owns the current connection. client_info (Optional[google.api_core.client_info.ClientInfo]): Instance used to generate user agent. + + api_endpoint (str): The api_endpoint to use. If None, the library will decide what endpoint to use. 
""" DEFAULT_API_ENDPOINT = "https://bigquery.googleapis.com" + DEFAULT_API_MTLS_ENDPOINT = "https://bigquery.mtls.googleapis.com" - def __init__(self, client, client_info=None, api_endpoint=DEFAULT_API_ENDPOINT): + def __init__(self, client, client_info=None, api_endpoint=None): super(Connection, self).__init__(client, client_info) - self.API_BASE_URL = api_endpoint + self.API_BASE_URL = api_endpoint or self.DEFAULT_API_ENDPOINT + self.API_BASE_MTLS_URL = self.DEFAULT_API_MTLS_ENDPOINT + self.ALLOW_AUTO_SWITCH_TO_MTLS_URL = api_endpoint is None self._client_info.gapic_version = __version__ self._client_info.client_library_version = __version__ diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 953b7d0fe..ab58b1729 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -17,20 +17,43 @@ import concurrent.futures import functools import logging +import queue import warnings -import six -from six.moves import queue - try: - from google.cloud import bigquery_storage_v1 + import pandas except ImportError: # pragma: NO COVER - bigquery_storage_v1 = None + pandas = None +else: + import numpy try: - import pandas + # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` + from shapely.geometry.base import BaseGeometry as _BaseGeometry except ImportError: # pragma: NO COVER - pandas = None + # No shapely, use NoneType for _BaseGeometry as a placeholder. + _BaseGeometry = type(None) +else: + if pandas is not None: # pragma: NO COVER + + def _to_wkb(): + # Create a closure that: + # - Adds a not-null check. This allows the returned function to + # be used directly with apply, unlike `shapely.wkb.dumps`. + # - Avoid extra work done by `shapely.wkb.dumps` that we don't need. + # - Caches the WKBWriter (and write method lookup :) ) + # - Avoids adding WKBWriter, lgeos, and notnull to the module namespace. 
+ from shapely.geos import WKBWriter, lgeos + + write = WKBWriter(lgeos).write + notnull = pandas.notnull + + def _to_wkb(v): + return write(v) if notnull(v) else v + + return _to_wkb + + _to_wkb = _to_wkb() try: import pyarrow @@ -38,6 +61,15 @@ except ImportError: # pragma: NO COVER pyarrow = None +try: + from google.cloud.bigquery_storage import ArrowSerializationOptions +except ImportError: + _ARROW_COMPRESSION_SUPPORT = False +else: + # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too. + _ARROW_COMPRESSION_SUPPORT = True + +from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema @@ -50,6 +82,8 @@ _PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds. +_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads + _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", "datetime64[ns, UTC]": "TIMESTAMP", @@ -65,6 +99,7 @@ "uint8": "INTEGER", "uint16": "INTEGER", "uint32": "INTEGER", + "geometry": "GEOGRAPHY", } @@ -86,6 +121,12 @@ def pyarrow_numeric(): return pyarrow.decimal128(38, 9) +def pyarrow_bignumeric(): + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + return pyarrow.decimal256(76, 38) + + def pyarrow_time(): return pyarrow.time64("us") @@ -98,6 +139,7 @@ def pyarrow_timestamp(): # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py # When modifying it be sure to update it there as well. 
BQ_TO_ARROW_SCALARS = { + "BIGNUMERIC": pyarrow_bignumeric, "BOOL": pyarrow.bool_, "BOOLEAN": pyarrow.bool_, "BYTES": pyarrow.binary, @@ -134,9 +176,10 @@ def pyarrow_timestamp(): pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() - pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal128 instances. + # the type ID matters, and it's the same for all decimal256 instances. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", } else: # pragma: NO COVER @@ -181,14 +224,16 @@ def bq_to_arrow_data_type(field): return data_type_constructor() -def bq_to_arrow_field(bq_field): +def bq_to_arrow_field(bq_field, array_type=None): """Return the Arrow field, corresponding to a given BigQuery column. Returns: None: if the Arrow type cannot be determined. 
""" arrow_type = bq_to_arrow_data_type(bq_field) - if arrow_type: + if arrow_type is not None: + if array_type is not None: + arrow_type = array_type # For GEOGRAPHY, at least initially is_nullable = bq_field.mode.upper() == "NULLABLE" return pyarrow.field(bq_field.name, arrow_type, nullable=is_nullable) @@ -213,7 +258,24 @@ def bq_to_arrow_schema(bq_schema): def bq_to_arrow_array(series, bq_field): - arrow_type = bq_to_arrow_data_type(bq_field) + if bq_field.field_type.upper() == "GEOGRAPHY": + arrow_type = None + first = _first_valid(series) + if first is not None: + if series.dtype.name == "geometry" or isinstance(first, _BaseGeometry): + arrow_type = pyarrow.binary() + # Convert shapey geometry to WKB binary format: + series = series.apply(_to_wkb) + elif isinstance(first, bytes): + arrow_type = pyarrow.binary() + elif series.dtype.name == "geometry": + # We have a GeoSeries containing all nulls, convert it to a pandas series + series = pandas.Series(numpy.array(series)) + + if arrow_type is None: + arrow_type = bq_to_arrow_data_type(bq_field) + else: + arrow_type = bq_to_arrow_data_type(bq_field) field_type_upper = bq_field.field_type.upper() if bq_field.field_type else "" @@ -267,6 +329,12 @@ def list_columns_and_indexes(dataframe): return columns_and_indexes +def _first_valid(series): + first_valid_index = series.first_valid_index() + if first_valid_index is not None: + return series.at[first_valid_index] + + def dataframe_to_bq_schema(dataframe, bq_schema): """Convert a pandas DataFrame schema to a BigQuery schema. @@ -287,14 +355,6 @@ def dataframe_to_bq_schema(dataframe, bq_schema): """ if bq_schema: bq_schema = schema._to_schema_fields(bq_schema) - if six.PY2: - for field in bq_schema: - if field.field_type in schema._STRUCT_TYPES: - raise ValueError( - "Uploading dataframes with struct (record) column types " - "is not supported under Python2. 
See: " - "https://github.com/googleapis/python-bigquery/issues/21" - ) bq_schema_index = {field.name: field for field in bq_schema} bq_schema_unused = set(bq_schema_index.keys()) else: @@ -315,6 +375,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema): # Otherwise, try to automatically determine the type based on the # pandas dtype. bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) + if bq_type is None: + sample_data = _first_valid(dataframe[column]) + if ( + isinstance(sample_data, _BaseGeometry) + and sample_data is not None # Paranoia + ): + bq_type = "GEOGRAPHY" bq_field = schema.SchemaField(column, bq_type) bq_schema_out.append(bq_field) @@ -363,6 +430,7 @@ def augment_schema(dataframe, current_bq_schema): Returns: Optional[Sequence[google.cloud.bigquery.schema.SchemaField]] """ + # pytype: disable=attribute-error augmented_schema = [] unknown_type_fields = [] @@ -396,6 +464,7 @@ def augment_schema(dataframe, current_bq_schema): return None return augmented_schema + # pytype: enable=attribute-error def dataframe_to_arrow(dataframe, bq_schema): @@ -444,11 +513,11 @@ def dataframe_to_arrow(dataframe, bq_schema): arrow_names = [] arrow_fields = [] for bq_field in bq_schema: - arrow_fields.append(bq_to_arrow_field(bq_field)) arrow_names.append(bq_field.name) arrow_arrays.append( bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field) ) + arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type)) if all((field is not None for field in arrow_fields)): return pyarrow.Table.from_arrays( @@ -487,7 +556,7 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression) -def _tabledata_list_page_to_arrow(page, column_names, arrow_types): +def _row_iterator_page_to_arrow(page, column_names, arrow_types): # Iterate over the page to force the API request to get the page data. 
try: next(iter(page)) @@ -503,8 +572,8 @@ def _tabledata_list_page_to_arrow(page, column_names, arrow_types): return pyarrow.RecordBatch.from_arrays(arrays, names=column_names) -def download_arrow_tabledata_list(pages, bq_schema): - """Use tabledata.list to construct an iterable of RecordBatches. +def download_arrow_row_iterator(pages, bq_schema): + """Use HTTP JSON RowIterator to construct an iterable of RecordBatches. Args: pages (Iterator[:class:`google.api_core.page_iterator.Page`]): @@ -523,10 +592,10 @@ def download_arrow_tabledata_list(pages, bq_schema): arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema] for page in pages: - yield _tabledata_list_page_to_arrow(page, column_names, arrow_types) + yield _row_iterator_page_to_arrow(page, column_names, arrow_types) -def _tabledata_list_page_to_dataframe(page, column_names, dtypes): +def _row_iterator_page_to_dataframe(page, column_names, dtypes): # Iterate over the page to force the API request to get the page data. try: next(iter(page)) @@ -541,8 +610,8 @@ def _tabledata_list_page_to_dataframe(page, column_names, dtypes): return pandas.DataFrame(columns, columns=column_names) -def download_dataframe_tabledata_list(pages, bq_schema, dtypes): - """Use (slower, but free) tabledata.list to construct a DataFrame. +def download_dataframe_row_iterator(pages, bq_schema, dtypes): + """Use HTTP JSON RowIterator to construct a DataFrame. 
Args: pages (Iterator[:class:`google.api_core.page_iterator.Page`]): @@ -562,7 +631,7 @@ def download_dataframe_tabledata_list(pages, bq_schema, dtypes): bq_schema = schema._to_schema_fields(bq_schema) column_names = [field.name for field in bq_schema] for page in pages: - yield _tabledata_list_page_to_dataframe(page, column_names, dtypes) + yield _row_iterator_page_to_dataframe(page, column_names, dtypes) def _bqstorage_page_to_arrow(page): @@ -578,19 +647,14 @@ def _bqstorage_page_to_dataframe(column_names, dtypes, page): def _download_table_bqstorage_stream( download_state, bqstorage_client, session, stream, worker_queue, page_to_item ): - # Passing a BQ Storage client in implies that the BigQuery Storage library - # is available and can be imported. - from google.cloud import bigquery_storage_v1beta1 - - # We want to preserve comaptibility with the v1beta1 BQ Storage clients, - # thus adjust constructing the rowstream if needed. - # The assumption is that the caller provides a BQ Storage `session` that is - # compatible with the version of the BQ Storage client passed in. - if isinstance(bqstorage_client, bigquery_storage_v1beta1.BigQueryStorageClient): - position = bigquery_storage_v1beta1.types.StreamPosition(stream=stream) - rowstream = bqstorage_client.read_rows(position).rows(session) + reader = bqstorage_client.read_rows(stream.name) + + # Avoid deprecation warnings for passing in unnecessary read session. 
+ # https://github.com/googleapis/python-bigquery-storage/issues/229 + if _helpers.BQ_STORAGE_VERSIONS.is_read_session_optional: + rowstream = reader.rows() else: - rowstream = bqstorage_client.read_rows(stream.name).rows(session) + rowstream = reader.rows(session) for page in rowstream.pages: if download_state.done: @@ -620,13 +684,13 @@ def _download_table_bqstorage( preserve_order=False, selected_fields=None, page_to_item=None, + max_queue_size=_MAX_QUEUE_SIZE_DEFAULT, ): """Use (faster, but billable) BQ Storage API to construct DataFrame.""" # Passing a BQ Storage client in implies that the BigQuery Storage library # is available and can be imported. - from google.cloud import bigquery_storage_v1 - from google.cloud import bigquery_storage_v1beta1 + from google.cloud import bigquery_storage if "$" in table.table_id: raise ValueError( @@ -637,42 +701,24 @@ def _download_table_bqstorage( requested_streams = 1 if preserve_order else 0 - # We want to preserve comaptibility with the v1beta1 BQ Storage clients, - # thus adjust the session creation if needed. 
- if isinstance(bqstorage_client, bigquery_storage_v1beta1.BigQueryStorageClient): - warnings.warn( - "Support for BigQuery Storage v1beta1 clients is deprecated, please " - "consider upgrading the client to BigQuery Storage v1 stable version.", - category=DeprecationWarning, - ) - read_options = bigquery_storage_v1beta1.types.TableReadOptions() - - if selected_fields is not None: - for field in selected_fields: - read_options.selected_fields.append(field.name) - - session = bqstorage_client.create_read_session( - table.to_bqstorage(v1beta1=True), - "projects/{}".format(project_id), - format_=bigquery_storage_v1beta1.enums.DataFormat.ARROW, - read_options=read_options, - requested_streams=requested_streams, - ) - else: - requested_session = bigquery_storage_v1.types.ReadSession( - table=table.to_bqstorage(), - data_format=bigquery_storage_v1.enums.DataFormat.ARROW, - ) - if selected_fields is not None: - for field in selected_fields: - requested_session.read_options.selected_fields.append(field.name) - - session = bqstorage_client.create_read_session( - parent="projects/{}".format(project_id), - read_session=requested_session, - max_stream_count=requested_streams, + requested_session = bigquery_storage.types.ReadSession( + table=table.to_bqstorage(), data_format=bigquery_storage.types.DataFormat.ARROW + ) + if selected_fields is not None: + for field in selected_fields: + requested_session.read_options.selected_fields.append(field.name) + + if _ARROW_COMPRESSION_SUPPORT: + requested_session.read_options.arrow_serialization_options.buffer_compression = ( + ArrowSerializationOptions.CompressionCodec.LZ4_FRAME ) + session = bqstorage_client.create_read_session( + parent="projects/{}".format(project_id), + read_session=requested_session, + max_stream_count=requested_streams, + ) + _LOGGER.debug( "Started reading table '{}.{}.{}' with BQ Storage API session '{}'.".format( table.project, table.dataset_id, table.table_id, session.name @@ -690,7 +736,17 @@ def 
_download_table_bqstorage( download_state = _DownloadState() # Create a queue to collect frames as they are created in each thread. - worker_queue = queue.Queue() + # + # The queue needs to be bounded by default, because if the user code processes the + # fetched result pages too slowly, while at the same time new pages are rapidly being + # fetched from the server, the queue can grow to the point where the process runs + # out of memory. + if max_queue_size is _MAX_QUEUE_SIZE_DEFAULT: + max_queue_size = total_streams + elif max_queue_size is None: + max_queue_size = 0 # unbounded + + worker_queue = queue.Queue(maxsize=max_queue_size) with concurrent.futures.ThreadPoolExecutor(max_workers=total_streams) as pool: try: @@ -731,15 +787,12 @@ def _download_table_bqstorage( continue # Return any remaining values after the workers finished. - while not worker_queue.empty(): # pragma: NO COVER + while True: # pragma: NO COVER try: - # Include a timeout because even though the queue is - # non-empty, it doesn't guarantee that a subsequent call to - # get() will not block. 
- frame = worker_queue.get(timeout=_PROGRESS_INTERVAL) + frame = worker_queue.get_nowait() yield frame except queue.Empty: # pragma: NO COVER - continue + break finally: # No need for a lock because reading/replacing a variable is # defined to be an atomic operation in the Python language @@ -752,7 +805,7 @@ def _download_table_bqstorage( def download_arrow_bqstorage( - project_id, table, bqstorage_client, preserve_order=False, selected_fields=None + project_id, table, bqstorage_client, preserve_order=False, selected_fields=None, ): return _download_table_bqstorage( project_id, @@ -772,6 +825,7 @@ def download_dataframe_bqstorage( dtypes, preserve_order=False, selected_fields=None, + max_queue_size=_MAX_QUEUE_SIZE_DEFAULT, ): page_to_item = functools.partial(_bqstorage_page_to_dataframe, column_names, dtypes) return _download_table_bqstorage( @@ -781,15 +835,16 @@ def download_dataframe_bqstorage( preserve_order=preserve_order, selected_fields=selected_fields, page_to_item=page_to_item, + max_queue_size=max_queue_size, ) def dataframe_to_json_generator(dataframe): for row in dataframe.itertuples(index=False, name=None): output = {} - for column, value in six.moves.zip(dataframe.columns, row): + for column, value in zip(dataframe.columns, row): # Omit NaN values. - if value != value: + if pandas.isna(value): continue output[column] = value yield output diff --git a/google/cloud/bigquery/_tqdm_helpers.py b/google/cloud/bigquery/_tqdm_helpers.py new file mode 100644 index 000000000..99e720e2b --- /dev/null +++ b/google/cloud/bigquery/_tqdm_helpers.py @@ -0,0 +1,120 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helper functions for tqdm progress bar.""" + +import concurrent.futures +import time +import typing +from typing import Optional +import warnings + +try: + import tqdm +except ImportError: # pragma: NO COVER + tqdm = None + +if typing.TYPE_CHECKING: # pragma: NO COVER + from google.cloud.bigquery import QueryJob + from google.cloud.bigquery.table import RowIterator + +_NO_TQDM_ERROR = ( + "A progress bar was requested, but there was an error loading the tqdm " + "library. Please install tqdm to use the progress bar functionality." +) + +_PROGRESS_BAR_UPDATE_INTERVAL = 0.5 + + +def get_progress_bar(progress_bar_type, description, total, unit): + """Construct a tqdm progress bar object, if tqdm is installed.""" + if tqdm is None: + if progress_bar_type is not None: + warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3) + return None + + try: + if progress_bar_type == "tqdm": + return tqdm.tqdm(desc=description, total=total, unit=unit) + elif progress_bar_type == "tqdm_notebook": + return tqdm.tqdm_notebook(desc=description, total=total, unit=unit) + elif progress_bar_type == "tqdm_gui": + return tqdm.tqdm_gui(desc=description, total=total, unit=unit) + except (KeyError, TypeError): + # Protect ourselves from any tqdm errors. In case of + # unexpected tqdm behavior, just fall back to showing + # no progress bar. 
+ warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3) + return None + + +def wait_for_query( + query_job: "QueryJob", + progress_bar_type: Optional[str] = None, + max_results: Optional[int] = None, +) -> "RowIterator": + """Return query result and display a progress bar while the query running, if tqdm is installed. + + Args: + query_job: + The job representing the execution of the query on the server. + progress_bar_type: + The type of progress bar to use to show query progress. + max_results: + The maximum number of rows the row iterator should return. + + Returns: + A row iterator over the query results. + """ + default_total = 1 + current_stage = None + start_time = time.time() + + progress_bar = get_progress_bar( + progress_bar_type, "Query is running", default_total, "query" + ) + if progress_bar is None: + return query_job.result(max_results=max_results) + + i = 0 + while True: + if query_job.query_plan: + default_total = len(query_job.query_plan) + current_stage = query_job.query_plan[i] + progress_bar.total = len(query_job.query_plan) + progress_bar.set_description( + "Query executing stage {} and status {} : {:0.2f}s".format( + current_stage.name, current_stage.status, time.time() - start_time, + ), + ) + try: + query_result = query_job.result( + timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=max_results + ) + progress_bar.update(default_total) + progress_bar.set_description( + "Query complete after {:0.2f}s".format(time.time() - start_time), + ) + break + except concurrent.futures.TimeoutError: + query_job.reload() # Refreshes the state via a GET request. 
+ if current_stage: + if current_stage.status == "COMPLETE": + if i < default_total - 1: + progress_bar.update(i + 1) + i += 1 + continue + + progress_bar.close() + return query_result diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index d2aa45999..023346ffa 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -17,12 +17,9 @@ from __future__ import absolute_import from __future__ import division -try: - from collections import abc as collections_abc -except ImportError: # Python 2.7 - import collections as collections_abc - +from collections import abc as collections_abc import copy +import datetime import functools import gzip import io @@ -30,46 +27,73 @@ import json import math import os +import packaging.version import tempfile +from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union import uuid import warnings try: import pyarrow + + _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__) except ImportError: # pragma: NO COVER pyarrow = None -import six -from google import resumable_media +from google import resumable_media # type: ignore from google.resumable_media.requests import MultipartUpload from google.resumable_media.requests import ResumableUpload import google.api_core.client_options -import google.api_core.exceptions +import google.api_core.exceptions as core_exceptions from google.api_core.iam import Policy from google.api_core import page_iterator +from google.api_core import retry as retries import google.cloud._helpers -from google.cloud import exceptions -from google.cloud.client import ClientWithProject +from google.cloud import exceptions # pytype: disable=import-error +from google.cloud.client import ClientWithProject # pytype: disable=import-error + +try: + from google.cloud.bigquery_storage_v1.services.big_query_read.client import ( + DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO, + ) +except ImportError: + 
DEFAULT_BQSTORAGE_CLIENT_INFO = None +from google.cloud.bigquery._helpers import _del_sub_prop from google.cloud.bigquery._helpers import _get_sub_prop from google.cloud.bigquery._helpers import _record_field_to_json from google.cloud.bigquery._helpers import _str_or_none +from google.cloud.bigquery._helpers import BQ_STORAGE_VERSIONS from google.cloud.bigquery._helpers import _verify_job_config_type -from google.cloud.bigquery._helpers import _del_sub_prop from google.cloud.bigquery._http import Connection from google.cloud.bigquery import _pandas_helpers from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference -from google.cloud.bigquery.exceptions import PyarrowMissingWarning +from google.cloud.bigquery.enums import AutoRowIDs +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.opentelemetry_tracing import create_span from google.cloud.bigquery import job +from google.cloud.bigquery.job import ( + CopyJob, + CopyJobConfig, + ExtractJob, + ExtractJobConfig, + LoadJob, + LoadJobConfig, + QueryJob, + QueryJobConfig, +) from google.cloud.bigquery.model import Model from google.cloud.bigquery.model import ModelReference from google.cloud.bigquery.model import _model_arg_to_model_ref from google.cloud.bigquery.query import _QueryResults -from google.cloud.bigquery.retry import DEFAULT_RETRY +from google.cloud.bigquery.retry import ( + DEFAULT_JOB_RETRY, + DEFAULT_RETRY, + DEFAULT_TIMEOUT, +) from google.cloud.bigquery.routine import Routine from google.cloud.bigquery.routine import RoutineReference from google.cloud.bigquery.schema import SchemaField @@ -81,22 +105,31 @@ from google.cloud.bigquery.table import RowIterator -_DEFAULT_CHUNKSIZE = 1048576 # 1024 * 1024 B = 1 MB +_DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB _MAX_MULTIPART_SIZE = 5 * 1024 * 1024 _DEFAULT_NUM_RETRIES = 6 -_BASE_UPLOAD_TEMPLATE = ( - 
u"https://bigquery.googleapis.com/upload/bigquery/v2/projects/" - u"{project}/jobs?uploadType=" -) -_MULTIPART_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + u"multipart" -_RESUMABLE_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + u"resumable" -_GENERIC_CONTENT_TYPE = u"*/*" +_BASE_UPLOAD_TEMPLATE = "{host}/upload/bigquery/v2/projects/{project}/jobs?uploadType=" +_MULTIPART_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + "multipart" +_RESUMABLE_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + "resumable" +_GENERIC_CONTENT_TYPE = "*/*" _READ_LESS_THAN_SIZE = ( "Size {:d} was specified but the file-like object only had " "{:d} bytes remaining." ) _NEED_TABLE_ARGUMENT = ( "The table argument should be a table ID string, Table, or TableReference" ) +_LIST_ROWS_FROM_QUERY_RESULTS_FIELDS = "jobReference,totalRows,pageToken,rows" + +# In microbenchmarks, it's been shown that even in ideal conditions (query +# finished, local data), requests to getQueryResults can take 10+ seconds. +# In less-than-ideal situations, the response can take even longer, as it must +# be able to download a full 100+ MB row in that time. Don't let the +# connection timeout before data can be downloaded. 
+# https://github.com/googleapis/python-bigquery/issues/438 +_MIN_GET_QUERY_RESULTS_TIMEOUT = 120 + +# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414 +_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")]) class Project(object): @@ -216,8 +249,11 @@ def close(self): self._http.close() def get_service_account_email( - self, project=None, retry=DEFAULT_RETRY, timeout=None - ): + self, + project: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> str: """Get the email address of the project's BigQuery service account Note: @@ -259,8 +295,13 @@ def get_service_account_email( return api_response["email"] def list_projects( - self, max_results=None, page_token=None, retry=DEFAULT_RETRY, timeout=None - ): + self, + max_results: int = None, + page_token: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + page_size: int = None, + ) -> page_iterator.Iterator: """List projects for the project associated with this client. See @@ -268,8 +309,8 @@ def list_projects( Args: max_results (Optional[int]): - Maximum number of projects to return, If not passed, - defaults to a value set by the API. + Maximum number of projects to return. + Defaults to a value set by the API. page_token (Optional[str]): Token representing a cursor into the projects. If not passed, @@ -284,6 +325,10 @@ def list_projects( The number of seconds to wait for the underlying HTTP transport before using ``retry``. + page_size (Optional[int]): + Maximum number of projects to return in each page. + Defaults to a value set by the API. 
+ Returns: google.api_core.page_iterator.Iterator: Iterator of :class:`~google.cloud.bigquery.client.Project` @@ -298,7 +343,7 @@ def api_request(*args, **kwargs): span_attributes=span_attributes, *args, timeout=timeout, - **kwargs + **kwargs, ) return page_iterator.HTTPIterator( @@ -309,18 +354,20 @@ def api_request(*args, **kwargs): items_key="projects", page_token=page_token, max_results=max_results, + page_size=page_size, ) def list_datasets( self, - project=None, - include_all=False, - filter=None, - max_results=None, - page_token=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + project: str = None, + include_all: bool = False, + filter: str = None, + max_results: int = None, + page_token: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + page_size: int = None, + ) -> page_iterator.Iterator: """List datasets for the project associated with this client. See @@ -349,6 +396,8 @@ def list_datasets( timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. + page_size (Optional[int]): + Maximum number of datasets to return per page. Returns: google.api_core.page_iterator.Iterator: @@ -376,7 +425,7 @@ def api_request(*args, **kwargs): span_attributes=span_attributes, *args, timeout=timeout, - **kwargs + **kwargs, ) return page_iterator.HTTPIterator( @@ -388,9 +437,10 @@ def api_request(*args, **kwargs): page_token=page_token, max_results=max_results, extra_params=extra_params, + page_size=page_size, ) - def dataset(self, dataset_id, project=None): + def dataset(self, dataset_id: str, project: str = None) -> DatasetReference: """Deprecated: Construct a reference to a dataset. .. 
deprecated:: 1.24.0 @@ -428,18 +478,41 @@ def dataset(self, dataset_id, project=None): ) return DatasetReference(project, dataset_id) - def _create_bqstorage_client(self): + def _ensure_bqstorage_client( + self, + bqstorage_client: Optional[ + "google.cloud.bigquery_storage.BigQueryReadClient" + ] = None, + client_options: Optional[google.api_core.client_options.ClientOptions] = None, + client_info: Optional[ + "google.api_core.gapic_v1.client_info.ClientInfo" + ] = DEFAULT_BQSTORAGE_CLIENT_INFO, + ) -> Optional["google.cloud.bigquery_storage.BigQueryReadClient"]: """Create a BigQuery Storage API client using this client's credentials. - If a client cannot be created due to missing dependencies, raise a - warning and return ``None``. + If a client cannot be created due to a missing or outdated dependency + `google-cloud-bigquery-storage`, raise a warning and return ``None``. + + If the `bqstorage_client` argument is not ``None``, still perform the version + check and return the argument back to the caller if the check passes. If it + fails, raise a warning and return ``None``. + + Args: + bqstorage_client: + An existing BigQuery Storage client instance to check for version + compatibility. If ``None``, a new instance is created and returned. + client_options: + Custom options used with a new BigQuery Storage client instance if one + is created. + client_info: + The client info used with a new BigQuery Storage client instance if one + is created. Returns: - Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]: - A BigQuery Storage API client. + A BigQuery Storage API client. 
""" try: - from google.cloud import bigquery_storage_v1 + from google.cloud import bigquery_storage except ImportError: warnings.warn( "Cannot create BigQuery Storage client, the dependency " @@ -447,11 +520,44 @@ def _create_bqstorage_client(self): ) return None - return bigquery_storage_v1.BigQueryReadClient(credentials=self._credentials) + try: + BQ_STORAGE_VERSIONS.verify_version() + except LegacyBigQueryStorageError as exc: + warnings.warn(str(exc)) + return None + + if bqstorage_client is None: + bqstorage_client = bigquery_storage.BigQueryReadClient( + credentials=self._credentials, + client_options=client_options, + client_info=client_info, + ) + + return bqstorage_client + + def _dataset_from_arg(self, dataset): + if isinstance(dataset, str): + dataset = DatasetReference.from_string( + dataset, default_project=self.project + ) + + if not isinstance(dataset, (Dataset, DatasetReference)): + if isinstance(dataset, DatasetListItem): + dataset = dataset.reference + else: + raise TypeError( + "dataset must be a Dataset, DatasetReference, DatasetListItem," + " or string" + ) + return dataset def create_dataset( - self, dataset, exists_ok=False, retry=DEFAULT_RETRY, timeout=None - ): + self, + dataset: Union[str, Dataset, DatasetReference], + exists_ok: bool = False, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Dataset: """API call: create the dataset via a POST request. See @@ -461,6 +567,7 @@ def create_dataset( dataset (Union[ \ google.cloud.bigquery.dataset.Dataset, \ google.cloud.bigquery.dataset.DatasetReference, \ + google.cloud.bigquery.dataset.DatasetListItem, \ str, \ ]): A :class:`~google.cloud.bigquery.dataset.Dataset` to create. 
@@ -491,10 +598,7 @@ def create_dataset( >>> dataset = client.create_dataset(dataset) """ - if isinstance(dataset, str): - dataset = DatasetReference.from_string( - dataset, default_project=self.project - ) + dataset = self._dataset_from_arg(dataset) if isinstance(dataset, DatasetReference): dataset = Dataset(dataset) @@ -517,14 +621,18 @@ def create_dataset( timeout=timeout, ) return Dataset.from_api_repr(api_response) - except google.api_core.exceptions.Conflict: + except core_exceptions.Conflict: if not exists_ok: raise return self.get_dataset(dataset.reference, retry=retry) def create_routine( - self, routine, exists_ok=False, retry=DEFAULT_RETRY, timeout=None - ): + self, + routine: Routine, + exists_ok: bool = False, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Routine: """[Beta] Create a routine via a POST request. See @@ -568,12 +676,18 @@ def create_routine( timeout=timeout, ) return Routine.from_api_repr(api_response) - except google.api_core.exceptions.Conflict: + except core_exceptions.Conflict: if not exists_ok: raise return self.get_routine(routine.reference, retry=retry) - def create_table(self, table, exists_ok=False, retry=DEFAULT_RETRY, timeout=None): + def create_table( + self, + table: Union[str, Table, TableReference], + exists_ok: bool = False, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Table: """API call: create a table via a PUT request See @@ -622,7 +736,7 @@ def create_table(self, table, exists_ok=False, retry=DEFAULT_RETRY, timeout=None timeout=timeout, ) return Table.from_api_repr(api_response) - except google.api_core.exceptions.Conflict: + except core_exceptions.Conflict: if not exists_ok: raise return self.get_table(table.reference, retry=retry) @@ -630,7 +744,6 @@ def create_table(self, table, exists_ok=False, retry=DEFAULT_RETRY, timeout=None def _call_api( self, retry, span_name=None, span_attributes=None, job_ref=None, **kwargs ): - call = 
functools.partial(self._connection.api_request, **kwargs) if retry: call = retry(call) @@ -641,7 +754,12 @@ def _call_api( return call() return call() - def get_dataset(self, dataset_ref, retry=DEFAULT_RETRY, timeout=None): + def get_dataset( + self, + dataset_ref: Union[DatasetReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Dataset: """Fetch the dataset referenced by ``dataset_ref`` Args: @@ -680,8 +798,12 @@ def get_dataset(self, dataset_ref, retry=DEFAULT_RETRY, timeout=None): return Dataset.from_api_repr(api_response) def get_iam_policy( - self, table, requested_policy_version=1, retry=DEFAULT_RETRY, timeout=None, - ): + self, + table: Union[Table, TableReference], + requested_policy_version: int = 1, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Policy: if not isinstance(table, (Table, TableReference)): raise TypeError("table must be a Table or TableReference") @@ -705,8 +827,13 @@ def get_iam_policy( return Policy.from_api_repr(response) def set_iam_policy( - self, table, policy, updateMask=None, retry=DEFAULT_RETRY, timeout=None, - ): + self, + table: Union[Table, TableReference], + policy: Policy, + updateMask: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Policy: if not isinstance(table, (Table, TableReference)): raise TypeError("table must be a Table or TableReference") @@ -734,8 +861,12 @@ def set_iam_policy( return Policy.from_api_repr(response) def test_iam_permissions( - self, table, permissions, retry=DEFAULT_RETRY, timeout=None, - ): + self, + table: Union[Table, TableReference], + permissions: Sequence[str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Dict[str, Any]: if not isinstance(table, (Table, TableReference)): raise TypeError("table must be a Table or TableReference") @@ -755,7 +886,12 @@ def test_iam_permissions( return response - def get_model(self, model_ref, 
retry=DEFAULT_RETRY, timeout=None): + def get_model( + self, + model_ref: Union[ModelReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Model: """[Beta] Fetch the model referenced by ``model_ref``. Args: @@ -793,7 +929,12 @@ def get_model(self, model_ref, retry=DEFAULT_RETRY, timeout=None): ) return Model.from_api_repr(api_response) - def get_routine(self, routine_ref, retry=DEFAULT_RETRY, timeout=None): + def get_routine( + self, + routine_ref: Union[Routine, RoutineReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Routine: """[Beta] Get the routine referenced by ``routine_ref``. Args: @@ -832,7 +973,12 @@ def get_routine(self, routine_ref, retry=DEFAULT_RETRY, timeout=None): ) return Routine.from_api_repr(api_response) - def get_table(self, table, retry=DEFAULT_RETRY, timeout=None): + def get_table( + self, + table: Union[Table, TableReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Table: """Fetch the table referenced by ``table``. Args: @@ -868,7 +1014,13 @@ def get_table(self, table, retry=DEFAULT_RETRY, timeout=None): ) return Table.from_api_repr(api_response) - def update_dataset(self, dataset, fields, retry=DEFAULT_RETRY, timeout=None): + def update_dataset( + self, + dataset: Dataset, + fields: Sequence[str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Dataset: """Change some fields of a dataset. Use ``fields`` to specify which fields to update. At least one field @@ -886,7 +1038,22 @@ def update_dataset(self, dataset, fields, retry=DEFAULT_RETRY, timeout=None): dataset (google.cloud.bigquery.dataset.Dataset): The dataset to update. fields (Sequence[str]): - The properties of ``dataset`` to change (e.g. "friendly_name"). + The properties of ``dataset`` to change. These are strings + corresponding to the properties of + :class:`~google.cloud.bigquery.dataset.Dataset`. 
+ + For example, to update the default expiration times, specify + both properties in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_dataset( + dataset, + [ + "default_partition_expiration_ms", + "default_table_expiration_ms", + ] + ) retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. timeout (Optional[float]): @@ -917,7 +1084,13 @@ def update_dataset(self, dataset, fields, retry=DEFAULT_RETRY, timeout=None): ) return Dataset.from_api_repr(api_response) - def update_model(self, model, fields, retry=DEFAULT_RETRY, timeout=None): + def update_model( + self, + model: Model, + fields: Sequence[str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Model: """[Beta] Change some fields of a model. Use ``fields`` to specify which fields to update. At least one field @@ -933,8 +1106,18 @@ def update_model(self, model, fields, retry=DEFAULT_RETRY, timeout=None): Args: model (google.cloud.bigquery.model.Model): The model to update. fields (Sequence[str]): - The fields of ``model`` to change, spelled as the Model - properties (e.g. "friendly_name"). + The properties of ``model`` to change. These are strings + corresponding to the properties of + :class:`~google.cloud.bigquery.model.Model`. + + For example, to update the descriptive properties of the model, + specify them in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_model( + model, ["description", "friendly_name"] + ) retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. 
timeout (Optional[float]): @@ -965,7 +1148,13 @@ def update_model(self, model, fields, retry=DEFAULT_RETRY, timeout=None): ) return Model.from_api_repr(api_response) - def update_routine(self, routine, fields, retry=DEFAULT_RETRY, timeout=None): + def update_routine( + self, + routine: Routine, + fields: Sequence[str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Routine: """[Beta] Change some fields of a routine. Use ``fields`` to specify which fields to update. At least one field @@ -985,11 +1174,20 @@ def update_routine(self, routine, fields, retry=DEFAULT_RETRY, timeout=None): occurred since the read. Args: - routine (google.cloud.bigquery.routine.Routine): The routine to update. + routine (google.cloud.bigquery.routine.Routine): + The routine to update. fields (Sequence[str]): The fields of ``routine`` to change, spelled as the - :class:`~google.cloud.bigquery.routine.Routine` properties - (e.g. ``type_``). + :class:`~google.cloud.bigquery.routine.Routine` properties. + + For example, to update the description property of the routine, + specify it in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_routine( + routine, ["description"] + ) retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. timeout (Optional[float]): @@ -1024,7 +1222,13 @@ def update_routine(self, routine, fields, retry=DEFAULT_RETRY, timeout=None): ) return Routine.from_api_repr(api_response) - def update_table(self, table, fields, retry=DEFAULT_RETRY, timeout=None): + def update_table( + self, + table: Table, + fields: Sequence[str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Table: """Change some fields of a table. Use ``fields`` to specify which fields to update. At least one field @@ -1040,8 +1244,18 @@ def update_table(self, table, fields, retry=DEFAULT_RETRY, timeout=None): Args: table (google.cloud.bigquery.table.Table): The table to update. 
fields (Sequence[str]): - The fields of ``table`` to change, spelled as the Table - properties (e.g. "friendly_name"). + The fields of ``table`` to change, spelled as the + :class:`~google.cloud.bigquery.table.Table` properties. + + For example, to update the descriptive properties of the table, + specify them in the ``fields`` argument: + + .. code-block:: python + + bigquery_client.update_table( + table, + ["description", "friendly_name"] + ) retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. timeout (Optional[float]): @@ -1075,12 +1289,13 @@ def update_table(self, table, fields, retry=DEFAULT_RETRY, timeout=None): def list_models( self, - dataset, - max_results=None, - page_token=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + dataset: Union[Dataset, DatasetReference, str], + max_results: int = None, + page_token: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + page_size: int = None, + ) -> page_iterator.Iterator: """[Beta] List models in the dataset. See @@ -1090,6 +1305,7 @@ def list_models( dataset (Union[ \ google.cloud.bigquery.dataset.Dataset, \ google.cloud.bigquery.dataset.DatasetReference, \ + google.cloud.bigquery.dataset.DatasetListItem, \ str, \ ]): A reference to the dataset whose models to list from the @@ -1097,7 +1313,7 @@ def list_models( to create a dataset reference from a string using :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`. max_results (Optional[int]): - Maximum number of models to return. If not passed, defaults to a + Maximum number of models to return. Defaults to a value set by the API. page_token (Optional[str]): Token representing a cursor into the models. If not passed, @@ -1110,6 +1326,9 @@ def list_models( timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. + page_size (Optional[int]): + Maximum number of models to return per page. 
+ Defaults to a value set by the API. Returns: google.api_core.page_iterator.Iterator: @@ -1117,13 +1336,7 @@ def list_models( :class:`~google.cloud.bigquery.model.Model` contained within the requested dataset. """ - if isinstance(dataset, str): - dataset = DatasetReference.from_string( - dataset, default_project=self.project - ) - - if not isinstance(dataset, (Dataset, DatasetReference)): - raise TypeError("dataset must be a Dataset, DatasetReference, or string") + dataset = self._dataset_from_arg(dataset) path = "%s/models" % dataset.path span_attributes = {"path": path} @@ -1135,7 +1348,7 @@ def api_request(*args, **kwargs): span_attributes=span_attributes, *args, timeout=timeout, - **kwargs + **kwargs, ) result = page_iterator.HTTPIterator( @@ -1146,18 +1359,20 @@ def api_request(*args, **kwargs): items_key="models", page_token=page_token, max_results=max_results, + page_size=page_size, ) result.dataset = dataset return result def list_routines( self, - dataset, - max_results=None, - page_token=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + dataset: Union[Dataset, DatasetReference, str], + max_results: int = None, + page_token: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + page_size: int = None, + ) -> page_iterator.Iterator: """[Beta] List routines in the dataset. See @@ -1167,6 +1382,7 @@ def list_routines( dataset (Union[ \ google.cloud.bigquery.dataset.Dataset, \ google.cloud.bigquery.dataset.DatasetReference, \ + google.cloud.bigquery.dataset.DatasetListItem, \ str, \ ]): A reference to the dataset whose routines to list from the @@ -1174,7 +1390,7 @@ def list_routines( to create a dataset reference from a string using :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`. max_results (Optional[int]): - Maximum number of routines to return. If not passed, defaults + Maximum number of routines to return. Defaults to a value set by the API. 
page_token (Optional[str]): Token representing a cursor into the routines. If not passed, @@ -1187,6 +1403,9 @@ def list_routines( timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. + page_size (Optional[int]): + Maximum number of routines to return per page. + Defaults to a value set by the API. Returns: google.api_core.page_iterator.Iterator: @@ -1194,14 +1413,7 @@ def list_routines( :class:`~google.cloud.bigquery.routine.Routine`s contained within the requested dataset, limited by ``max_results``. """ - if isinstance(dataset, str): - dataset = DatasetReference.from_string( - dataset, default_project=self.project - ) - - if not isinstance(dataset, (Dataset, DatasetReference)): - raise TypeError("dataset must be a Dataset, DatasetReference, or string") - + dataset = self._dataset_from_arg(dataset) path = "{}/routines".format(dataset.path) span_attributes = {"path": path} @@ -1213,7 +1425,7 @@ def api_request(*args, **kwargs): span_attributes=span_attributes, *args, timeout=timeout, - **kwargs + **kwargs, ) result = page_iterator.HTTPIterator( @@ -1224,18 +1436,20 @@ def api_request(*args, **kwargs): items_key="routines", page_token=page_token, max_results=max_results, + page_size=page_size, ) result.dataset = dataset return result def list_tables( self, - dataset, - max_results=None, - page_token=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + dataset: Union[Dataset, DatasetReference, str], + max_results: int = None, + page_token: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + page_size: int = None, + ) -> page_iterator.Iterator: """List tables in the dataset. 
See @@ -1245,6 +1459,7 @@ def list_tables( dataset (Union[ \ google.cloud.bigquery.dataset.Dataset, \ google.cloud.bigquery.dataset.DatasetReference, \ + google.cloud.bigquery.dataset.DatasetListItem, \ str, \ ]): A reference to the dataset whose tables to list from the @@ -1252,7 +1467,7 @@ def list_tables( to create a dataset reference from a string using :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`. max_results (Optional[int]): - Maximum number of tables to return. If not passed, defaults + Maximum number of tables to return. Defaults to a value set by the API. page_token (Optional[str]): Token representing a cursor into the tables. If not passed, @@ -1265,6 +1480,9 @@ def list_tables( timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. + page_size (Optional[int]): + Maximum number of tables to return per page. + Defaults to a value set by the API. Returns: google.api_core.page_iterator.Iterator: @@ -1272,14 +1490,7 @@ def list_tables( :class:`~google.cloud.bigquery.table.TableListItem` contained within the requested dataset. 
""" - if isinstance(dataset, str): - dataset = DatasetReference.from_string( - dataset, default_project=self.project - ) - - if not isinstance(dataset, (Dataset, DatasetReference)): - raise TypeError("dataset must be a Dataset, DatasetReference, or string") - + dataset = self._dataset_from_arg(dataset) path = "%s/tables" % dataset.path span_attributes = {"path": path} @@ -1290,7 +1501,7 @@ def api_request(*args, **kwargs): span_attributes=span_attributes, *args, timeout=timeout, - **kwargs + **kwargs, ) result = page_iterator.HTTPIterator( @@ -1301,18 +1512,19 @@ def api_request(*args, **kwargs): items_key="tables", page_token=page_token, max_results=max_results, + page_size=page_size, ) result.dataset = dataset return result def delete_dataset( self, - dataset, - delete_contents=False, - retry=DEFAULT_RETRY, - timeout=None, - not_found_ok=False, - ): + dataset: Union[Dataset, DatasetReference, str], + delete_contents: bool = False, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + not_found_ok: bool = False, + ) -> None: """Delete a dataset. See @@ -1322,6 +1534,7 @@ def delete_dataset( dataset (Union[ \ google.cloud.bigquery.dataset.Dataset, \ google.cloud.bigquery.dataset.DatasetReference, \ + google.cloud.bigquery.dataset.DatasetListItem, \ str, \ ]): A reference to the dataset to delete. If a string is passed @@ -1341,14 +1554,7 @@ def delete_dataset( Defaults to ``False``. If ``True``, ignore "not found" errors when deleting the dataset. 
""" - if isinstance(dataset, str): - dataset = DatasetReference.from_string( - dataset, default_project=self.project - ) - - if not isinstance(dataset, (Dataset, DatasetReference)): - raise TypeError("dataset must be a Dataset or a DatasetReference") - + dataset = self._dataset_from_arg(dataset) params = {} path = dataset.path if delete_contents: @@ -1367,13 +1573,17 @@ def delete_dataset( query_params=params, timeout=timeout, ) - except google.api_core.exceptions.NotFound: + except core_exceptions.NotFound: if not not_found_ok: raise def delete_model( - self, model, retry=DEFAULT_RETRY, timeout=None, not_found_ok=False - ): + self, + model: Union[Model, ModelReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + not_found_ok: bool = False, + ) -> None: """[Beta] Delete a model See @@ -1415,13 +1625,82 @@ def delete_model( path=path, timeout=timeout, ) + except core_exceptions.NotFound: + if not not_found_ok: + raise + + def delete_job_metadata( + self, + job_id: Union[str, LoadJob, CopyJob, ExtractJob, QueryJob], + project: Optional[str] = None, + location: Optional[str] = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + not_found_ok: bool = False, + ): + """[Beta] Delete job metadata from job history. + + Note: This does not stop a running job. Use + :func:`~google.cloud.bigquery.client.Client.cancel_job` instead. + + Args: + job_id: Job or job identifier. + + Keyword Arguments: + project: + ID of the project which owns the job (defaults to the client's project). + location: + Location where the job was run. Ignored if ``job_id`` is a job + object. + retry: + How to retry the RPC. + timeout: + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + not_found_ok: + Defaults to ``False``. If ``True``, ignore "not found" errors + when deleting the job. 
+ """ + extra_params = {} + + project, location, job_id = _extract_job_reference( + job_id, project=project, location=location + ) + + if project is None: + project = self.project + + if location is None: + location = self.location + + # Location is always required for jobs.delete() + extra_params["location"] = location + + path = f"/projects/{project}/jobs/{job_id}/delete" + + span_attributes = {"path": path, "job_id": job_id, "location": location} + + try: + self._call_api( + retry, + span_name="BigQuery.deleteJob", + span_attributes=span_attributes, + method="DELETE", + path=path, + query_params=extra_params, + timeout=timeout, + ) except google.api_core.exceptions.NotFound: if not not_found_ok: raise def delete_routine( - self, routine, retry=DEFAULT_RETRY, timeout=None, not_found_ok=False - ): + self, + routine: Union[Routine, RoutineReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + not_found_ok: bool = False, + ) -> None: """[Beta] Delete a routine. 
See @@ -1465,13 +1744,17 @@ def delete_routine( path=path, timeout=timeout, ) - except google.api_core.exceptions.NotFound: + except core_exceptions.NotFound: if not not_found_ok: raise def delete_table( - self, table, retry=DEFAULT_RETRY, timeout=None, not_found_ok=False - ): + self, + table: Union[Table, TableReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + not_found_ok: bool = False, + ) -> None: """Delete a table See @@ -1511,13 +1794,19 @@ def delete_table( path=path, timeout=timeout, ) - except google.api_core.exceptions.NotFound: + except core_exceptions.NotFound: if not not_found_ok: raise def _get_query_results( - self, job_id, retry, project=None, timeout_ms=None, location=None, timeout=None - ): + self, + job_id: str, + retry: retries.Retry, + project: str = None, + timeout_ms: int = None, + location: str = None, + timeout: float = DEFAULT_TIMEOUT, + ) -> _QueryResults: """Get the query results object for a query job. Args: @@ -1532,7 +1821,9 @@ def _get_query_results( location (Optional[str]): Location of the query job. timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport - before using ``retry``. + before using ``retry``. If set, this connection timeout may be + increased to a minimum value. This prevents retries on what + would otherwise be a successful response. Returns: google.cloud.bigquery.query._QueryResults: @@ -1541,6 +1832,9 @@ def _get_query_results( extra_params = {"maxResults": 0} + if timeout is not None: + timeout = max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT) + if project is None: project = self.project @@ -1570,7 +1864,7 @@ def _get_query_results( ) return _QueryResults.from_api_repr(resource) - def job_from_resource(self, resource): + def job_from_resource(self, resource: dict) -> job.UnknownJob: """Detect correct job type from resource and instantiate. 
Args: @@ -1596,7 +1890,12 @@ def job_from_resource(self, resource): return job.QueryJob.from_api_repr(resource, self) return job.UnknownJob.from_api_repr(resource, self) - def create_job(self, job_config, retry=DEFAULT_RETRY): + def create_job( + self, + job_config: dict, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob]: """Create a new job. Args: job_config (dict): configuration job representation returned from the API. @@ -1604,6 +1903,9 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): Keyword Arguments: retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: Union[ \ @@ -1621,40 +1923,53 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): ) destination = _get_sub_prop(job_config, ["load", "destinationTable"]) source_uris = _get_sub_prop(job_config, ["load", "sourceUris"]) + destination = TableReference.from_api_repr(destination) return self.load_table_from_uri( - source_uris, destination, job_config=load_job_config, retry=retry + source_uris, + destination, + job_config=load_job_config, + retry=retry, + timeout=timeout, ) elif "copy" in job_config: copy_job_config = google.cloud.bigquery.job.CopyJobConfig.from_api_repr( job_config ) destination = _get_sub_prop(job_config, ["copy", "destinationTable"]) + destination = TableReference.from_api_repr(destination) sources = [] source_configs = _get_sub_prop(job_config, ["copy", "sourceTables"]) - if source_configs is None: source_configs = [_get_sub_prop(job_config, ["copy", "sourceTable"])] for source_config in source_configs: table_ref = TableReference.from_api_repr(source_config) sources.append(table_ref) return self.copy_table( - sources, destination, job_config=copy_job_config, retry=retry + sources, + destination, + job_config=copy_job_config, + retry=retry, + 
timeout=timeout, ) elif "extract" in job_config: extract_job_config = google.cloud.bigquery.job.ExtractJobConfig.from_api_repr( job_config ) source = _get_sub_prop(job_config, ["extract", "sourceTable"]) - source_type = "Table" - if not source: + if source: + source_type = "Table" + source = TableReference.from_api_repr(source) + else: source = _get_sub_prop(job_config, ["extract", "sourceModel"]) source_type = "Model" + source = ModelReference.from_api_repr(source) destination_uris = _get_sub_prop(job_config, ["extract", "destinationUris"]) return self.extract_table( source, destination_uris, job_config=extract_job_config, retry=retry, + timeout=timeout, source_type=source_type, ) elif "query" in job_config: @@ -1664,25 +1979,40 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): copy_config ) query = _get_sub_prop(copy_config, ["query", "query"]) - return self.query(query, job_config=query_job_config, retry=retry) + return self.query( + query, job_config=query_job_config, retry=retry, timeout=timeout + ) else: raise TypeError("Invalid job configuration received.") def get_job( - self, job_id, project=None, location=None, retry=DEFAULT_RETRY, timeout=None - ): + self, + job_id: str, + project: str = None, + location: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob]: """Fetch a job for the project associated with this client. See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get Args: - job_id (str): Unique job identifier. + job_id (Union[ \ + str, \ + google.cloud.bigquery.job.LoadJob, \ + google.cloud.bigquery.job.CopyJob, \ + google.cloud.bigquery.job.ExtractJob, \ + google.cloud.bigquery.job.QueryJob \ + ]): Job identifier. Keyword Arguments: project (Optional[str]): ID of the project which owns the job (defaults to the client's project). - location (Optional[str]): Location where the job was run. 
+ location (Optional[str]): + Location where the job was run. Ignored if ``job_id`` is a job + object. retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. timeout (Optional[float]): @@ -1700,6 +2030,10 @@ def get_job( """ extra_params = {"projection": "full"} + project, location, job_id = _extract_job_reference( + job_id, project=project, location=location + ) + if project is None: project = self.project @@ -1726,20 +2060,33 @@ def get_job( return self.job_from_resource(resource) def cancel_job( - self, job_id, project=None, location=None, retry=DEFAULT_RETRY, timeout=None - ): + self, + job_id: str, + project: str = None, + location: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob]: """Attempt to cancel a job from a job ID. See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/cancel Args: - job_id (str): Unique job identifier. + job_id (Union[ \ + str, \ + google.cloud.bigquery.job.LoadJob, \ + google.cloud.bigquery.job.CopyJob, \ + google.cloud.bigquery.job.ExtractJob, \ + google.cloud.bigquery.job.QueryJob \ + ]): Job identifier. Keyword Arguments: project (Optional[str]): ID of the project which owns the job (defaults to the client's project). - location (Optional[str]): Location where the job was run. + location (Optional[str]): + Location where the job was run. Ignored if ``job_id`` is a job + object. retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. 
timeout (Optional[float]): @@ -1757,6 +2104,10 @@ def cancel_job( """ extra_params = {"projection": "full"} + project, location, job_id = _extract_job_reference( + job_id, project=project, location=location + ) + if project is None: project = self.project @@ -1784,17 +2135,18 @@ def cancel_job( def list_jobs( self, - project=None, - parent_job=None, - max_results=None, - page_token=None, - all_users=None, - state_filter=None, - retry=DEFAULT_RETRY, - timeout=None, - min_creation_time=None, - max_creation_time=None, - ): + project: str = None, + parent_job: Optional[Union[QueryJob, str]] = None, + max_results: int = None, + page_token: str = None, + all_users: bool = None, + state_filter: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + min_creation_time: datetime.datetime = None, + max_creation_time: datetime.datetime = None, + page_size: int = None, + ) -> page_iterator.Iterator: """List jobs for the project associated with this client. See @@ -1839,13 +2191,15 @@ def list_jobs( Max value for job creation time. If set, only jobs created before or at this timestamp are returned. If the datetime has no time zone assumes UTC time. + page_size (Optional[int]): + Maximum number of jobs to return per page. Returns: google.api_core.page_iterator.Iterator: Iterable of job instances. 
""" if isinstance(parent_job, job._AsyncJob): - parent_job = parent_job.job_id + parent_job = parent_job.job_id # pytype: disable=attribute-error extra_params = { "allUsers": all_users, @@ -1878,7 +2232,7 @@ def api_request(*args, **kwargs): span_attributes=span_attributes, *args, timeout=timeout, - **kwargs + **kwargs, ) return page_iterator.HTTPIterator( @@ -1890,20 +2244,21 @@ def api_request(*args, **kwargs): page_token=page_token, max_results=max_results, extra_params=extra_params, + page_size=page_size, ) def load_table_from_uri( self, - source_uris, - destination, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - job_config=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + source_uris: Union[str, Sequence[str]], + destination: Union[Table, TableReference, str], + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + job_config: LoadJobConfig = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> job.LoadJob: """Starts a job for loading data into a table from CloudStorage. 
See @@ -1960,7 +2315,7 @@ def load_table_from_uri( job_ref = job._JobReference(job_id, project=project, location=location) - if isinstance(source_uris, six.string_types): + if isinstance(source_uris, str): source_uris = [source_uris] destination = _table_arg_to_table_ref(destination, default_project=self.project) @@ -1976,17 +2331,18 @@ def load_table_from_uri( def load_table_from_file( self, - file_obj, - destination, - rewind=False, - size=None, - num_retries=_DEFAULT_NUM_RETRIES, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - job_config=None, - ): + file_obj: BinaryIO, + destination: Union[Table, TableReference, str], + rewind: bool = False, + size: int = None, + num_retries: int = _DEFAULT_NUM_RETRIES, + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + job_config: LoadJobConfig = None, + timeout: float = DEFAULT_TIMEOUT, + ) -> job.LoadJob: """Upload the contents of this table from a file-like object. Similar to :meth:`load_table_from_uri`, this method creates, starts and @@ -2025,6 +2381,9 @@ def load_table_from_file( to the client's project. job_config (Optional[google.cloud.bigquery.job.LoadJobConfig]): Extra configuration options for the job. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: google.cloud.bigquery.job.LoadJob: A new load job. 
@@ -2063,11 +2422,11 @@ def load_table_from_file( try: if size is None or size >= _MAX_MULTIPART_SIZE: response = self._do_resumable_upload( - file_obj, job_resource, num_retries + file_obj, job_resource, num_retries, timeout, project=project ) else: response = self._do_multipart_upload( - file_obj, job_resource, size, num_retries + file_obj, job_resource, size, num_retries, timeout, project=project ) except resumable_media.InvalidResponse as exc: raise exceptions.from_http_response(exc.response) @@ -2077,15 +2436,16 @@ def load_table_from_file( def load_table_from_dataframe( self, dataframe, - destination, - num_retries=_DEFAULT_NUM_RETRIES, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - job_config=None, - parquet_compression="snappy", - ): + destination: Union[Table, TableReference, str], + num_retries: int = _DEFAULT_NUM_RETRIES, + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + job_config: LoadJobConfig = None, + parquet_compression: str = "snappy", + timeout: float = DEFAULT_TIMEOUT, + ) -> job.LoadJob: """Upload the contents of a table from a pandas DataFrame. Similar to :meth:`load_table_from_uri`, this method creates, starts and @@ -2093,9 +2453,12 @@ def load_table_from_dataframe( .. note:: - Due to the way REPEATED fields are encoded in the ``parquet`` file - format, a mismatch with the existing table schema can occur, and - 100% compatibility cannot be guaranteed for REPEATED fields. + REPEATED fields are NOT supported when using the CSV source format. + They are supported when using the PARQUET source format, but + due to the way they are encoded in the ``parquet`` file, + a mismatch with the existing table schema can occur, so + 100% compatibility cannot be guaranteed for REPEATED fields when + using the parquet format. https://github.com/googleapis/python-bigquery/issues/17 @@ -2135,29 +2498,38 @@ def load_table_from_dataframe( column names matching those of the dataframe. 
The BigQuery schema is used to determine the correct data type conversion. Indexes are not loaded. Requires the :mod:`pyarrow` library. + + By default, this method uses the parquet source format. To + override this, supply a value for + :attr:`~google.cloud.bigquery.job.LoadJobConfig.source_format` + with the format name. Currently only + :attr:`~google.cloud.bigquery.job.SourceFormat.CSV` and + :attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are + supported. parquet_compression (Optional[str]): [Beta] The compression method to use if intermittently serializing ``dataframe`` to a parquet file. - If ``pyarrow`` and job config schema are used, the argument - is directly passed as the ``compression`` argument to the - underlying ``pyarrow.parquet.write_table()`` method (the - default value "snappy" gets converted to uppercase). + The argument is directly passed as the ``compression`` + argument to the underlying ``pyarrow.parquet.write_table()`` + method (the default value "snappy" gets converted to uppercase). https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table - If either ``pyarrow`` or job config schema are missing, the - argument is directly passed as the ``compression`` argument - to the underlying ``DataFrame.to_parquet()`` method. + If the job config schema is missing, the argument is directly + passed as the ``compression`` argument to the underlying + ``DataFrame.to_parquet()`` method. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: google.cloud.bigquery.job.LoadJob: A new load job. Raises: - ImportError: + ValueError: If a usable parquet engine cannot be found. This method - requires :mod:`pyarrow` or :mod:`fastparquet` to be - installed. + requires :mod:`pyarrow` to be installed. 
TypeError: If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.LoadJobConfig` class. @@ -2174,15 +2546,20 @@ def load_table_from_dataframe( else: job_config = job.LoadJobConfig() - if job_config.source_format: - if job_config.source_format != job.SourceFormat.PARQUET: - raise ValueError( - "Got unexpected source_format: '{}'. Currently, only PARQUET is supported".format( - job_config.source_format - ) - ) - else: + supported_formats = {job.SourceFormat.CSV, job.SourceFormat.PARQUET} + if job_config.source_format is None: + # default value job_config.source_format = job.SourceFormat.PARQUET + if job_config.source_format not in supported_formats: + raise ValueError( + "Got unexpected source_format: '{}'. Currently, only PARQUET and CSV are supported".format( + job_config.source_format + ) + ) + + if pyarrow is None and job_config.source_format == job.SourceFormat.PARQUET: + # pyarrow is now the only supported parquet engine. + raise ValueError("This method requires pyarrow to be installed") if location is None: location = self.location @@ -2196,16 +2573,25 @@ def load_table_from_dataframe( ): try: table = self.get_table(destination) - except google.api_core.exceptions.NotFound: + except core_exceptions.NotFound: table = None else: columns_and_indexes = frozenset( name for name, _ in _pandas_helpers.list_columns_and_indexes(dataframe) ) - # schema fields not present in the dataframe are not needed job_config.schema = [ - field for field in table.schema if field.name in columns_and_indexes + # Field description and policy tags are not needed to + # serialize a data frame. 
+ SchemaField( + field.name, + field.field_type, + mode=field.mode, + fields=field.fields, + ) + # schema fields not present in the dataframe are not needed + for field in table.schema + if field.name in columns_and_indexes ] job_config.schema = _pandas_helpers.dataframe_to_bq_schema( @@ -2222,52 +2608,62 @@ def load_table_from_dataframe( stacklevel=2, ) - tmpfd, tmppath = tempfile.mkstemp(suffix="_job_{}.parquet".format(job_id[:8])) + tmpfd, tmppath = tempfile.mkstemp( + suffix="_job_{}.{}".format(job_id[:8], job_config.source_format.lower()) + ) os.close(tmpfd) try: - if pyarrow and job_config.schema: - if parquet_compression == "snappy": # adjust the default value - parquet_compression = parquet_compression.upper() - _pandas_helpers.dataframe_to_parquet( - dataframe, - job_config.schema, - tmppath, - parquet_compression=parquet_compression, - ) - else: - if not pyarrow: - warnings.warn( - "Loading dataframe data without pyarrow installed is " - "deprecated and will become unsupported in the future. " - "Please install the pyarrow package.", - PyarrowMissingWarning, - stacklevel=2, + if job_config.source_format == job.SourceFormat.PARQUET: + if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS: + msg = ( + "Loading dataframe data in PARQUET format with pyarrow " + f"{_PYARROW_VERSION} can result in data corruption. It is " + "therefore *strongly* advised to use a different pyarrow " + "version or a different source format. " + "See: https://github.com/googleapis/python-bigquery/issues/781" ) + warnings.warn(msg, category=RuntimeWarning) if job_config.schema: - warnings.warn( - "job_config.schema is set, but not used to assist in " - "identifying correct types for data serialization. 
" - "Please install the pyarrow package.", - PendingDeprecationWarning, - stacklevel=2, + if parquet_compression == "snappy": # adjust the default value + parquet_compression = parquet_compression.upper() + + _pandas_helpers.dataframe_to_parquet( + dataframe, + job_config.schema, + tmppath, + parquet_compression=parquet_compression, ) + else: + dataframe.to_parquet(tmppath, compression=parquet_compression) - dataframe.to_parquet(tmppath, compression=parquet_compression) + else: - with open(tmppath, "rb") as parquet_file: + dataframe.to_csv( + tmppath, + index=False, + header=False, + encoding="utf-8", + float_format="%.17g", + date_format="%Y-%m-%d %H:%M:%S.%f", + ) + + with open(tmppath, "rb") as tmpfile: + file_size = os.path.getsize(tmppath) return self.load_table_from_file( - parquet_file, + tmpfile, destination, num_retries=num_retries, rewind=True, + size=file_size, job_id=job_id, job_id_prefix=job_id_prefix, location=location, project=project, job_config=job_config, + timeout=timeout, ) finally: @@ -2275,15 +2671,16 @@ def load_table_from_dataframe( def load_table_from_json( self, - json_rows, - destination, - num_retries=_DEFAULT_NUM_RETRIES, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - job_config=None, - ): + json_rows: Iterable[Dict[str, Any]], + destination: Union[Table, TableReference, str], + num_retries: int = _DEFAULT_NUM_RETRIES, + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + job_config: LoadJobConfig = None, + timeout: float = DEFAULT_TIMEOUT, + ) -> job.LoadJob: """Upload the contents of a table from a JSON string or dict. Args: @@ -2332,6 +2729,9 @@ def load_table_from_json( Extra configuration options for the job. The ``source_format`` setting is always set to :attr:`~google.cloud.bigquery.job.SourceFormat.NEWLINE_DELIMITED_JSON`. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. 
Returns: google.cloud.bigquery.job.LoadJob: A new load job. @@ -2363,21 +2763,25 @@ def load_table_from_json( destination = _table_arg_to_table_ref(destination, default_project=self.project) - data_str = u"\n".join(json.dumps(item) for item in json_rows) - data_file = io.BytesIO(data_str.encode()) - + data_str = "\n".join(json.dumps(item, ensure_ascii=False) for item in json_rows) + encoded_str = data_str.encode() + data_file = io.BytesIO(encoded_str) return self.load_table_from_file( data_file, destination, + size=len(encoded_str), num_retries=num_retries, job_id=job_id, job_id_prefix=job_id_prefix, location=location, project=project, job_config=job_config, + timeout=timeout, ) - def _do_resumable_upload(self, stream, metadata, num_retries): + def _do_resumable_upload( + self, stream, metadata, num_retries, timeout, project=None + ): """Perform a resumable upload. Args: @@ -2389,13 +2793,21 @@ def _do_resumable_upload(self, stream, metadata, num_retries): Number of upload retries. (Deprecated: This argument will be removed in a future release.) + timeout (float): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + + project (Optional[str]): + Project ID of the project of where to run the upload. Defaults + to the client's project. + Returns: requests.Response: The "200 OK" response object returned after the final chunk is uploaded. """ upload, transport = self._initiate_resumable_upload( - stream, metadata, num_retries + stream, metadata, num_retries, timeout, project=project ) while not upload.finished: @@ -2403,7 +2815,9 @@ def _do_resumable_upload(self, stream, metadata, num_retries): return response - def _initiate_resumable_upload(self, stream, metadata, num_retries): + def _initiate_resumable_upload( + self, stream, metadata, num_retries, timeout, project=None + ): """Initiate a resumable upload. 
Args: @@ -2415,6 +2829,14 @@ def _initiate_resumable_upload(self, stream, metadata, num_retries): Number of upload retries. (Deprecated: This argument will be removed in a future release.) + timeout (float): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + + project (Optional[str]): + Project ID of the project of where to run the upload. Defaults + to the client's project. + Returns: Tuple: Pair of @@ -2426,7 +2848,19 @@ def _initiate_resumable_upload(self, stream, metadata, num_retries): chunk_size = _DEFAULT_CHUNKSIZE transport = self._http headers = _get_upload_headers(self._connection.user_agent) - upload_url = _RESUMABLE_URL_TEMPLATE.format(project=self.project) + + if project is None: + project = self.project + # TODO: Increase the minimum version of google-cloud-core to 1.6.0 + # and remove this logic. See: + # https://github.com/googleapis/python-bigquery/issues/509 + hostname = ( + self._connection.API_BASE_URL + if not hasattr(self._connection, "get_api_base_url_for_mtls") + else self._connection.get_api_base_url_for_mtls() + ) + upload_url = _RESUMABLE_URL_TEMPLATE.format(host=hostname, project=project) + # TODO: modify ResumableUpload to take a retry.Retry object # that it can use for the initial RPC. upload = ResumableUpload(upload_url, chunk_size, headers=headers) @@ -2437,12 +2871,19 @@ def _initiate_resumable_upload(self, stream, metadata, num_retries): ) upload.initiate( - transport, stream, metadata, _GENERIC_CONTENT_TYPE, stream_final=False + transport, + stream, + metadata, + _GENERIC_CONTENT_TYPE, + stream_final=False, + timeout=timeout, ) return upload, transport - def _do_multipart_upload(self, stream, metadata, size, num_retries): + def _do_multipart_upload( + self, stream, metadata, size, num_retries, timeout, project=None + ): """Perform a multipart upload. Args: @@ -2459,6 +2900,14 @@ def _do_multipart_upload(self, stream, metadata, size, num_retries): Number of upload retries. 
(Deprecated: This argument will be removed in a future release.) + timeout (float): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + + project (Optional[str]): + Project ID of the project of where to run the upload. Defaults + to the client's project. + Returns: requests.Response: The "200 OK" response object returned after the multipart @@ -2476,7 +2925,18 @@ def _do_multipart_upload(self, stream, metadata, size, num_retries): headers = _get_upload_headers(self._connection.user_agent) - upload_url = _MULTIPART_URL_TEMPLATE.format(project=self.project) + if project is None: + project = self.project + + # TODO: Increase the minimum version of google-cloud-core to 1.6.0 + # and remove this logic. See: + # https://github.com/googleapis/python-bigquery/issues/509 + hostname = ( + self._connection.API_BASE_URL + if not hasattr(self._connection, "get_api_base_url_for_mtls") + else self._connection.get_api_base_url_for_mtls() + ) + upload_url = _MULTIPART_URL_TEMPLATE.format(host=hostname, project=project) upload = MultipartUpload(upload_url, headers=headers) if num_retries is not None: @@ -2484,22 +2944,26 @@ def _do_multipart_upload(self, stream, metadata, size, num_retries): max_retries=num_retries ) - response = upload.transmit(self._http, data, metadata, _GENERIC_CONTENT_TYPE) + response = upload.transmit( + self._http, data, metadata, _GENERIC_CONTENT_TYPE, timeout=timeout + ) return response def copy_table( self, - sources, - destination, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - job_config=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + sources: Union[ + Table, TableReference, str, Sequence[Union[Table, TableReference, str]] + ], + destination: Union[Table, TableReference, str], + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + job_config: CopyJobConfig = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + 
) -> job.CopyJob: """Copy one or more tables to another table. See @@ -2593,17 +3057,17 @@ def copy_table( def extract_table( self, - source, - destination_uris, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - job_config=None, - retry=DEFAULT_RETRY, - timeout=None, - source_type="Table", - ): + source: Union[Table, TableReference, Model, ModelReference, str], + destination_uris: Union[str, Sequence[str]], + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + job_config: ExtractJobConfig = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + source_type: str = "Table", + ) -> job.ExtractJob: """Start a job to extract a table into Cloud Storage files. See @@ -2674,7 +3138,7 @@ def extract_table( ) ) - if isinstance(destination_uris, six.string_types): + if isinstance(destination_uris, str): destination_uris = [destination_uris] if job_config: @@ -2692,15 +3156,16 @@ def extract_table( def query( self, - query, - job_config=None, - job_id=None, - job_id_prefix=None, - location=None, - project=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + query: str, + job_config: QueryJobConfig = None, + job_id: str = None, + job_id_prefix: str = None, + location: str = None, + project: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + job_retry: retries.Retry = DEFAULT_JOB_RETRY, + ) -> job.QueryJob: """Run a SQL query. See @@ -2729,20 +3194,52 @@ def query( Project ID of the project of where to run the job. Defaults to the client's project. retry (Optional[google.api_core.retry.Retry]): - How to retry the RPC. + How to retry the RPC. This only applies to making RPC + calls. It isn't used to retry failed jobs. This has + a reasonable default that should only be overridden + with care. timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. 
+ job_retry (Optional[google.api_core.retry.Retry]): + How to retry failed jobs. The default retries + rate-limit-exceeded errors. Passing ``None`` disables + job retry. + + Not all jobs can be retried. If ``job_id`` is + provided, then the job returned by the query will not + be retryable, and an exception will be raised if a + non-``None`` (and non-default) value for ``job_retry`` + is also provided. + + Note that errors aren't detected until ``result()`` is + called on the job returned. The ``job_retry`` + specified here becomes the default ``job_retry`` for + ``result()``, where it can also be specified. Returns: google.cloud.bigquery.job.QueryJob: A new query job instance. Raises: TypeError: - If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.QueryJobConfig` - class. + If ``job_config`` is not an instance of + :class:`~google.cloud.bigquery.job.QueryJobConfig` + class, or if both ``job_id`` and non-``None`` non-default + ``job_retry`` are provided. """ - job_id = _make_job_id(job_id, job_id_prefix) + job_id_given = job_id is not None + if ( + job_id_given + and job_retry is not None + and job_retry is not DEFAULT_JOB_RETRY + ): + raise TypeError( + "`job_retry` was provided, but the returned job is" + " not retryable, because a custom `job_id` was" + " provided." 
+ ) + + job_id_save = job_id if project is None: project = self.project @@ -2750,8 +3247,6 @@ def query( if location is None: location = self.location - job_config = copy.deepcopy(job_config) - if self._default_query_job_config: if job_config: _verify_job_config_type( @@ -2761,6 +3256,8 @@ def query( # that is in the default, # should be filled in with the default # the incoming therefore has precedence + # + # Note that _fill_from_default doesn't mutate the receiver job_config = job_config._fill_from_default( self._default_query_job_config ) @@ -2769,15 +3266,62 @@ def query( self._default_query_job_config, google.cloud.bigquery.job.QueryJobConfig, ) - job_config = copy.deepcopy(self._default_query_job_config) + job_config = self._default_query_job_config - job_ref = job._JobReference(job_id, project=project, location=location) - query_job = job.QueryJob(job_ref, query, client=self, job_config=job_config) - query_job._begin(retry=retry, timeout=timeout) + # Note that we haven't modified the original job_config (or + # _default_query_job_config) up to this point. + job_config_save = job_config + + def do_query(): + # Make a copy now, so that original doesn't get changed by the process + # below and to facilitate retry + job_config = copy.deepcopy(job_config_save) + + job_id = _make_job_id(job_id_save, job_id_prefix) + job_ref = job._JobReference(job_id, project=project, location=location) + query_job = job.QueryJob(job_ref, query, client=self, job_config=job_config) + + try: + query_job._begin(retry=retry, timeout=timeout) + except core_exceptions.Conflict as create_exc: + # The thought is if someone is providing their own job IDs and they get + # their job ID generation wrong, this could end up returning results for + # the wrong query. We thus only try to recover if job ID was not given. 
+ if job_id_given: + raise create_exc + + try: + query_job = self.get_job( + job_id, + project=project, + location=location, + retry=retry, + timeout=timeout, + ) + except core_exceptions.GoogleAPIError: # (includes RetryError) + raise create_exc + else: + return query_job + else: + return query_job + + future = do_query() + # The future might be in a failed state now, but if it's + # unrecoverable, we'll find out when we ask for it's result, at which + # point, we may retry. + if not job_id_given: + future._retry_do_query = do_query # in case we have to retry later + future._job_retry = job_retry - return query_job + return future - def insert_rows(self, table, rows, selected_fields=None, **kwargs): + def insert_rows( + self, + table: Union[Table, TableReference, str], + rows: Union[Iterable[Tuple], Iterable[Dict]], + selected_fields: Sequence[SchemaField] = None, + **kwargs: dict, + ) -> Sequence[dict]: """Insert rows into a table via the streaming API. See @@ -2800,7 +3344,7 @@ def insert_rows(self, table, rows, selected_fields=None, **kwargs): selected_fields (Sequence[google.cloud.bigquery.schema.SchemaField]): The fields to return. Required if ``table`` is a :class:`~google.cloud.bigquery.table.TableReference`. - kwargs (Dict): + kwargs (dict): Keyword arguments to :meth:`~google.cloud.bigquery.client.Client.insert_rows_json`. @@ -2840,8 +3384,13 @@ def insert_rows(self, table, rows, selected_fields=None, **kwargs): return self.insert_rows_json(table, json_rows, **kwargs) def insert_rows_from_dataframe( - self, table, dataframe, selected_fields=None, chunk_size=500, **kwargs - ): + self, + table: Union[Table, TableReference, str], + dataframe, + selected_fields: Sequence[SchemaField] = None, + chunk_size: int = 500, + **kwargs: Dict, + ) -> Sequence[Sequence[dict]]: """Insert rows into a table from a dataframe via the streaming API. 
Args: @@ -2889,15 +3438,15 @@ def insert_rows_from_dataframe( def insert_rows_json( self, - table, - json_rows, - row_ids=None, - skip_invalid_rows=None, - ignore_unknown_values=None, - template_suffix=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + table: Union[Table, TableReference, str], + json_rows: Sequence[Dict], + row_ids: Union[Iterable[str], AutoRowIDs, None] = AutoRowIDs.GENERATE_UUID, + skip_invalid_rows: bool = None, + ignore_unknown_values: bool = None, + template_suffix: str = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Sequence[dict]: """Insert rows into a table without applying local type conversions. See @@ -2913,11 +3462,20 @@ def insert_rows_json( json_rows (Sequence[Dict]): Row data to be inserted. Keys must match the table schema fields and values must be JSON-compatible representations. - row_ids (Optional[Sequence[Optional[str]]]): + row_ids (Union[Iterable[str], AutoRowIDs, None]): Unique IDs, one per row being inserted. An ID can also be ``None``, indicating that an explicit insert ID should **not** be used for that row. If the argument is omitted altogether, unique IDs are created automatically. + + .. versionchanged:: 2.21.0 + Can also be an iterable, not just a sequence, or an + :class:`AutoRowIDs` enum member. + + .. deprecated:: 2.21.0 + Passing ``None`` to explicitly request autogenerating insert IDs is + deprecated, use :attr:`AutoRowIDs.GENERATE_UUID` instead. + skip_invalid_rows (Optional[bool]): Insert all valid rows of a request, even if invalid rows exist. The default value is ``False``, which causes the entire request @@ -2957,12 +3515,37 @@ def insert_rows_json( rows_info = [] data = {"rows": rows_info} - for index, row in enumerate(json_rows): + if row_ids is None: + warnings.warn( + "Passing None for row_ids is deprecated. 
To explicitly request " + "autogenerated insert IDs, use AutoRowIDs.GENERATE_UUID instead", + category=DeprecationWarning, + ) + row_ids = AutoRowIDs.GENERATE_UUID + + if not isinstance(row_ids, AutoRowIDs): + try: + row_ids_iter = iter(row_ids) + except TypeError: + msg = "row_ids is neither an iterable nor an AutoRowIDs enum member" + raise TypeError(msg) + + for i, row in enumerate(json_rows): info = {"json": row} - if row_ids is not None: - info["insertId"] = row_ids[index] - else: + + if row_ids is AutoRowIDs.GENERATE_UUID: info["insertId"] = str(uuid.uuid4()) + elif row_ids is AutoRowIDs.DISABLED: + info["insertId"] = None + else: + try: + insert_id = next(row_ids_iter) + except StopIteration: + msg = f"row_ids did not generate enough IDs, error at index {i}" + raise ValueError(msg) + else: + info["insertId"] = insert_id + rows_info.append(info) if skip_invalid_rows is not None: @@ -2993,7 +3576,12 @@ def insert_rows_json( return errors - def list_partitions(self, table, retry=DEFAULT_RETRY, timeout=None): + def list_partitions( + self, + table: Union[Table, TableReference, str], + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> Sequence[str]: """List the partitions in a table. Args: @@ -3035,15 +3623,15 @@ def list_partitions(self, table, retry=DEFAULT_RETRY, timeout=None): def list_rows( self, - table, - selected_fields=None, - max_results=None, - page_token=None, - start_index=None, - page_size=None, - retry=DEFAULT_RETRY, - timeout=None, - ): + table: Union[Table, TableListItem, TableReference, str], + selected_fields: Sequence[SchemaField] = None, + max_results: int = None, + page_token: str = None, + start_index: int = None, + page_size: int = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> RowIterator: """List the rows of the table. 
See @@ -3124,6 +3712,7 @@ def list_rows( if start_index is not None: params["startIndex"] = start_index + params["formatOptions.useInt64Timestamp"] = True row_iterator = RowIterator( client=self, api_request=functools.partial(self._call_api, retry, timeout=timeout), @@ -3137,6 +3726,89 @@ def list_rows( # Pass in selected_fields separately from schema so that full # tables can be fetched without a column filter. selected_fields=selected_fields, + total_rows=getattr(table, "num_rows", None), + ) + return row_iterator + + def _list_rows_from_query_results( + self, + job_id: str, + location: str, + project: str, + schema: SchemaField, + total_rows: int = None, + destination: Union[Table, TableReference, TableListItem, str] = None, + max_results: int = None, + start_index: int = None, + page_size: int = None, + retry: retries.Retry = DEFAULT_RETRY, + timeout: float = DEFAULT_TIMEOUT, + ) -> RowIterator: + """List the rows of a completed query. + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/getQueryResults + Args: + job_id (str): + ID of a query job. + location (str): Location of the query job. + project (str): + ID of the project where the query job was run. + schema (Sequence[google.cloud.bigquery.schema.SchemaField]): + The fields expected in these query results. Used to convert + from JSON to expected Python types. + total_rows (Optional[int]): + Total number of rows in the query results. + destination (Optional[Union[ \ + google.cloud.bigquery.table.Table, \ + google.cloud.bigquery.table.TableListItem, \ + google.cloud.bigquery.table.TableReference, \ + str, \ + ]]): + Destination table reference. Used to fetch the query results + with the BigQuery Storage API. + max_results (Optional[int]): + Maximum number of rows to return across the whole iterator. + start_index (Optional[int]): + The zero-based index of the starting row to read. + page_size (Optional[int]): + The maximum number of rows in each page of results from this request. 
+ Non-positive values are ignored. Defaults to a sensible value set by the API. + retry (Optional[google.api_core.retry.Retry]): + How to retry the RPC. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. If set, this connection timeout may be + increased to a minimum value. This prevents retries on what + would otherwise be a successful response. + If multiple requests are made under the hood, ``timeout`` + applies to each individual request. + Returns: + google.cloud.bigquery.table.RowIterator: + Iterator of row data + :class:`~google.cloud.bigquery.table.Row`-s. + """ + params = { + "fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS, + "location": location, + } + + if timeout is not None: + timeout = max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT) + + if start_index is not None: + params["startIndex"] = start_index + + params["formatOptions.useInt64Timestamp"] = True + row_iterator = RowIterator( + client=self, + api_request=functools.partial(self._call_api, retry, timeout=timeout), + path=f"/projects/{project}/queries/{job_id}", + schema=schema, + max_results=max_results, + page_size=page_size, + table=destination, + extra_params=params, + total_rows=total_rows, ) return row_iterator @@ -3156,7 +3828,7 @@ def _schema_to_json_file_object(self, schema_list, file_obj): """ json.dump(schema_list, file_obj, indent=2, sort_keys=True) - def schema_from_json(self, file_or_path): + def schema_from_json(self, file_or_path: Union[str, BinaryIO]): """Takes a file object or file path that contains json that describes a table schema. @@ -3169,7 +3841,9 @@ def schema_from_json(self, file_or_path): with open(file_or_path) as file_obj: return self._schema_from_json_file_object(file_obj) - def schema_to_json(self, schema_list, destination): + def schema_to_json( + self, schema_list: Sequence[SchemaField], destination: Union[str, BinaryIO] + ): """Takes a list of schema field objects. 
Serializes the list of schema field objects as json to a file. @@ -3184,6 +3858,12 @@ def schema_to_json(self, schema_list, destination): with open(destination, mode="w") as file_obj: return self._schema_to_json_file_object(json_schema_list, file_obj) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + # pylint: disable=unused-argument def _item_to_project(iterator, resource): @@ -3273,6 +3953,37 @@ def _item_to_table(iterator, resource): return TableListItem(resource) +def _extract_job_reference(job, project=None, location=None): + """Extract fully-qualified job reference from a job-like object. + + Args: + job_id (Union[ \ + str, \ + google.cloud.bigquery.job.LoadJob, \ + google.cloud.bigquery.job.CopyJob, \ + google.cloud.bigquery.job.ExtractJob, \ + google.cloud.bigquery.job.QueryJob \ + ]): Job identifier. + project (Optional[str]): + Project where the job was run. Ignored if ``job_id`` is a job + object. + location (Optional[str]): + Location where the job was run. Ignored if ``job_id`` is a job + object. + + Returns: + Tuple[str, str, str]: ``(project, location, job_id)`` + """ + if hasattr(job, "job_id"): + project = job.project + job_id = job.job_id + location = job.location + else: + job_id = job + + return (project, location, job_id) + + def _make_job_id(job_id, prefix=None): """Construct an ID for a new job. 
@@ -3306,7 +4017,7 @@ def _check_mode(stream): mode = getattr(stream, "mode", None) if isinstance(stream, gzip.GzipFile): - if mode != gzip.READ: + if mode != gzip.READ: # pytype: disable=module-attr raise ValueError( "Cannot upload gzip files opened in write mode: use " "gzip.GzipFile(filename, mode='rb')" diff --git a/google/cloud/bigquery/dataset.py b/google/cloud/bigquery/dataset.py index 9a80f30b5..21e56f305 100644 --- a/google/cloud/bigquery/dataset.py +++ b/google/cloud/bigquery/dataset.py @@ -16,7 +16,6 @@ from __future__ import absolute_import -import six import copy import google.cloud._helpers @@ -79,8 +78,9 @@ class AccessEntry(object): """Represents grant of an access role to an entity. An entry must have exactly one of the allowed :attr:`ENTITY_TYPES`. If - anything but ``view`` is set, a ``role`` is also required. ``role`` is - omitted for a ``view``, because ``view`` s are always read-only. + anything but ``view`` or ``routine`` are set, a ``role`` is also required. + ``role`` is omitted for ``view`` and ``routine``, because they are always + read-only. See https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets. @@ -88,17 +88,17 @@ class AccessEntry(object): role (str): Role granted to the entity. The following string values are supported: `'READER'`, `'WRITER'`, `'OWNER'`. It may also be - :data:`None` if the ``entity_type`` is ``view``. + :data:`None` if the ``entity_type`` is ``view`` or ``routine``. entity_type (str): Type of entity being granted the role. One of :attr:`ENTITY_TYPES`. entity_id (Union[str, Dict[str, str]]): - If the ``entity_type`` is not 'view', the ``entity_id`` is the - ``str`` ID of the entity being granted the role. If the - ``entity_type`` is 'view', the ``entity_id`` is a ``dict`` - representing the view from a different dataset to grant access to - in the following format:: + If the ``entity_type`` is not 'view' or 'routine', the ``entity_id`` + is the ``str`` ID of the entity being granted the role. 
If the + ``entity_type`` is 'view' or 'routine', the ``entity_id`` is a ``dict`` + representing the view or routine from a different dataset to grant + access to in the following format for views:: { 'projectId': string, @@ -106,11 +106,19 @@ class AccessEntry(object): 'tableId': string } + For routines:: + + { + 'projectId': string, + 'datasetId': string, + 'routineId': string + } + Raises: ValueError: If the ``entity_type`` is not among :attr:`ENTITY_TYPES`, or if a - ``view`` has ``role`` set, or a non ``view`` **does not** have a - ``role`` set. + ``view`` or a ``routine`` has ``role`` set, or a non ``view`` and + non ``routine`` **does not** have a ``role`` set. Examples: >>> entry = AccessEntry('OWNER', 'userByEmail', 'user@example.com') @@ -124,7 +132,15 @@ class AccessEntry(object): """ ENTITY_TYPES = frozenset( - ["userByEmail", "groupByEmail", "domain", "specialGroup", "view", "iamMember"] + [ + "userByEmail", + "groupByEmail", + "domain", + "specialGroup", + "view", + "iamMember", + "routine", + ] ) """Allowed entity types.""" @@ -135,10 +151,11 @@ def __init__(self, role, entity_type, entity_id): ", ".join(self.ENTITY_TYPES), ) raise ValueError(message) - if entity_type == "view": + if entity_type in ("view", "routine"): if role is not None: raise ValueError( - "Role must be None for a view. Received " "role: %r" % (role,) + "Role must be None for a %r. 
Received " + "role: %r" % (entity_type, role) ) else: if role is None: @@ -203,7 +220,7 @@ def to_api_repr(self): return resource @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "AccessEntry": """Factory: construct an access entry given its API representation Args: @@ -242,9 +259,9 @@ class DatasetReference(object): """ def __init__(self, project, dataset_id): - if not isinstance(project, six.string_types): + if not isinstance(project, str): raise ValueError("Pass a string for project") - if not isinstance(dataset_id, six.string_types): + if not isinstance(dataset_id, str): raise ValueError("Pass a string for dataset_id") self._project = project self._dataset_id = dataset_id @@ -271,7 +288,7 @@ def path(self): routine = _get_routine_reference @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "DatasetReference": """Factory: construct a dataset reference given its API representation Args: @@ -287,7 +304,9 @@ def from_api_repr(cls, resource): return cls(project, dataset_id) @classmethod - def from_string(cls, dataset_id, default_project=None): + def from_string( + cls, dataset_id: str, default_project: str = None + ) -> "DatasetReference": """Construct a dataset reference from dataset ID string. Args: @@ -333,7 +352,7 @@ def from_string(cls, dataset_id, default_project=None): return cls(output_project_id, output_dataset_id) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this dataset reference Returns: @@ -389,7 +408,7 @@ class Dataset(object): } def __init__(self, dataset_ref): - if isinstance(dataset_ref, six.string_types): + if isinstance(dataset_ref, str): dataset_ref = DatasetReference.from_string(dataset_ref) self._properties = {"datasetReference": dataset_ref.to_api_repr(), "labels": {}} @@ -409,7 +428,7 @@ def access_entries(self): entries. 
``role`` augments the entity type and must be present **unless** the - entity type is ``view``. + entity type is ``view`` or ``routine``. Raises: TypeError: If 'value' is not a sequence @@ -526,7 +545,7 @@ def default_table_expiration_ms(self): @default_table_expiration_ms.setter def default_table_expiration_ms(self, value): - if not isinstance(value, six.integer_types) and value is not None: + if not isinstance(value, int) and value is not None: raise ValueError("Pass an integer, or None") self._properties["defaultTableExpirationMs"] = _helpers._str_or_none(value) @@ -542,7 +561,7 @@ def description(self): @description.setter def description(self, value): - if not isinstance(value, six.string_types) and value is not None: + if not isinstance(value, str) and value is not None: raise ValueError("Pass a string, or None") self._properties["description"] = value @@ -558,7 +577,7 @@ def friendly_name(self): @friendly_name.setter def friendly_name(self, value): - if not isinstance(value, six.string_types) and value is not None: + if not isinstance(value, str) and value is not None: raise ValueError("Pass a string, or None") self._properties["friendlyName"] = value @@ -574,7 +593,7 @@ def location(self): @location.setter def location(self, value): - if not isinstance(value, six.string_types) and value is not None: + if not isinstance(value, str) and value is not None: raise ValueError("Pass a string, or None") self._properties["location"] = value @@ -623,7 +642,7 @@ def default_encryption_configuration(self, value): self._properties["defaultEncryptionConfiguration"] = api_repr @classmethod - def from_string(cls, full_dataset_id): + def from_string(cls, full_dataset_id: str) -> "Dataset": """Construct a dataset from fully-qualified dataset ID. 
Args: @@ -647,7 +666,7 @@ def from_string(cls, full_dataset_id): return cls(DatasetReference.from_string(full_dataset_id)) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "Dataset": """Factory: construct a dataset given its API representation Args: @@ -672,7 +691,7 @@ def from_api_repr(cls, resource): dataset._properties = copy.deepcopy(resource) return dataset - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this dataset Returns: diff --git a/google/cloud/bigquery/dbapi/_helpers.py b/google/cloud/bigquery/dbapi/_helpers.py index 1bcf45f31..9c134b47c 100644 --- a/google/cloud/bigquery/dbapi/_helpers.py +++ b/google/cloud/bigquery/dbapi/_helpers.py @@ -12,24 +12,60 @@ # See the License for the specific language governing permissions and # limitations under the License. -try: - from collections import abc as collections_abc -except ImportError: # Python 2.7 - import collections as collections_abc +from collections import abc as collections_abc import datetime import decimal import functools import numbers - -import six +import re +import typing from google.cloud import bigquery -from google.cloud.bigquery import table +from google.cloud.bigquery import table, enums, query from google.cloud.bigquery.dbapi import exceptions -def scalar_to_query_parameter(value, name=None): +_NUMERIC_SERVER_MIN = decimal.Decimal("-9.9999999999999999999999999999999999999E+28") +_NUMERIC_SERVER_MAX = decimal.Decimal("9.9999999999999999999999999999999999999E+28") + +type_parameters_re = re.compile( + r""" + \( + \s*[0-9]+\s* + (, + \s*[0-9]+\s* + )* + \) + """, + re.VERBOSE, +) + + +def _parameter_type(name, value, query_parameter_type=None, value_doc=""): + if query_parameter_type: + # Strip type parameters + query_parameter_type = type_parameters_re.sub("", query_parameter_type) + try: + parameter_type = getattr( + enums.SqlParameterScalarTypes, query_parameter_type.upper() + )._type + 
except AttributeError: + raise exceptions.ProgrammingError( + f"The given parameter type, {query_parameter_type}," + f" for {name} is not a valid BigQuery scalar type." + ) + else: + parameter_type = bigquery_scalar_type(value) + if parameter_type is None: + raise exceptions.ProgrammingError( + f"Encountered parameter {name} with " + f"{value_doc} value {value} of unexpected type." + ) + return parameter_type + + +def scalar_to_query_parameter(value, name=None, query_parameter_type=None): """Convert a scalar value into a query parameter. Args: @@ -38,6 +74,7 @@ def scalar_to_query_parameter(value, name=None): name (str): (Optional) Name of the query parameter. + query_parameter_type (Optional[str]): Given type for the parameter. Returns: google.cloud.bigquery.ScalarQueryParameter: @@ -48,24 +85,19 @@ def scalar_to_query_parameter(value, name=None): google.cloud.bigquery.dbapi.exceptions.ProgrammingError: if the type cannot be determined. """ - parameter_type = bigquery_scalar_type(value) - - if parameter_type is None: - raise exceptions.ProgrammingError( - "encountered parameter {} with value {} of unexpected type".format( - name, value - ) - ) - return bigquery.ScalarQueryParameter(name, parameter_type, value) + return bigquery.ScalarQueryParameter( + name, _parameter_type(name, value, query_parameter_type), value + ) -def array_to_query_parameter(value, name=None): +def array_to_query_parameter(value, name=None, query_parameter_type=None): """Convert an array-like value into a query parameter. Args: value (Sequence[Any]): The elements of the array (should not be a string-like Sequence). name (Optional[str]): Name of the query parameter. + query_parameter_type (Optional[str]): Given type for the parameter. 
Returns: A query parameter corresponding with the type and value of the plain @@ -81,81 +113,267 @@ def array_to_query_parameter(value, name=None): "not string-like.".format(name) ) - if not value: + if query_parameter_type or value: + array_type = _parameter_type( + name, + value[0] if value else None, + query_parameter_type, + value_doc="array element ", + ) + else: raise exceptions.ProgrammingError( "Encountered an empty array-like value of parameter {}, cannot " "determine array elements type.".format(name) ) - # Assume that all elements are of the same type, and let the backend handle - # any type incompatibilities among the array elements - array_type = bigquery_scalar_type(value[0]) - if array_type is None: - raise exceptions.ProgrammingError( - "Encountered unexpected first array element of parameter {}, " - "cannot determine array elements type.".format(name) + return bigquery.ArrayQueryParameter(name, array_type, value) + + +def _parse_struct_fields( + fields, + base, + parse_struct_field=re.compile( + r""" + (?:(\w+)\s+) # field name + ([A-Z0-9<> ,()]+) # Field type + $""", + re.VERBOSE | re.IGNORECASE, + ).match, +): + # Split a string of struct fields. They're defined by commas, but + # we have to avoid splitting on commas internal to fields. For + # example: + # name string, children array> + # + # only has 2 top-level fields. 
+ fields = fields.split(",") + fields = list(reversed(fields)) # in the off chance that there are very many + while fields: + field = fields.pop() + while fields and field.count("<") != field.count(">"): + field += "," + fields.pop() + + m = parse_struct_field(field.strip()) + if not m: + raise exceptions.ProgrammingError( + f"Invalid struct field, {field}, in {base}" + ) + yield m.group(1, 2) + + +SCALAR, ARRAY, STRUCT = "sar" + + +def _parse_type( + type_, + name, + base, + complex_query_parameter_parse=re.compile( + r""" + \s* + (ARRAY|STRUCT|RECORD) # Type + \s* + <([A-Z0-9<> ,()]+)> # Subtype(s) + \s*$ + """, + re.IGNORECASE | re.VERBOSE, + ).match, +): + if "<" not in type_: + # Scalar + + # Strip type parameters + type_ = type_parameters_re.sub("", type_).strip() + try: + type_ = getattr(enums.SqlParameterScalarTypes, type_.upper()) + except AttributeError: + raise exceptions.ProgrammingError( + f"The given parameter type, {type_}," + f"{' for ' + name if name else ''}" + f" is not a valid BigQuery scalar type, in {base}." + ) + if name: + type_ = type_.with_name(name) + return SCALAR, type_ + + m = complex_query_parameter_parse(type_) + if not m: + raise exceptions.ProgrammingError(f"Invalid parameter type, {type_}") + tname, sub = m.group(1, 2) + if tname.upper() == "ARRAY": + sub_type = complex_query_parameter_type(None, sub, base) + if isinstance(sub_type, query.ArrayQueryParameterType): + raise exceptions.ProgrammingError(f"Array can't contain an array in {base}") + sub_type._complex__src = sub + return ARRAY, sub_type + else: + return STRUCT, _parse_struct_fields(sub, base) + + +def complex_query_parameter_type(name: typing.Optional[str], type_: str, base: str): + """Construct a parameter type (`StructQueryParameterType`) for a complex type + + or a non-complex type that's part of a complex type. + + Examples: + + array> + + struct>> + + This is used for computing array types. 
+ """ + + type_type, sub_type = _parse_type(type_, name, base) + if type_type == SCALAR: + type_ = sub_type + elif type_type == ARRAY: + type_ = query.ArrayQueryParameterType(sub_type, name=name) + elif type_type == STRUCT: + fields = [ + complex_query_parameter_type(field_name, field_type, base) + for field_name, field_type in sub_type + ] + type_ = query.StructQueryParameterType(*fields, name=name) + else: # pragma: NO COVER + raise AssertionError("Bad type_type", type_type) # Can't happen :) + + return type_ + + +def complex_query_parameter( + name: typing.Optional[str], value, type_: str, base: typing.Optional[str] = None +): + """ + Construct a query parameter for a complex type (array or struct record) + + or for a subtype, which may not be complex + + Examples: + + array> + + struct>> + + """ + base = base or type_ + + type_type, sub_type = _parse_type(type_, name, base) + + if type_type == SCALAR: + param = query.ScalarQueryParameter(name, sub_type._type, value) + elif type_type == ARRAY: + if not array_like(value): + raise exceptions.ProgrammingError( + f"Array type with non-array-like value" + f" with type {type(value).__name__}" + ) + param = query.ArrayQueryParameter( + name, + sub_type, + value + if isinstance(sub_type, query.ScalarQueryParameterType) + else [ + complex_query_parameter(None, v, sub_type._complex__src, base) + for v in value + ], ) + elif type_type == STRUCT: + if not isinstance(value, collections_abc.Mapping): + raise exceptions.ProgrammingError(f"Non-mapping value for type {type_}") + value_keys = set(value) + fields = [] + for field_name, field_type in sub_type: + if field_name not in value: + raise exceptions.ProgrammingError( + f"No field value for {field_name} in {type_}" + ) + value_keys.remove(field_name) + fields.append( + complex_query_parameter(field_name, value[field_name], field_type, base) + ) + if value_keys: + raise exceptions.ProgrammingError(f"Extra data keys for {type_}") - return bigquery.ArrayQueryParameter(name, 
array_type, value) + param = query.StructQueryParameter(name, *fields) + else: # pragma: NO COVER + raise AssertionError("Bad type_type", type_type) # Can't happen :) + return param + + +def _dispatch_parameter(type_, value, name=None): + if type_ is not None and "<" in type_: + param = complex_query_parameter(name, value, type_) + elif isinstance(value, collections_abc.Mapping): + raise NotImplementedError( + f"STRUCT-like parameter values are not supported" + f"{' (parameter ' + name + ')' if name else ''}," + f" unless an explicit type is give in the parameter placeholder" + f" (e.g. '%({name if name else ''}:struct<...>)s')." + ) + elif array_like(value): + param = array_to_query_parameter(value, name, type_) + else: + param = scalar_to_query_parameter(value, name, type_) -def to_query_parameters_list(parameters): + return param + + +def to_query_parameters_list(parameters, parameter_types): """Converts a sequence of parameter values into query parameters. Args: parameters (Sequence[Any]): Sequence of query parameter values. + parameter_types: + A list of parameter types, one for each parameter. + Unknown types are provided as None. Returns: List[google.cloud.bigquery.query._AbstractQueryParameter]: A list of query parameters. """ - result = [] - - for value in parameters: - if isinstance(value, collections_abc.Mapping): - raise NotImplementedError("STRUCT-like parameter values are not supported.") - elif array_like(value): - param = array_to_query_parameter(value) - else: - param = scalar_to_query_parameter(value) - result.append(param) + return [ + _dispatch_parameter(type_, value) + for value, type_ in zip(parameters, parameter_types) + ] - return result - -def to_query_parameters_dict(parameters): +def to_query_parameters_dict(parameters, query_parameter_types): """Converts a dictionary of parameter values into query parameters. Args: parameters (Mapping[str, Any]): Dictionary of query parameter values. + parameter_types: + A dictionary of parameter types. 
It needn't have a key for each + parameter. Returns: List[google.cloud.bigquery.query._AbstractQueryParameter]: A list of named query parameters. """ - result = [] - - for name, value in six.iteritems(parameters): - if isinstance(value, collections_abc.Mapping): - raise NotImplementedError( - "STRUCT-like parameter values are not supported " - "(parameter {}).".format(name) - ) - elif array_like(value): - param = array_to_query_parameter(value, name=name) - else: - param = scalar_to_query_parameter(value, name=name) - result.append(param) - - return result + return [ + _dispatch_parameter(query_parameter_types.get(name), value, name) + for name, value in parameters.items() + ] -def to_query_parameters(parameters): +def to_query_parameters(parameters, parameter_types): """Converts DB-API parameter values into query parameters. Args: parameters (Union[Mapping[str, Any], Sequence[Any]]): A dictionary or sequence of query parameter values. + parameter_types (Union[Mapping[str, str], Sequence[str]]): + A dictionary or list of parameter types. + + If parameters is a mapping, then this must be a dictionary + of parameter types. It needn't have a key for each + parameter. + + If parameters is a sequence, then this must be a list of + parameter types, one for each paramater. Unknown types + are provided as None. 
Returns: List[google.cloud.bigquery.query._AbstractQueryParameter]: @@ -165,9 +383,9 @@ def to_query_parameters(parameters): return [] if isinstance(parameters, collections_abc.Mapping): - return to_query_parameters_dict(parameters) - - return to_query_parameters_list(parameters) + return to_query_parameters_dict(parameters, parameter_types) + else: + return to_query_parameters_list(parameters, parameter_types) def bigquery_scalar_type(value): @@ -189,10 +407,23 @@ def bigquery_scalar_type(value): elif isinstance(value, numbers.Real): return "FLOAT64" elif isinstance(value, decimal.Decimal): - return "NUMERIC" - elif isinstance(value, six.text_type): + vtuple = value.as_tuple() + # NUMERIC values have precision of 38 (number of digits) and scale of 9 (number + # of fractional digits), and their max absolute value must be strictly smaller + # than 1.0E+29. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + if ( + len(vtuple.digits) <= 38 # max precision: 38 + and vtuple.exponent >= -9 # max scale: 9 + and _NUMERIC_SERVER_MIN <= value <= _NUMERIC_SERVER_MAX + ): + return "NUMERIC" + else: + return "BIGNUMERIC" + + elif isinstance(value, str): return "STRING" - elif isinstance(value, six.binary_type): + elif isinstance(value, bytes): return "BYTES" elif isinstance(value, datetime.datetime): return "DATETIME" if value.tzinfo is None else "TIMESTAMP" @@ -218,7 +449,7 @@ def array_like(value): bool: ``True`` if the value is considered array-like, ``False`` otherwise. """ return isinstance(value, collections_abc.Sequence) and not isinstance( - value, (six.text_type, six.binary_type, bytearray) + value, (str, bytes, bytearray) ) @@ -264,7 +495,7 @@ def decorate_public_methods(klass): """Apply ``_raise_on_closed()`` decorator to public instance methods. 
""" for name in dir(klass): - if name.startswith("_"): + if name.startswith("_") and name != "__iter__": continue member = getattr(klass, name) diff --git a/google/cloud/bigquery/dbapi/connection.py b/google/cloud/bigquery/dbapi/connection.py index 464b0fd06..66dee7dfb 100644 --- a/google/cloud/bigquery/dbapi/connection.py +++ b/google/cloud/bigquery/dbapi/connection.py @@ -47,12 +47,14 @@ def __init__(self, client=None, bqstorage_client=None): else: self._owns_client = False + # A warning is already raised by the BQ Storage client factory factory if + # instantiation fails, or if the given BQ Storage client instance is outdated. if bqstorage_client is None: - # A warning is already raised by the factory if instantiation fails. - bqstorage_client = client._create_bqstorage_client() + bqstorage_client = client._ensure_bqstorage_client() self._owns_bqstorage_client = bqstorage_client is not None else: self._owns_bqstorage_client = False + bqstorage_client = client._ensure_bqstorage_client(bqstorage_client) self._client = client self._bqstorage_client = bqstorage_client @@ -73,10 +75,11 @@ def close(self): if self._owns_bqstorage_client: # There is no close() on the BQ Storage client itself. 
- self._bqstorage_client.transport.channel.close() + self._bqstorage_client._transport.grpc_channel.close() for cursor_ in self._cursors_created: - cursor_.close() + if not cursor_._closed: + cursor_.close() def commit(self): """No-op, but for consistency raise an error if connection is closed.""" diff --git a/google/cloud/bigquery/dbapi/cursor.py b/google/cloud/bigquery/dbapi/cursor.py index 7a10637f0..587598d5f 100644 --- a/google/cloud/bigquery/dbapi/cursor.py +++ b/google/cloud/bigquery/dbapi/cursor.py @@ -15,17 +15,18 @@ """Cursor for the Google BigQuery DB-API.""" import collections +from collections import abc as collections_abc import copy -import warnings - -try: - from collections import abc as collections_abc -except ImportError: # Python 2.7 - import collections as collections_abc - import logging +import re -import six +try: + from google.cloud.bigquery_storage import ArrowSerializationOptions +except ImportError: + _ARROW_COMPRESSION_SUPPORT = False +else: + # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too. + _ARROW_COMPRESSION_SUPPORT = True from google.cloud.bigquery import job from google.cloud.bigquery.dbapi import _helpers @@ -161,6 +162,14 @@ def execute(self, operation, parameters=None, job_id=None, job_config=None): job_config (google.cloud.bigquery.job.QueryJobConfig): (Optional) Extra configuration options for the query job. """ + formatted_operation, parameter_types = _format_operation(operation, parameters) + self._execute( + formatted_operation, parameters, job_id, job_config, parameter_types + ) + + def _execute( + self, formatted_operation, parameters, job_id, job_config, parameter_types + ): self._query_data = None self._query_job = None client = self.connection._client @@ -169,8 +178,7 @@ def execute(self, operation, parameters=None, job_id=None, job_config=None): # query parameters was not one of the standard options. 
Convert both # the query and the parameters to the format expected by the client # libraries. - formatted_operation = _format_operation(operation, parameters=parameters) - query_parameters = _helpers.to_query_parameters(parameters) + query_parameters = _helpers.to_query_parameters(parameters, parameter_types) if client._default_query_job_config: if job_config: @@ -209,8 +217,23 @@ def executemany(self, operation, seq_of_parameters): seq_of_parameters (Union[Sequence[Mapping[str, Any], Sequence[Any]]]): Sequence of many sets of parameter values. """ - for parameters in seq_of_parameters: - self.execute(operation, parameters) + if seq_of_parameters: + rowcount = 0 + # There's no reason to format the line more than once, as + # the operation only barely depends on the parameters. So + # we just use the first set of parameters. If there are + # different numbers or types of parameters, we'll error + # anyway. + formatted_operation, parameter_types = _format_operation( + operation, seq_of_parameters[0] + ) + for parameters in seq_of_parameters: + self._execute( + formatted_operation, parameters, None, None, parameter_types + ) + rowcount += self.rowcount + + self.rowcount = rowcount def _try_fetch(self, size=None): """Try to start fetching data, if not yet started. 
@@ -226,16 +249,7 @@ def _try_fetch(self, size=None): self._query_data = iter([]) return - is_dml = ( - self._query_job.statement_type - and self._query_job.statement_type.upper() != "SELECT" - ) - if is_dml: - self._query_data = iter([]) - return - if self._query_data is None: - client = self.connection._client bqstorage_client = self.connection._bqstorage_client if bqstorage_client is not None: @@ -243,11 +257,7 @@ def _try_fetch(self, size=None): self._query_data = _helpers.to_bq_table_rows(rows_iterable) return - rows_iter = client.list_rows( - self._query_job.destination, - selected_fields=self._query_job._query_results.schema, - page_size=self.arraysize, - ) + rows_iter = self._query_job.result(page_size=self.arraysize) self._query_data = iter(rows_iter) def _bqstorage_fetch(self, bqstorage_client): @@ -267,54 +277,33 @@ def _bqstorage_fetch(self, bqstorage_client): A sequence of rows, represented as dictionaries. """ # Hitting this code path with a BQ Storage client instance implies that - # bigquery_storage_v1* can indeed be imported here without errors. - from google.cloud import bigquery_storage_v1 - from google.cloud import bigquery_storage_v1beta1 + # bigquery_storage can indeed be imported here without errors. + from google.cloud import bigquery_storage table_reference = self._query_job.destination - is_v1beta1_client = isinstance( - bqstorage_client, bigquery_storage_v1beta1.BigQueryStorageClient + requested_session = bigquery_storage.types.ReadSession( + table=table_reference.to_bqstorage(), + data_format=bigquery_storage.types.DataFormat.ARROW, ) - # We want to preserve compatibility with the v1beta1 BQ Storage clients, - # thus adjust the session creation if needed. 
- if is_v1beta1_client: - warnings.warn( - "Support for BigQuery Storage v1beta1 clients is deprecated, please " - "consider upgrading the client to BigQuery Storage v1 stable version.", - category=DeprecationWarning, - ) - read_session = bqstorage_client.create_read_session( - table_reference.to_bqstorage(v1beta1=True), - "projects/{}".format(table_reference.project), - # a single stream only, as DB API is not well-suited for multithreading - requested_streams=1, - format_=bigquery_storage_v1beta1.enums.DataFormat.ARROW, - ) - else: - requested_session = bigquery_storage_v1.types.ReadSession( - table=table_reference.to_bqstorage(), - data_format=bigquery_storage_v1.enums.DataFormat.ARROW, - ) - read_session = bqstorage_client.create_read_session( - parent="projects/{}".format(table_reference.project), - read_session=requested_session, - # a single stream only, as DB API is not well-suited for multithreading - max_stream_count=1, + if _ARROW_COMPRESSION_SUPPORT: + requested_session.read_options.arrow_serialization_options.buffer_compression = ( + ArrowSerializationOptions.CompressionCodec.LZ4_FRAME ) + read_session = bqstorage_client.create_read_session( + parent="projects/{}".format(table_reference.project), + read_session=requested_session, + # a single stream only, as DB API is not well-suited for multithreading + max_stream_count=1, + ) + if not read_session.streams: return iter([]) # empty table, nothing to read - if is_v1beta1_client: - read_position = bigquery_storage_v1beta1.types.StreamPosition( - stream=read_session.streams[0], - ) - read_rows_stream = bqstorage_client.read_rows(read_position) - else: - stream_name = read_session.streams[0].name - read_rows_stream = bqstorage_client.read_rows(stream_name) + stream_name = read_session.streams[0].name + read_rows_stream = bqstorage_client.read_rows(stream_name) rows_iterable = read_rows_stream.rows(read_session) return rows_iterable @@ -335,7 +324,7 @@ def fetchone(self): """ self._try_fetch() try: - return 
six.next(self._query_data) + return next(self._query_data) except StopIteration: return None @@ -399,6 +388,10 @@ def setinputsizes(self, sizes): def setoutputsize(self, size, column=None): """No-op, but for consistency raise an error if cursor is closed.""" + def __iter__(self): + self._try_fetch() + return iter(self._query_data) + def _format_operation_list(operation, parameters): """Formats parameters in operation in the way BigQuery expects. @@ -423,7 +416,7 @@ def _format_operation_list(operation, parameters): try: return operation % tuple(formatted_params) - except TypeError as exc: + except (TypeError, ValueError) as exc: raise exceptions.ProgrammingError(exc) @@ -453,11 +446,11 @@ def _format_operation_dict(operation, parameters): try: return operation % formatted_params - except KeyError as exc: + except (KeyError, ValueError, TypeError) as exc: raise exceptions.ProgrammingError(exc) -def _format_operation(operation, parameters=None): +def _format_operation(operation, parameters): """Formats parameters in operation in way BigQuery expects. Args: @@ -474,10 +467,94 @@ def _format_operation(operation, parameters=None): if a parameter used in the operation is not found in the ``parameters`` argument. """ - if parameters is None: - return operation + if parameters is None or len(parameters) == 0: + return operation.replace("%%", "%"), None # Still do percent de-escaping. + + operation, parameter_types = _extract_types(operation) + if parameter_types is None: + raise exceptions.ProgrammingError( + f"Parameters were provided, but {repr(operation)} has no placeholders." + ) if isinstance(parameters, collections_abc.Mapping): - return _format_operation_dict(operation, parameters) + return _format_operation_dict(operation, parameters), parameter_types + + return _format_operation_list(operation, parameters), parameter_types + + +def _extract_types( + operation, + extra_type_sub=re.compile( + r""" + (%*) # Extra %s. 
We'll deal with these in the replacement code + + % # Beginning of replacement, %s, %(...)s + + (?:\( # Begin of optional name and/or type + ([^:)]*) # name + (?:: # ':' introduces type + ( # start of type group + [a-zA-Z0-9<>, ]+ # First part, no parens + + (?: # start sets of parens + non-paren text + \([0-9 ,]+\) # comma-separated groups of digits in parens + # (e.g. string(10)) + (?=[, >)]) # Must be followed by ,>) or space + [a-zA-Z0-9<>, ]* # Optional non-paren chars + )* # Can be zero or more of parens and following text + ) # end of type group + )? # close type clause ":type" + \))? # End of optional name and/or type + + s # End of replacement + """, + re.VERBOSE, + ).sub, +): + """Remove type information from parameter placeholders. + + For every parameter of the form %(name:type)s, replace with %(name)s and add the + item name->type to dict that's returned. + + Returns operation without type information and a dictionary of names and types. + """ + parameter_types = None + + def repl(m): + nonlocal parameter_types + prefix, name, type_ = m.groups() + if len(prefix) % 2: + # The prefix has an odd number of %s, the last of which + # escapes the % we're looking for, so we don't want to + # change anything. + return m.group(0) + + try: + if name: + if not parameter_types: + parameter_types = {} + if type_: + if name in parameter_types: + if type_ != parameter_types[name]: + raise exceptions.ProgrammingError( + f"Conflicting types for {name}: " + f"{parameter_types[name]} and {type_}." + ) + else: + parameter_types[name] = type_ + else: + if not isinstance(parameter_types, dict): + raise TypeError() + + return f"{prefix}%({name})s" + else: + if parameter_types is None: + parameter_types = [] + parameter_types.append(type_) + return f"{prefix}%s" + except (AttributeError, TypeError): + raise exceptions.ProgrammingError( + f"{repr(operation)} mixes named and unamed parameters." 
+ ) - return _format_operation_list(operation, parameters) + return extra_type_sub(repl, operation), parameter_types diff --git a/google/cloud/bigquery/dbapi/types.py b/google/cloud/bigquery/dbapi/types.py index 14917820c..717593ae1 100644 --- a/google/cloud/bigquery/dbapi/types.py +++ b/google/cloud/bigquery/dbapi/types.py @@ -30,16 +30,28 @@ TimestampFromTicks = datetime.datetime.fromtimestamp -def Binary(string): +def Binary(data): """Contruct a DB-API binary value. Args: - string (str): A string to encode as a binary value. + data (bytes-like): An object containing binary data and that + can be converted to bytes with the `bytes` builtin. Returns: - bytes: The UTF-8 encoded bytes representing the string. + bytes: The binary data as a bytes object. """ - return string.encode("utf-8") + if isinstance(data, int): + # This is not the conversion we're looking for, because it + # will simply create a bytes object of the given size. + raise TypeError("cannot convert `int` object to binary") + + try: + return bytes(data) + except TypeError: + if isinstance(data, str): + return data.encode("utf-8") + else: + raise def TimeFromTicks(ticks, tz=None): @@ -78,7 +90,7 @@ def __eq__(self, other): STRING = "STRING" BINARY = _DBAPITypeObject("BYTES", "RECORD", "STRUCT") NUMBER = _DBAPITypeObject( - "INTEGER", "INT64", "FLOAT", "FLOAT64", "NUMERIC", "BOOLEAN", "BOOL" + "INTEGER", "INT64", "FLOAT", "FLOAT64", "NUMERIC", "BIGNUMERIC", "BOOLEAN", "BOOL" ) DATETIME = _DBAPITypeObject("TIMESTAMP", "DATE", "TIME", "DATETIME") ROWID = "ROWID" diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index 29fe543f6..d67cebd4c 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -15,9 +15,169 @@ import re import enum -import six +import itertools -from google.cloud.bigquery_v2.gapic import enums as gapic_enums +from google.cloud.bigquery_v2 import types as gapic_types +from google.cloud.bigquery.query import ScalarQueryParameterType + + 
+class AutoRowIDs(enum.Enum): + """How to handle automatic insert IDs when inserting rows as a stream.""" + + DISABLED = enum.auto() + GENERATE_UUID = enum.auto() + + +class Compression(object): + """The compression type to use for exported files. The default value is + :attr:`NONE`. + + :attr:`DEFLATE` and :attr:`SNAPPY` are + only supported for Avro. + """ + + GZIP = "GZIP" + """Specifies GZIP format.""" + + DEFLATE = "DEFLATE" + """Specifies DEFLATE format.""" + + SNAPPY = "SNAPPY" + """Specifies SNAPPY format.""" + + NONE = "NONE" + """Specifies no compression.""" + + +class DecimalTargetType: + """The data types that could be used as a target type when converting decimal values. + + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#DecimalTargetType + + .. versionadded:: 2.21.0 + """ + + NUMERIC = "NUMERIC" + """Decimal values could be converted to NUMERIC type.""" + + BIGNUMERIC = "BIGNUMERIC" + """Decimal values could be converted to BIGNUMERIC type.""" + + STRING = "STRING" + """Decimal values could be converted to STRING type.""" + + +class CreateDisposition(object): + """Specifies whether the job is allowed to create new tables. The default + value is :attr:`CREATE_IF_NEEDED`. + + Creation, truncation and append actions occur as one atomic update + upon job completion. + """ + + CREATE_IF_NEEDED = "CREATE_IF_NEEDED" + """If the table does not exist, BigQuery creates the table.""" + + CREATE_NEVER = "CREATE_NEVER" + """The table must already exist. If it does not, a 'notFound' error is + returned in the job result.""" + + +class DestinationFormat(object): + """The exported file format. The default value is :attr:`CSV`. + + Tables with nested or repeated fields cannot be exported as CSV. 
+ """ + + CSV = "CSV" + """Specifies CSV format.""" + + NEWLINE_DELIMITED_JSON = "NEWLINE_DELIMITED_JSON" + """Specifies newline delimited JSON format.""" + + AVRO = "AVRO" + """Specifies Avro format.""" + + PARQUET = "PARQUET" + """Specifies Parquet format.""" + + +class Encoding(object): + """The character encoding of the data. The default is :attr:`UTF_8`. + + BigQuery decodes the data after the raw, binary data has been + split using the values of the quote and fieldDelimiter properties. + """ + + UTF_8 = "UTF-8" + """Specifies UTF-8 encoding.""" + + ISO_8859_1 = "ISO-8859-1" + """Specifies ISO-8859-1 encoding.""" + + +class QueryPriority(object): + """Specifies a priority for the query. The default value is + :attr:`INTERACTIVE`. + """ + + INTERACTIVE = "INTERACTIVE" + """Specifies interactive priority.""" + + BATCH = "BATCH" + """Specifies batch priority.""" + + +class SchemaUpdateOption(object): + """Specifies an update to the destination table schema as a side effect of + a load job. + """ + + ALLOW_FIELD_ADDITION = "ALLOW_FIELD_ADDITION" + """Allow adding a nullable field to the schema.""" + + ALLOW_FIELD_RELAXATION = "ALLOW_FIELD_RELAXATION" + """Allow relaxing a required field in the original schema to nullable.""" + + +class SourceFormat(object): + """The format of the data files. The default value is :attr:`CSV`. + + Note that the set of allowed values for loading data is different + than the set used for external data sources (see + :class:`~google.cloud.bigquery.external_config.ExternalSourceFormat`). 
+ """ + + CSV = "CSV" + """Specifies CSV format.""" + + DATASTORE_BACKUP = "DATASTORE_BACKUP" + """Specifies datastore backup format""" + + NEWLINE_DELIMITED_JSON = "NEWLINE_DELIMITED_JSON" + """Specifies newline delimited JSON format.""" + + AVRO = "AVRO" + """Specifies Avro format.""" + + PARQUET = "PARQUET" + """Specifies Parquet format.""" + + ORC = "ORC" + """Specifies Orc format.""" + + +class KeyResultStatementKind: + """Determines which statement in the script represents the "key result". + + The "key result" is used to populate the schema and query results of the script job. + + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#keyresultstatementkind + """ + + KEY_RESULT_STATEMENT_KIND_UNSPECIFIED = "KEY_RESULT_STATEMENT_KIND_UNSPECIFIED" + LAST = "LAST" + FIRST_SELECT = "FIRST_SELECT" _SQL_SCALAR_TYPES = frozenset( @@ -31,8 +191,11 @@ "DATE", "TIME", "DATETIME", + "INTERVAL", "GEOGRAPHY", "NUMERIC", + "BIGNUMERIC", + "JSON", ) ) @@ -46,20 +209,20 @@ def _make_sql_scalars_enum(): "StandardSqlDataTypes", ( (member.name, member.value) - for member in gapic_enums.StandardSqlDataType.TypeKind + for member in gapic_types.StandardSqlDataType.TypeKind if member.name in _SQL_SCALAR_TYPES ), ) # make sure the docstring for the new enum is also correct - orig_doc = gapic_enums.StandardSqlDataType.TypeKind.__doc__ + orig_doc = gapic_types.StandardSqlDataType.TypeKind.__doc__ skip_pattern = re.compile( "|".join(_SQL_NONSCALAR_TYPES) + "|because a JSON object" # the second description line of STRUCT member ) new_doc = "\n".join( - six.moves.filterfalse(skip_pattern.search, orig_doc.splitlines()) + itertools.filterfalse(skip_pattern.search, orig_doc.splitlines()) ) new_enum.__doc__ = "An Enum of scalar SQL types.\n" + new_doc @@ -80,7 +243,8 @@ class SqlTypeNames(str, enum.Enum): INT64 = "INTEGER" FLOAT = "FLOAT" FLOAT64 = "FLOAT" - NUMERIC = "NUMERIC" + DECIMAL = NUMERIC = "NUMERIC" + BIGDECIMAL = BIGNUMERIC = "BIGNUMERIC" BOOLEAN = "BOOLEAN" BOOL = 
"BOOLEAN" GEOGRAPHY = "GEOGRAPHY" # NOTE: not available in legacy types @@ -90,3 +254,63 @@ class SqlTypeNames(str, enum.Enum): DATE = "DATE" TIME = "TIME" DATETIME = "DATETIME" + + +class SqlParameterScalarTypes: + """Supported scalar SQL query parameter types as type objects.""" + + BOOL = ScalarQueryParameterType("BOOL") + BOOLEAN = ScalarQueryParameterType("BOOL") + BIGDECIMAL = ScalarQueryParameterType("BIGNUMERIC") + BIGNUMERIC = ScalarQueryParameterType("BIGNUMERIC") + BYTES = ScalarQueryParameterType("BYTES") + DATE = ScalarQueryParameterType("DATE") + DATETIME = ScalarQueryParameterType("DATETIME") + DECIMAL = ScalarQueryParameterType("NUMERIC") + FLOAT = ScalarQueryParameterType("FLOAT64") + FLOAT64 = ScalarQueryParameterType("FLOAT64") + GEOGRAPHY = ScalarQueryParameterType("GEOGRAPHY") + INT64 = ScalarQueryParameterType("INT64") + INTEGER = ScalarQueryParameterType("INT64") + NUMERIC = ScalarQueryParameterType("NUMERIC") + STRING = ScalarQueryParameterType("STRING") + TIME = ScalarQueryParameterType("TIME") + TIMESTAMP = ScalarQueryParameterType("TIMESTAMP") + + +class WriteDisposition(object): + """Specifies the action that occurs if destination table already exists. + + The default value is :attr:`WRITE_APPEND`. + + Each action is atomic and only occurs if BigQuery is able to complete + the job successfully. Creation, truncation and append actions occur as one + atomic update upon job completion. + """ + + WRITE_APPEND = "WRITE_APPEND" + """If the table already exists, BigQuery appends the data to the table.""" + + WRITE_TRUNCATE = "WRITE_TRUNCATE" + """If the table already exists, BigQuery overwrites the table data.""" + + WRITE_EMPTY = "WRITE_EMPTY" + """If the table already exists and contains data, a 'duplicate' error is + returned in the job result.""" + + +class DeterminismLevel: + """Specifies determinism level for JavaScript user-defined functions (UDFs). 
+ + https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#DeterminismLevel + """ + + DETERMINISM_LEVEL_UNSPECIFIED = "DETERMINISM_LEVEL_UNSPECIFIED" + """The determinism of the UDF is unspecified.""" + + DETERMINISTIC = "DETERMINISTIC" + """The UDF is deterministic, meaning that 2 function calls with the same inputs + always produce the same result, even across 2 query runs.""" + + NOT_DETERMINISTIC = "NOT_DETERMINISTIC" + """The UDF is not deterministic.""" diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py index 93490ef97..6e5c27eb1 100644 --- a/google/cloud/bigquery/exceptions.py +++ b/google/cloud/bigquery/exceptions.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,5 +13,9 @@ # limitations under the License. -class PyarrowMissingWarning(DeprecationWarning): - pass +class BigQueryError(Exception): + """Base class for all custom exceptions defined by the BigQuery client.""" + + +class LegacyBigQueryStorageError(BigQueryError): + """Raised when too old a version of BigQuery Storage extra is detected at runtime.""" diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py index 112dfdba4..f1692ba50 100644 --- a/google/cloud/bigquery/external_config.py +++ b/google/cloud/bigquery/external_config.py @@ -22,11 +22,13 @@ import base64 import copy +from typing import FrozenSet, Iterable, Optional from google.cloud.bigquery._helpers import _to_bytes from google.cloud.bigquery._helpers import _bytes_to_json from google.cloud.bigquery._helpers import _int_or_none from google.cloud.bigquery._helpers import _str_or_none +from google.cloud.bigquery.format_options import ParquetOptions from google.cloud.bigquery.schema import SchemaField @@ -53,6 +55,12 @@ class ExternalSourceFormat(object): DATASTORE_BACKUP = 
"DATASTORE_BACKUP" """Specifies datastore backup format""" + ORC = "ORC" + """Specifies ORC format.""" + + PARQUET = "PARQUET" + """Specifies Parquet format.""" + BIGTABLE = "BIGTABLE" """Specifies Bigtable format.""" @@ -149,7 +157,7 @@ def type_(self): def type_(self, value): self._properties["type"] = value - def to_api_repr(self): + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -159,7 +167,7 @@ def to_api_repr(self): return copy.deepcopy(self._properties) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "BigtableColumn": """Factory: construct a :class:`~.external_config.BigtableColumn` instance given its API representation. @@ -251,7 +259,7 @@ def columns(self): def columns(self, value): self._properties["columns"] = [col.to_api_repr() for col in value] - def to_api_repr(self): + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -261,7 +269,7 @@ def to_api_repr(self): return copy.deepcopy(self._properties) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "BigtableColumnFamily": """Factory: construct a :class:`~.external_config.BigtableColumnFamily` instance given its API representation. @@ -333,7 +341,7 @@ def column_families(self): def column_families(self, value): self._properties["columnFamilies"] = [cf.to_api_repr() for cf in value] - def to_api_repr(self): + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -343,7 +351,7 @@ def to_api_repr(self): return copy.deepcopy(self._properties) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "BigtableOptions": """Factory: construct a :class:`~.external_config.BigtableOptions` instance given its API representation. 
@@ -450,7 +458,7 @@ def skip_leading_rows(self): def skip_leading_rows(self, value): self._properties["skipLeadingRows"] = str(value) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -459,7 +467,7 @@ def to_api_repr(self): return copy.deepcopy(self._properties) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "CSVOptions": """Factory: construct a :class:`~.external_config.CSVOptions` instance given its API representation. @@ -513,7 +521,7 @@ def range(self): def range(self, value): self._properties["range"] = value - def to_api_repr(self): + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -522,7 +530,7 @@ def to_api_repr(self): return copy.deepcopy(self._properties) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "GoogleSheetsOptions": """Factory: construct a :class:`~.external_config.GoogleSheetsOptions` instance given its API representation. @@ -540,7 +548,7 @@ def from_api_repr(cls, resource): return config -_OPTION_CLASSES = (BigtableOptions, CSVOptions, GoogleSheetsOptions) +_OPTION_CLASSES = (BigtableOptions, CSVOptions, GoogleSheetsOptions, ParquetOptions) class HivePartitioningOptions(object): @@ -601,7 +609,7 @@ def require_partition_filter(self): def require_partition_filter(self, value): self._properties["requirePartitionFilter"] = value - def to_api_repr(self): + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -610,7 +618,7 @@ def to_api_repr(self): return copy.deepcopy(self._properties) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "HivePartitioningOptions": """Factory: construct a :class:`~.external_config.HivePartitioningOptions` instance given its API representation. 
@@ -686,6 +694,28 @@ def compression(self): def compression(self, value): self._properties["compression"] = value + @property + def decimal_target_types(self) -> Optional[FrozenSet[str]]: + """Possible SQL data types to which the source decimal values are converted. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.decimal_target_types + + .. versionadded:: 2.21.0 + """ + prop = self._properties.get("decimalTargetTypes") + if prop is not None: + prop = frozenset(prop) + return prop + + @decimal_target_types.setter + def decimal_target_types(self, value: Optional[Iterable[str]]): + if value is not None: + self._properties["decimalTargetTypes"] = list(value) + else: + if "decimalTargetTypes" in self._properties: + del self._properties["decimalTargetTypes"] + @property def hive_partitioning(self): """Optional[:class:`~.external_config.HivePartitioningOptions`]: [Beta] When set, \ @@ -760,6 +790,23 @@ def schema(self): prop = self._properties.get("schema", {}) return [SchemaField.from_api_repr(field) for field in prop.get("fields", [])] + @property + def connection_id(self): + """Optional[str]: [Experimental] ID of a BigQuery Connection API + resource. + + .. WARNING:: + + This feature is experimental. Pre-GA features may have limited + support, and changes to pre-GA features may not be compatible with + other pre-GA versions. + """ + return self._properties.get("connectionId") + + @connection_id.setter + def connection_id(self, value): + self._properties["connectionId"] = value + @schema.setter def schema(self, value): prop = value @@ -767,7 +814,26 @@ def schema(self, value): prop = {"fields": [field.to_api_repr() for field in value]} self._properties["schema"] = prop - def to_api_repr(self): + @property + def parquet_options(self): + """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Additional + properties to set if ``sourceFormat`` is set to PARQUET. 
+ + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.parquet_options + """ + if self.source_format != ExternalSourceFormat.PARQUET: + return None + return self._options + + @parquet_options.setter + def parquet_options(self, value): + if self.source_format != ExternalSourceFormat.PARQUET: + msg = f"Cannot set Parquet options, source format is {self.source_format}" + raise TypeError(msg) + self._options = value + + def to_api_repr(self) -> dict: """Build an API representation of this object. Returns: @@ -782,7 +848,7 @@ def to_api_repr(self): return config @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "ExternalConfig": """Factory: construct an :class:`~.external_config.ExternalConfig` instance given its API representation. diff --git a/google/cloud/bigquery/format_options.py b/google/cloud/bigquery/format_options.py new file mode 100644 index 000000000..2c9a2ce20 --- /dev/null +++ b/google/cloud/bigquery/format_options.py @@ -0,0 +1,80 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +from typing import Dict + + +class ParquetOptions: + """Additional options if the PARQUET source format is used.""" + + _SOURCE_FORMAT = "PARQUET" + _RESOURCE_NAME = "parquetOptions" + + def __init__(self): + self._properties = {} + + @property + def enum_as_string(self) -> bool: + """Indicates whether to infer Parquet ENUM logical type as STRING instead of + BYTES by default. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enum_as_string + """ + return self._properties.get("enumAsString") + + @enum_as_string.setter + def enum_as_string(self, value: bool) -> None: + self._properties["enumAsString"] = value + + @property + def enable_list_inference(self) -> bool: + """Indicates whether to use schema inference specifically for Parquet LIST + logical type. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enable_list_inference + """ + return self._properties.get("enableListInference") + + @enable_list_inference.setter + def enable_list_inference(self, value: bool) -> None: + self._properties["enableListInference"] = value + + @classmethod + def from_api_repr(cls, resource: Dict[str, bool]) -> "ParquetOptions": + """Factory: construct an instance from a resource dict. + + Args: + resource (Dict[str, bool]): + Definition of a :class:`~.format_options.ParquetOptions` instance in + the same representation as is returned from the API. + + Returns: + :class:`~.format_options.ParquetOptions`: + Configuration parsed from ``resource``. + """ + config = cls() + config._properties = copy.deepcopy(resource) + return config + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + + Returns: + Dict[str, bool]: + A dictionary in the format used by the BigQuery API. 
+ """ + return copy.deepcopy(self._properties) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py deleted file mode 100644 index 20bce597a..000000000 --- a/google/cloud/bigquery/job.py +++ /dev/null @@ -1,3840 +0,0 @@ -# Copyright 2015 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Define API Jobs.""" - -from __future__ import division - -import concurrent.futures -import copy -import re -import threading - -import requests -import six -from six.moves import http_client - -import google.api_core.future.polling -from google.cloud import exceptions -from google.cloud.exceptions import NotFound -from google.cloud.bigquery.dataset import Dataset -from google.cloud.bigquery.dataset import DatasetListItem -from google.cloud.bigquery.dataset import DatasetReference -from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration -from google.cloud.bigquery.external_config import ExternalConfig -from google.cloud.bigquery.external_config import HivePartitioningOptions -from google.cloud.bigquery import _helpers -from google.cloud.bigquery.query import _query_param_from_api_repr -from google.cloud.bigquery.query import ArrayQueryParameter -from google.cloud.bigquery.query import ScalarQueryParameter -from google.cloud.bigquery.query import StructQueryParameter -from google.cloud.bigquery.query import UDFResource -from google.cloud.bigquery.retry import DEFAULT_RETRY -from google.cloud.bigquery.routine import 
RoutineReference -from google.cloud.bigquery.schema import SchemaField -from google.cloud.bigquery.schema import _to_schema_fields -from google.cloud.bigquery.table import _EmptyRowIterator -from google.cloud.bigquery.table import RangePartitioning -from google.cloud.bigquery.table import _table_arg_to_table_ref -from google.cloud.bigquery.table import TableReference -from google.cloud.bigquery.table import Table -from google.cloud.bigquery.table import TimePartitioning - -_DONE_STATE = "DONE" -_STOPPED_REASON = "stopped" -_TIMEOUT_BUFFER_SECS = 0.1 -_CONTAINS_ORDER_BY = re.compile(r"ORDER\s+BY", re.IGNORECASE) - -_ERROR_REASON_TO_EXCEPTION = { - "accessDenied": http_client.FORBIDDEN, - "backendError": http_client.INTERNAL_SERVER_ERROR, - "billingNotEnabled": http_client.FORBIDDEN, - "billingTierLimitExceeded": http_client.BAD_REQUEST, - "blocked": http_client.FORBIDDEN, - "duplicate": http_client.CONFLICT, - "internalError": http_client.INTERNAL_SERVER_ERROR, - "invalid": http_client.BAD_REQUEST, - "invalidQuery": http_client.BAD_REQUEST, - "notFound": http_client.NOT_FOUND, - "notImplemented": http_client.NOT_IMPLEMENTED, - "quotaExceeded": http_client.FORBIDDEN, - "rateLimitExceeded": http_client.FORBIDDEN, - "resourceInUse": http_client.BAD_REQUEST, - "resourcesExceeded": http_client.BAD_REQUEST, - "responseTooLarge": http_client.FORBIDDEN, - "stopped": http_client.OK, - "tableUnavailable": http_client.BAD_REQUEST, -} - - -def _error_result_to_exception(error_result): - """Maps BigQuery error reasons to an exception. - - The reasons and their matching HTTP status codes are documented on - the `troubleshooting errors`_ page. - - .. _troubleshooting errors: https://cloud.google.com/bigquery\ - /troubleshooting-errors - - Args: - error_result (Mapping[str, str]): The error result from BigQuery. - - Returns: - google.cloud.exceptions.GoogleCloudError: The mapped exception. 
- """ - reason = error_result.get("reason") - status_code = _ERROR_REASON_TO_EXCEPTION.get( - reason, http_client.INTERNAL_SERVER_ERROR - ) - return exceptions.from_http_status( - status_code, error_result.get("message", ""), errors=[error_result] - ) - - -def _contains_order_by(query): - """Do we need to preserve the order of the query results? - - This function has known false positives, such as with ordered window - functions: - - .. code-block:: sql - - SELECT SUM(x) OVER ( - window_name - PARTITION BY... - ORDER BY... - window_frame_clause) - FROM ... - - This false positive failure case means the behavior will be correct, but - downloading results with the BigQuery Storage API may be slower than it - otherwise would. This is preferable to the false negative case, where - results are expected to be in order but are not (due to parallel reads). - """ - return query and _CONTAINS_ORDER_BY.search(query) - - -class Compression(object): - """The compression type to use for exported files. The default value is - :attr:`NONE`. - - :attr:`DEFLATE` and :attr:`SNAPPY` are - only supported for Avro. - """ - - GZIP = "GZIP" - """Specifies GZIP format.""" - - DEFLATE = "DEFLATE" - """Specifies DEFLATE format.""" - - SNAPPY = "SNAPPY" - """Specifies SNAPPY format.""" - - NONE = "NONE" - """Specifies no compression.""" - - -class CreateDisposition(object): - """Specifies whether the job is allowed to create new tables. The default - value is :attr:`CREATE_IF_NEEDED`. - - Creation, truncation and append actions occur as one atomic update - upon job completion. - """ - - CREATE_IF_NEEDED = "CREATE_IF_NEEDED" - """If the table does not exist, BigQuery creates the table.""" - - CREATE_NEVER = "CREATE_NEVER" - """The table must already exist. If it does not, a 'notFound' error is - returned in the job result.""" - - -class DestinationFormat(object): - """The exported file format. The default value is :attr:`CSV`. 
- - Tables with nested or repeated fields cannot be exported as CSV. - """ - - CSV = "CSV" - """Specifies CSV format.""" - - NEWLINE_DELIMITED_JSON = "NEWLINE_DELIMITED_JSON" - """Specifies newline delimited JSON format.""" - - AVRO = "AVRO" - """Specifies Avro format.""" - - -class Encoding(object): - """The character encoding of the data. The default is :attr:`UTF_8`. - - BigQuery decodes the data after the raw, binary data has been - split using the values of the quote and fieldDelimiter properties. - """ - - UTF_8 = "UTF-8" - """Specifies UTF-8 encoding.""" - - ISO_8859_1 = "ISO-8859-1" - """Specifies ISO-8859-1 encoding.""" - - -class QueryPriority(object): - """Specifies a priority for the query. The default value is - :attr:`INTERACTIVE`. - """ - - INTERACTIVE = "INTERACTIVE" - """Specifies interactive priority.""" - - BATCH = "BATCH" - """Specifies batch priority.""" - - -class SourceFormat(object): - """The format of the data files. The default value is :attr:`CSV`. - - Note that the set of allowed values for loading data is different - than the set used for external data sources (see - :class:`~google.cloud.bigquery.external_config.ExternalSourceFormat`). - """ - - CSV = "CSV" - """Specifies CSV format.""" - - DATASTORE_BACKUP = "DATASTORE_BACKUP" - """Specifies datastore backup format""" - - NEWLINE_DELIMITED_JSON = "NEWLINE_DELIMITED_JSON" - """Specifies newline delimited JSON format.""" - - AVRO = "AVRO" - """Specifies Avro format.""" - - PARQUET = "PARQUET" - """Specifies Parquet format.""" - - ORC = "ORC" - """Specifies Orc format.""" - - -class WriteDisposition(object): - """Specifies the action that occurs if destination table already exists. - - The default value is :attr:`WRITE_APPEND`. - - Each action is atomic and only occurs if BigQuery is able to complete - the job successfully. Creation, truncation and append actions occur as one - atomic update upon job completion. 
- """ - - WRITE_APPEND = "WRITE_APPEND" - """If the table already exists, BigQuery appends the data to the table.""" - - WRITE_TRUNCATE = "WRITE_TRUNCATE" - """If the table already exists, BigQuery overwrites the table data.""" - - WRITE_EMPTY = "WRITE_EMPTY" - """If the table already exists and contains data, a 'duplicate' error is - returned in the job result.""" - - -class SchemaUpdateOption(object): - """Specifies an update to the destination table schema as a side effect of - a load job. - """ - - ALLOW_FIELD_ADDITION = "ALLOW_FIELD_ADDITION" - """Allow adding a nullable field to the schema.""" - - ALLOW_FIELD_RELAXATION = "ALLOW_FIELD_RELAXATION" - """Allow relaxing a required field in the original schema to nullable.""" - - -class _JobReference(object): - """A reference to a job. - - Args: - job_id (str): ID of the job to run. - project (str): ID of the project where the job runs. - location (str): Location of where the job runs. - """ - - def __init__(self, job_id, project, location): - self._properties = {"jobId": job_id, "projectId": project} - # The location field must not be populated if it is None. 
- if location: - self._properties["location"] = location - - @property - def job_id(self): - """str: ID of the job.""" - return self._properties.get("jobId") - - @property - def project(self): - """str: ID of the project where the job runs.""" - return self._properties.get("projectId") - - @property - def location(self): - """str: Location where the job runs.""" - return self._properties.get("location") - - def _to_api_repr(self): - """Returns the API resource representation of the job reference.""" - return copy.deepcopy(self._properties) - - @classmethod - def _from_api_repr(cls, resource): - """Returns a job reference for an API resource representation.""" - job_id = resource.get("jobId") - project = resource.get("projectId") - location = resource.get("location") - job_ref = cls(job_id, project, location) - return job_ref - - -class _AsyncJob(google.api_core.future.polling.PollingFuture): - """Base class for asynchronous jobs. - - Args: - job_id (Union[str, _JobReference]): - Job's ID in the project associated with the client or a - fully-qualified job reference. - client (google.cloud.bigquery.client.Client): - Client which holds credentials and project configuration. - """ - - def __init__(self, job_id, client): - super(_AsyncJob, self).__init__() - - # The job reference can be either a plain job ID or the full resource. - # Populate the properties dictionary consistently depending on what has - # been passed in. - job_ref = job_id - if not isinstance(job_id, _JobReference): - job_ref = _JobReference(job_id, client.project, None) - self._properties = {"jobReference": job_ref._to_api_repr()} - - self._client = client - self._result_set = False - self._completion_lock = threading.Lock() - - @property - def job_id(self): - """str: ID of the job.""" - return _helpers._get_sub_prop(self._properties, ["jobReference", "jobId"]) - - @property - def parent_job_id(self): - """Return the ID of the parent job. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics.FIELDS.parent_job_id - - Returns: - Optional[str]: parent job id. - """ - return _helpers._get_sub_prop(self._properties, ["statistics", "parentJobId"]) - - @property - def script_statistics(self): - resource = _helpers._get_sub_prop( - self._properties, ["statistics", "scriptStatistics"] - ) - if resource is None: - return None - return ScriptStatistics(resource) - - @property - def num_child_jobs(self): - """The number of child jobs executed. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics.FIELDS.num_child_jobs - - Returns: - int - """ - count = _helpers._get_sub_prop(self._properties, ["statistics", "numChildJobs"]) - return int(count) if count is not None else 0 - - @property - def project(self): - """Project bound to the job. - - Returns: - str: the project (derived from the client). - """ - return _helpers._get_sub_prop(self._properties, ["jobReference", "projectId"]) - - @property - def location(self): - """str: Location where the job runs.""" - return _helpers._get_sub_prop(self._properties, ["jobReference", "location"]) - - def _require_client(self, client): - """Check client or verify over-ride. - - Args: - client (Optional[google.cloud.bigquery.client.Client]): - the client to use. If not passed, falls back to the - ``client`` stored on the current dataset. - - Returns: - google.cloud.bigquery.client.Client: - The client passed in or the currently bound client. - """ - if client is None: - client = self._client - return client - - @property - def job_type(self): - """Type of job. - - Returns: - str: one of 'load', 'copy', 'extract', 'query'. - """ - return self._JOB_TYPE - - @property - def path(self): - """URL path for the job's APIs. - - Returns: - str: the path based on project and job ID. 
- """ - return "/projects/%s/jobs/%s" % (self.project, self.job_id) - - @property - def labels(self): - """Dict[str, str]: Labels for the job.""" - return self._properties.setdefault("labels", {}) - - @property - def etag(self): - """ETag for the job resource. - - Returns: - Optional[str]: the ETag (None until set from the server). - """ - return self._properties.get("etag") - - @property - def self_link(self): - """URL for the job resource. - - Returns: - Optional[str]: the URL (None until set from the server). - """ - return self._properties.get("selfLink") - - @property - def user_email(self): - """E-mail address of user who submitted the job. - - Returns: - Optional[str]: the URL (None until set from the server). - """ - return self._properties.get("user_email") - - @property - def created(self): - """Datetime at which the job was created. - - Returns: - Optional[datetime.datetime]: - the creation time (None until set from the server). - """ - statistics = self._properties.get("statistics") - if statistics is not None: - millis = statistics.get("creationTime") - if millis is not None: - return _helpers._datetime_from_microseconds(millis * 1000.0) - - @property - def started(self): - """Datetime at which the job was started. - - Returns: - Optional[datetime.datetime]: - the start time (None until set from the server). - """ - statistics = self._properties.get("statistics") - if statistics is not None: - millis = statistics.get("startTime") - if millis is not None: - return _helpers._datetime_from_microseconds(millis * 1000.0) - - @property - def ended(self): - """Datetime at which the job finished. - - Returns: - Optional[datetime.datetime]: - the end time (None until set from the server). 
- """ - statistics = self._properties.get("statistics") - if statistics is not None: - millis = statistics.get("endTime") - if millis is not None: - return _helpers._datetime_from_microseconds(millis * 1000.0) - - def _job_statistics(self): - """Helper for job-type specific statistics-based properties.""" - statistics = self._properties.get("statistics", {}) - return statistics.get(self._JOB_TYPE, {}) - - @property - def error_result(self): - """Error information about the job as a whole. - - Returns: - Optional[Mapping]: the error information (None until set from the server). - """ - status = self._properties.get("status") - if status is not None: - return status.get("errorResult") - - @property - def errors(self): - """Information about individual errors generated by the job. - - Returns: - Optional[List[Mapping]]: - the error information (None until set from the server). - """ - status = self._properties.get("status") - if status is not None: - return status.get("errors") - - @property - def state(self): - """Status of the job. - - Returns: - Optional[str]: - the state (None until set from the server). - """ - status = self._properties.get("status") - if status is not None: - return status.get("state") - - def _scrub_local_properties(self, cleaned): - """Helper: handle subclass properties in cleaned.""" - pass - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - raise NotImplementedError("Abstract") - - def _set_properties(self, api_response): - """Update properties from resource in body of ``api_response`` - - Args: - api_response (Dict): response returned from an API call. 
- """ - cleaned = api_response.copy() - self._scrub_local_properties(cleaned) - - statistics = cleaned.get("statistics", {}) - if "creationTime" in statistics: - statistics["creationTime"] = float(statistics["creationTime"]) - if "startTime" in statistics: - statistics["startTime"] = float(statistics["startTime"]) - if "endTime" in statistics: - statistics["endTime"] = float(statistics["endTime"]) - - self._properties.clear() - self._properties.update(cleaned) - self._copy_configuration_properties(cleaned.get("configuration", {})) - - # For Future interface - self._set_future_result() - - @classmethod - def _get_resource_config(cls, resource): - """Helper for :meth:`from_api_repr` - - Args: - resource (Dict): resource for the job. - - Returns: - (str, Dict): - tuple (string, dict), where the first element is the - job ID and the second contains job-specific configuration. - - Raises: - KeyError: - If the resource has no identifier, or - is missing the appropriate configuration. - """ - if "jobReference" not in resource or "jobId" not in resource["jobReference"]: - raise KeyError( - "Resource lacks required identity information: " - '["jobReference"]["jobId"]' - ) - job_id = resource["jobReference"]["jobId"] - if ( - "configuration" not in resource - or cls._JOB_TYPE not in resource["configuration"] - ): - raise KeyError( - "Resource lacks required configuration: " - '["configuration"]["%s"]' % cls._JOB_TYPE - ) - return job_id, resource["configuration"] - - def to_api_repr(self): - """Generate a resource for the job.""" - raise NotImplementedError("Abstract") - - _build_resource = to_api_repr # backward-compatibility alias - - def _begin(self, client=None, retry=DEFAULT_RETRY, timeout=None): - """API call: begin the job via a POST request - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert - - Args: - client (Optional[google.cloud.bigquery.client.Client]): - The client to use. 
If not passed, falls back to the ``client`` - associated with the job object or``NoneType`` - retry (Optional[google.api_core.retry.Retry]): - How to retry the RPC. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - - Raises: - ValueError: - If the job has already begun. - """ - if self.state is not None: - raise ValueError("Job already begun.") - - client = self._require_client(client) - path = "/projects/%s/jobs" % (self.project,) - - # jobs.insert is idempotent because we ensure that every new - # job has an ID. - span_attributes = {"path": path} - api_response = client._call_api( - retry, - span_name="BigQuery.job.begin", - span_attributes=span_attributes, - job_ref=self, - method="POST", - path=path, - data=self.to_api_repr(), - timeout=timeout, - ) - self._set_properties(api_response) - - def exists(self, client=None, retry=DEFAULT_RETRY, timeout=None): - """API call: test for the existence of the job via a GET request - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get - - Args: - client (Optional[google.cloud.bigquery.client.Client]): - the client to use. If not passed, falls back to the - ``client`` stored on the current dataset. - - retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - - Returns: - bool: Boolean indicating existence of the job. 
- """ - client = self._require_client(client) - - extra_params = {"fields": "id"} - if self.location: - extra_params["location"] = self.location - - try: - span_attributes = {"path": self.path} - - client._call_api( - retry, - span_name="BigQuery.job.exists", - span_attributes=span_attributes, - job_ref=self, - method="GET", - path=self.path, - query_params=extra_params, - timeout=timeout, - ) - except NotFound: - return False - else: - return True - - def reload(self, client=None, retry=DEFAULT_RETRY, timeout=None): - """API call: refresh job properties via a GET request. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get - - Args: - client (Optional[google.cloud.bigquery.client.Client]): - the client to use. If not passed, falls back to the - ``client`` stored on the current dataset. - - retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - """ - client = self._require_client(client) - - extra_params = {} - if self.location: - extra_params["location"] = self.location - span_attributes = {"path": self.path} - - api_response = client._call_api( - retry, - span_name="BigQuery.job.reload", - span_attributes=span_attributes, - job_ref=self, - method="GET", - path=self.path, - query_params=extra_params, - timeout=timeout, - ) - self._set_properties(api_response) - - def cancel(self, client=None, retry=DEFAULT_RETRY, timeout=None): - """API call: cancel job via a POST request - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/cancel - - Args: - client (Optional[google.cloud.bigquery.client.Client]): - the client to use. If not passed, falls back to the - ``client`` stored on the current dataset. - retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. 
- timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry`` - - Returns: - bool: Boolean indicating that the cancel request was sent. - """ - client = self._require_client(client) - - extra_params = {} - if self.location: - extra_params["location"] = self.location - - path = "{}/cancel".format(self.path) - span_attributes = {"path": path} - - api_response = client._call_api( - retry, - span_name="BigQuery.job.cancel", - span_attributes=span_attributes, - job_ref=self, - method="POST", - path=path, - query_params=extra_params, - timeout=timeout, - ) - self._set_properties(api_response["job"]) - # The Future interface requires that we return True if the *attempt* - # to cancel was successful. - return True - - # The following methods implement the PollingFuture interface. Note that - # the methods above are from the pre-Future interface and are left for - # compatibility. The only "overloaded" method is :meth:`cancel`, which - # satisfies both interfaces. - - def _set_future_result(self): - """Set the result or exception from the job if it is complete.""" - # This must be done in a lock to prevent the polling thread - # and main thread from both executing the completion logic - # at the same time. - with self._completion_lock: - # If the operation isn't complete or if the result has already been - # set, do not call set_result/set_exception again. - # Note: self._result_set is set to True in set_result and - # set_exception, in case those methods are invoked directly. - if self.state != _DONE_STATE or self._result_set: - return - - if self.error_result is not None: - exception = _error_result_to_exception(self.error_result) - self.set_exception(exception) - else: - self.set_result(self) - - def done(self, retry=DEFAULT_RETRY, timeout=None): - """Refresh the job and checks if it is complete. - - Args: - retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. 
- timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - - Returns: - bool: True if the job is complete, False otherwise. - """ - # Do not refresh is the state is already done, as the job will not - # change once complete. - if self.state != _DONE_STATE: - self.reload(retry=retry, timeout=timeout) - return self.state == _DONE_STATE - - def result(self, retry=DEFAULT_RETRY, timeout=None): - """Start the job and wait for it to complete and get the result. - - Args: - retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - If multiple requests are made under the hood, ``timeout`` - applies to each individual request. - - Returns: - _AsyncJob: This instance. - - Raises: - google.cloud.exceptions.GoogleCloudError: - if the job failed. - concurrent.futures.TimeoutError: - if the job did not complete in the given timeout. - """ - if self.state is None: - self._begin(retry=retry, timeout=timeout) - # TODO: modify PollingFuture so it can pass a retry argument to done(). - return super(_AsyncJob, self).result(timeout=timeout) - - def cancelled(self): - """Check if the job has been cancelled. - - This always returns False. It's not possible to check if a job was - cancelled in the API. This method is here to satisfy the interface - for :class:`google.api_core.future.Future`. - - Returns: - bool: False - """ - return ( - self.error_result is not None - and self.error_result.get("reason") == _STOPPED_REASON - ) - - -class _JobConfig(object): - """Abstract base class for job configuration objects. - - Args: - job_type (str): The key to use for the job configuration. 
- """ - - def __init__(self, job_type, **kwargs): - self._job_type = job_type - self._properties = {job_type: {}} - for prop, val in kwargs.items(): - setattr(self, prop, val) - - @property - def labels(self): - """Dict[str, str]: Labels for the job. - - This method always returns a dict. To change a job's labels, - modify the dict, then call ``Client.update_job``. To delete a - label, set its value to :data:`None` before updating. - - Raises: - ValueError: If ``value`` type is invalid. - """ - return self._properties.setdefault("labels", {}) - - @labels.setter - def labels(self, value): - if not isinstance(value, dict): - raise ValueError("Pass a dict") - self._properties["labels"] = value - - def _get_sub_prop(self, key, default=None): - """Get a value in the ``self._properties[self._job_type]`` dictionary. - - Most job properties are inside the dictionary related to the job type - (e.g. 'copy', 'extract', 'load', 'query'). Use this method to access - those properties:: - - self._get_sub_prop('destinationTable') - - This is equivalent to using the ``_helpers._get_sub_prop`` function:: - - _helpers._get_sub_prop( - self._properties, ['query', 'destinationTable']) - - Args: - key (str): - Key for the value to get in the - ``self._properties[self._job_type]`` dictionary. - default (Optional[object]): - Default value to return if the key is not found. - Defaults to :data:`None`. - - Returns: - object: The value if present or the default. - """ - return _helpers._get_sub_prop( - self._properties, [self._job_type, key], default=default - ) - - def _set_sub_prop(self, key, value): - """Set a value in the ``self._properties[self._job_type]`` dictionary. - - Most job properties are inside the dictionary related to the job type - (e.g. 'copy', 'extract', 'load', 'query'). 
Use this method to set - those properties:: - - self._set_sub_prop('useLegacySql', False) - - This is equivalent to using the ``_helper._set_sub_prop`` function:: - - _helper._set_sub_prop( - self._properties, ['query', 'useLegacySql'], False) - - Args: - key (str): - Key to set in the ``self._properties[self._job_type]`` - dictionary. - value (object): Value to set. - """ - _helpers._set_sub_prop(self._properties, [self._job_type, key], value) - - def _del_sub_prop(self, key): - """Remove ``key`` from the ``self._properties[self._job_type]`` dict. - - Most job properties are inside the dictionary related to the job type - (e.g. 'copy', 'extract', 'load', 'query'). Use this method to clear - those properties:: - - self._del_sub_prop('useLegacySql') - - This is equivalent to using the ``_helper._del_sub_prop`` function:: - - _helper._del_sub_prop( - self._properties, ['query', 'useLegacySql']) - - Args: - key (str): - Key to remove in the ``self._properties[self._job_type]`` - dictionary. - """ - _helpers._del_sub_prop(self._properties, [self._job_type, key]) - - def to_api_repr(self): - """Build an API representation of the job config. - - Returns: - Dict: A dictionary in the format used by the BigQuery API. - """ - return copy.deepcopy(self._properties) - - def _fill_from_default(self, default_job_config): - """Merge this job config with a default job config. - - The keys in this object take precedence over the keys in the default - config. The merge is done at the top-level as well as for keys one - level below the job type. - - Args: - default_job_config (google.cloud.bigquery.job._JobConfig): - The default job config that will be used to fill in self. - - Returns: - google.cloud.bigquery.job._JobConfig: A new (merged) job config. 
- """ - if self._job_type != default_job_config._job_type: - raise TypeError( - "attempted to merge two incompatible job types: " - + repr(self._job_type) - + ", " - + repr(default_job_config._job_type) - ) - - new_job_config = self.__class__() - - default_job_properties = copy.deepcopy(default_job_config._properties) - for key in self._properties: - if key != self._job_type: - default_job_properties[key] = self._properties[key] - - default_job_properties[self._job_type].update(self._properties[self._job_type]) - new_job_config._properties = default_job_properties - - return new_job_config - - @classmethod - def from_api_repr(cls, resource): - """Factory: construct a job configuration given its API representation - - Args: - resource (Dict): - An extract job configuration in the same representation as is - returned from the API. - - Returns: - google.cloud.bigquery.job._JobConfig: Configuration parsed from ``resource``. - """ - config = cls() - config._properties = copy.deepcopy(resource) - return config - - -class LoadJobConfig(_JobConfig): - """Configuration options for load jobs. - - All properties in this class are optional. Values which are :data:`None` -> - server defaults. Set properties on the constructed configuration by using - the property name as the name of a keyword argument. - """ - - def __init__(self, **kwargs): - super(LoadJobConfig, self).__init__("load", **kwargs) - - @property - def allow_jagged_rows(self): - """Optional[bool]: Allow missing trailing optional columns (CSV only). - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.allow_jagged_rows - """ - return self._get_sub_prop("allowJaggedRows") - - @allow_jagged_rows.setter - def allow_jagged_rows(self, value): - self._set_sub_prop("allowJaggedRows", value) - - @property - def allow_quoted_newlines(self): - """Optional[bool]: Allow quoted data containing newline characters (CSV only). 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.allow_quoted_newlines - """ - return self._get_sub_prop("allowQuotedNewlines") - - @allow_quoted_newlines.setter - def allow_quoted_newlines(self, value): - self._set_sub_prop("allowQuotedNewlines", value) - - @property - def autodetect(self): - """Optional[bool]: Automatically infer the schema from a sample of the data. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.autodetect - """ - return self._get_sub_prop("autodetect") - - @autodetect.setter - def autodetect(self, value): - self._set_sub_prop("autodetect", value) - - @property - def clustering_fields(self): - """Optional[List[str]]: Fields defining clustering for the table - - (Defaults to :data:`None`). - - Clustering fields are immutable after table creation. - - .. note:: - - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. - """ - prop = self._get_sub_prop("clustering") - if prop is not None: - return list(prop.get("fields", ())) - - @clustering_fields.setter - def clustering_fields(self, value): - """Optional[List[str]]: Fields defining clustering for the table - - (Defaults to :data:`None`). - """ - if value is not None: - self._set_sub_prop("clustering", {"fields": value}) - else: - self._del_sub_prop("clustering") - - @property - def create_disposition(self): - """Optional[google.cloud.bigquery.job.CreateDisposition]: Specifies behavior - for creating tables. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.create_disposition - """ - return self._get_sub_prop("createDisposition") - - @create_disposition.setter - def create_disposition(self, value): - self._set_sub_prop("createDisposition", value) - - @property - def destination_encryption_configuration(self): - """Optional[google.cloud.bigquery.encryption_configuration.EncryptionConfiguration]: Custom - encryption configuration for the destination table. - - Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` - if using default encryption. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_encryption_configuration - """ - prop = self._get_sub_prop("destinationEncryptionConfiguration") - if prop is not None: - prop = EncryptionConfiguration.from_api_repr(prop) - return prop - - @destination_encryption_configuration.setter - def destination_encryption_configuration(self, value): - api_repr = value - if value is not None: - api_repr = value.to_api_repr() - self._set_sub_prop("destinationEncryptionConfiguration", api_repr) - else: - self._del_sub_prop("destinationEncryptionConfiguration") - - @property - def destination_table_description(self): - """Optional[str]: Name given to destination table. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description - """ - prop = self._get_sub_prop("destinationTableProperties") - if prop is not None: - return prop["description"] - - @destination_table_description.setter - def destination_table_description(self, value): - keys = [self._job_type, "destinationTableProperties", "description"] - if value is not None: - _helpers._set_sub_prop(self._properties, keys, value) - else: - _helpers._del_sub_prop(self._properties, keys) - - @property - def destination_table_friendly_name(self): - """Optional[str]: Name given to destination table. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.friendly_name - """ - prop = self._get_sub_prop("destinationTableProperties") - if prop is not None: - return prop["friendlyName"] - - @destination_table_friendly_name.setter - def destination_table_friendly_name(self, value): - keys = [self._job_type, "destinationTableProperties", "friendlyName"] - if value is not None: - _helpers._set_sub_prop(self._properties, keys, value) - else: - _helpers._del_sub_prop(self._properties, keys) - - @property - def encoding(self): - """Optional[google.cloud.bigquery.job.Encoding]: The character encoding of the - data. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.encoding - """ - return self._get_sub_prop("encoding") - - @encoding.setter - def encoding(self, value): - self._set_sub_prop("encoding", value) - - @property - def field_delimiter(self): - """Optional[str]: The separator for fields in a CSV file. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.field_delimiter - """ - return self._get_sub_prop("fieldDelimiter") - - @field_delimiter.setter - def field_delimiter(self, value): - self._set_sub_prop("fieldDelimiter", value) - - @property - def hive_partitioning(self): - """Optional[:class:`~.external_config.HivePartitioningOptions`]: [Beta] When set, \ - it configures hive partitioning support. - - .. note:: - **Experimental**. This feature is experimental and might change or - have limited support. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.hive_partitioning_options - """ - prop = self._get_sub_prop("hivePartitioningOptions") - if prop is None: - return None - return HivePartitioningOptions.from_api_repr(prop) - - @hive_partitioning.setter - def hive_partitioning(self, value): - if value is not None: - if isinstance(value, HivePartitioningOptions): - value = value.to_api_repr() - else: - raise TypeError("Expected a HivePartitioningOptions instance or None.") - - self._set_sub_prop("hivePartitioningOptions", value) - - @property - def ignore_unknown_values(self): - """Optional[bool]: Ignore extra values not represented in the table schema. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.ignore_unknown_values - """ - return self._get_sub_prop("ignoreUnknownValues") - - @ignore_unknown_values.setter - def ignore_unknown_values(self, value): - self._set_sub_prop("ignoreUnknownValues", value) - - @property - def max_bad_records(self): - """Optional[int]: Number of invalid rows to ignore. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.max_bad_records - """ - return _helpers._int_or_none(self._get_sub_prop("maxBadRecords")) - - @max_bad_records.setter - def max_bad_records(self, value): - self._set_sub_prop("maxBadRecords", value) - - @property - def null_marker(self): - """Optional[str]: Represents a null value (CSV only). - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_marker - """ - return self._get_sub_prop("nullMarker") - - @null_marker.setter - def null_marker(self, value): - self._set_sub_prop("nullMarker", value) - - @property - def quote_character(self): - """Optional[str]: Character used to quote data sections (CSV only). 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.quote - """ - return self._get_sub_prop("quote") - - @quote_character.setter - def quote_character(self, value): - self._set_sub_prop("quote", value) - - @property - def range_partitioning(self): - """Optional[google.cloud.bigquery.table.RangePartitioning]: - Configures range-based partitioning for destination table. - - .. note:: - **Beta**. The integer range partitioning feature is in a - pre-release state and might change or have limited support. - - Only specify at most one of - :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or - :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. - - Raises: - ValueError: - If the value is not - :class:`~google.cloud.bigquery.table.RangePartitioning` or - :data:`None`. - """ - resource = self._get_sub_prop("rangePartitioning") - if resource is not None: - return RangePartitioning(_properties=resource) - - @range_partitioning.setter - def range_partitioning(self, value): - resource = value - if isinstance(value, RangePartitioning): - resource = value._properties - elif value is not None: - raise ValueError( - "Expected value to be RangePartitioning or None, got {}.".format(value) - ) - self._set_sub_prop("rangePartitioning", resource) - - @property - def schema(self): - """Optional[Sequence[Union[ \ - :class:`~google.cloud.bigquery.schema.SchemaField`, \ - Mapping[str, Any] \ - ]]]: Schema of the destination table. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.schema - """ - schema = _helpers._get_sub_prop(self._properties, ["load", "schema", "fields"]) - if schema is None: - return - return [SchemaField.from_api_repr(field) for field in schema] - - @schema.setter - def schema(self, value): - if value is None: - self._del_sub_prop("schema") - return - - value = _to_schema_fields(value) - - _helpers._set_sub_prop( - self._properties, - ["load", "schema", "fields"], - [field.to_api_repr() for field in value], - ) - - @property - def schema_update_options(self): - """Optional[List[google.cloud.bigquery.job.SchemaUpdateOption]]: Specifies - updates to the destination table schema to allow as a side effect of - the load job. - """ - return self._get_sub_prop("schemaUpdateOptions") - - @schema_update_options.setter - def schema_update_options(self, values): - self._set_sub_prop("schemaUpdateOptions", values) - - @property - def skip_leading_rows(self): - """Optional[int]: Number of rows to skip when reading data (CSV only). - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.skip_leading_rows - """ - return _helpers._int_or_none(self._get_sub_prop("skipLeadingRows")) - - @skip_leading_rows.setter - def skip_leading_rows(self, value): - self._set_sub_prop("skipLeadingRows", str(value)) - - @property - def source_format(self): - """Optional[google.cloud.bigquery.job.SourceFormat]: File format of the data. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_format - """ - return self._get_sub_prop("sourceFormat") - - @source_format.setter - def source_format(self, value): - self._set_sub_prop("sourceFormat", value) - - @property - def time_partitioning(self): - """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based - partitioning for the destination table. 
- - Only specify at most one of - :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or - :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. - """ - prop = self._get_sub_prop("timePartitioning") - if prop is not None: - prop = TimePartitioning.from_api_repr(prop) - return prop - - @time_partitioning.setter - def time_partitioning(self, value): - api_repr = value - if value is not None: - api_repr = value.to_api_repr() - self._set_sub_prop("timePartitioning", api_repr) - else: - self._del_sub_prop("timePartitioning") - - @property - def use_avro_logical_types(self): - """Optional[bool]: For loads of Avro data, governs whether Avro logical types are - converted to their corresponding BigQuery types (e.g. TIMESTAMP) rather than - raw types (e.g. INTEGER). - """ - return self._get_sub_prop("useAvroLogicalTypes") - - @use_avro_logical_types.setter - def use_avro_logical_types(self, value): - self._set_sub_prop("useAvroLogicalTypes", bool(value)) - - @property - def write_disposition(self): - """Optional[google.cloud.bigquery.job.WriteDisposition]: Action that occurs if - the destination table already exists. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.write_disposition - """ - return self._get_sub_prop("writeDisposition") - - @write_disposition.setter - def write_disposition(self, value): - self._set_sub_prop("writeDisposition", value) - - -class LoadJob(_AsyncJob): - """Asynchronous job for loading data into a table. - - Can load from Google Cloud Storage URIs or from a file. - - Args: - job_id (str): the job's ID - - source_uris (Optional[Sequence[str]]): - URIs of one or more data files to be loaded. See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_uris - for supported URI formats. Pass None for jobs that load from a file. 
- - destination (google.cloud.bigquery.table.TableReference): reference to table into which data is to be loaded. - - client (google.cloud.bigquery.client.Client): - A client which holds credentials and project configuration - for the dataset (which requires a project). - """ - - _JOB_TYPE = "load" - - def __init__(self, job_id, source_uris, destination, client, job_config=None): - super(LoadJob, self).__init__(job_id, client) - - if job_config is None: - job_config = LoadJobConfig() - - self.source_uris = source_uris - self._destination = destination - self._configuration = job_config - - @property - def destination(self): - """google.cloud.bigquery.table.TableReference: table where loaded rows are written - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_table - """ - return self._destination - - @property - def allow_jagged_rows(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.allow_jagged_rows`. - """ - return self._configuration.allow_jagged_rows - - @property - def allow_quoted_newlines(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.allow_quoted_newlines`. - """ - return self._configuration.allow_quoted_newlines - - @property - def autodetect(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.autodetect`. - """ - return self._configuration.autodetect - - @property - def create_disposition(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.create_disposition`. - """ - return self._configuration.create_disposition - - @property - def encoding(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.encoding`. - """ - return self._configuration.encoding - - @property - def field_delimiter(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.field_delimiter`. 
- """ - return self._configuration.field_delimiter - - @property - def ignore_unknown_values(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.ignore_unknown_values`. - """ - return self._configuration.ignore_unknown_values - - @property - def max_bad_records(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.max_bad_records`. - """ - return self._configuration.max_bad_records - - @property - def null_marker(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.null_marker`. - """ - return self._configuration.null_marker - - @property - def quote_character(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.quote_character`. - """ - return self._configuration.quote_character - - @property - def skip_leading_rows(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.skip_leading_rows`. - """ - return self._configuration.skip_leading_rows - - @property - def source_format(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.source_format`. - """ - return self._configuration.source_format - - @property - def write_disposition(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.write_disposition`. - """ - return self._configuration.write_disposition - - @property - def schema(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.schema`. - """ - return self._configuration.schema - - @property - def destination_encryption_configuration(self): - """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom - encryption configuration for the destination table. - - Custom encryption configuration (e.g., Cloud KMS keys) - or :data:`None` if using default encryption. - - See - :attr:`google.cloud.bigquery.job.LoadJobConfig.destination_encryption_configuration`. - """ - return self._configuration.destination_encryption_configuration - - @property - def destination_table_description(self): - """Optional[str] name given to destination table. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description - """ - return self._configuration.destination_table_description - - @property - def destination_table_friendly_name(self): - """Optional[str] name given to destination table. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.friendly_name - """ - return self._configuration.destination_table_friendly_name - - @property - def range_partitioning(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. - """ - return self._configuration.range_partitioning - - @property - def time_partitioning(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.time_partitioning`. - """ - return self._configuration.time_partitioning - - @property - def use_avro_logical_types(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.use_avro_logical_types`. - """ - return self._configuration.use_avro_logical_types - - @property - def clustering_fields(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.clustering_fields`. - """ - return self._configuration.clustering_fields - - @property - def schema_update_options(self): - """See - :attr:`google.cloud.bigquery.job.LoadJobConfig.schema_update_options`. - """ - return self._configuration.schema_update_options - - @property - def input_file_bytes(self): - """Count of bytes loaded from source files. - - Returns: - Optional[int]: the count (None until set from the server). - - Raises: - ValueError: for invalid value types. - """ - return _helpers._int_or_none( - _helpers._get_sub_prop( - self._properties, ["statistics", "load", "inputFileBytes"] - ) - ) - - @property - def input_files(self): - """Count of source files. - - Returns: - Optional[int]: the count (None until set from the server). 
- """ - return _helpers._int_or_none( - _helpers._get_sub_prop( - self._properties, ["statistics", "load", "inputFiles"] - ) - ) - - @property - def output_bytes(self): - """Count of bytes saved to destination table. - - Returns: - Optional[int]: the count (None until set from the server). - """ - return _helpers._int_or_none( - _helpers._get_sub_prop( - self._properties, ["statistics", "load", "outputBytes"] - ) - ) - - @property - def output_rows(self): - """Count of rows saved to destination table. - - Returns: - Optional[int]: the count (None until set from the server). - """ - return _helpers._int_or_none( - _helpers._get_sub_prop( - self._properties, ["statistics", "load", "outputRows"] - ) - ) - - def to_api_repr(self): - """Generate a resource for :meth:`_begin`.""" - configuration = self._configuration.to_api_repr() - if self.source_uris is not None: - _helpers._set_sub_prop( - configuration, ["load", "sourceUris"], self.source_uris - ) - _helpers._set_sub_prop( - configuration, ["load", "destinationTable"], self.destination.to_api_repr() - ) - - return { - "jobReference": self._properties["jobReference"], - "configuration": configuration, - } - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - - @classmethod - def from_api_repr(cls, resource, client): - """Factory: construct a job given its API representation - - .. note: - - This method assumes that the project found in the resource matches - the client's project. - - Args: - resource (Dict): dataset job representation returned from the API - - client (google.cloud.bigquery.client.Client): - Client which holds credentials and project - configuration for the dataset. - - Returns: - google.cloud.bigquery.job.LoadJob: Job parsed from ``resource``. 
- """ - config_resource = resource.get("configuration", {}) - config = LoadJobConfig.from_api_repr(config_resource) - # A load job requires a destination table. - dest_config = config_resource["load"]["destinationTable"] - ds_ref = DatasetReference(dest_config["projectId"], dest_config["datasetId"]) - destination = TableReference(ds_ref, dest_config["tableId"]) - # sourceUris will be absent if this is a file upload. - source_uris = _helpers._get_sub_prop(config_resource, ["load", "sourceUris"]) - job_ref = _JobReference._from_api_repr(resource["jobReference"]) - job = cls(job_ref, source_uris, destination, client, config) - job._set_properties(resource) - return job - - -class CopyJobConfig(_JobConfig): - """Configuration options for copy jobs. - - All properties in this class are optional. Values which are :data:`None` -> - server defaults. Set properties on the constructed configuration by using - the property name as the name of a keyword argument. - """ - - def __init__(self, **kwargs): - super(CopyJobConfig, self).__init__("copy", **kwargs) - - @property - def create_disposition(self): - """google.cloud.bigquery.job.CreateDisposition: Specifies behavior - for creating tables. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.create_disposition - """ - return self._get_sub_prop("createDisposition") - - @create_disposition.setter - def create_disposition(self, value): - self._set_sub_prop("createDisposition", value) - - @property - def write_disposition(self): - """google.cloud.bigquery.job.WriteDisposition: Action that occurs if - the destination table already exists. 
- - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.write_disposition - """ - return self._get_sub_prop("writeDisposition") - - @write_disposition.setter - def write_disposition(self, value): - self._set_sub_prop("writeDisposition", value) - - @property - def destination_encryption_configuration(self): - """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom - encryption configuration for the destination table. - - Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` - if using default encryption. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.destination_encryption_configuration - """ - prop = self._get_sub_prop("destinationEncryptionConfiguration") - if prop is not None: - prop = EncryptionConfiguration.from_api_repr(prop) - return prop - - @destination_encryption_configuration.setter - def destination_encryption_configuration(self, value): - api_repr = value - if value is not None: - api_repr = value.to_api_repr() - self._set_sub_prop("destinationEncryptionConfiguration", api_repr) - - -class CopyJob(_AsyncJob): - """Asynchronous job: copy data into a table from other tables. - - Args: - job_id (str): the job's ID, within the project belonging to ``client``. - - sources (List[google.cloud.bigquery.table.TableReference]): Table from which data is to be loaded. - - destination (google.cloud.bigquery.table.TableReference): Table into which data is to be loaded. - - client (google.cloud.bigquery.client.Client): - A client which holds credentials and project configuration - for the dataset (which requires a project). - - job_config (Optional[google.cloud.bigquery.job.CopyJobConfig]): - Extra configuration options for the copy job. 
- """ - - _JOB_TYPE = "copy" - - def __init__(self, job_id, sources, destination, client, job_config=None): - super(CopyJob, self).__init__(job_id, client) - - if job_config is None: - job_config = CopyJobConfig() - - self.destination = destination - self.sources = sources - self._configuration = job_config - - @property - def create_disposition(self): - """See - :attr:`google.cloud.bigquery.job.CopyJobConfig.create_disposition`. - """ - return self._configuration.create_disposition - - @property - def write_disposition(self): - """See - :attr:`google.cloud.bigquery.job.CopyJobConfig.write_disposition`. - """ - return self._configuration.write_disposition - - @property - def destination_encryption_configuration(self): - """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom - encryption configuration for the destination table. - - Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` - if using default encryption. - - See - :attr:`google.cloud.bigquery.job.CopyJobConfig.destination_encryption_configuration`. 
- """ - return self._configuration.destination_encryption_configuration - - def to_api_repr(self): - """Generate a resource for :meth:`_begin`.""" - - source_refs = [ - { - "projectId": table.project, - "datasetId": table.dataset_id, - "tableId": table.table_id, - } - for table in self.sources - ] - - configuration = self._configuration.to_api_repr() - _helpers._set_sub_prop(configuration, ["copy", "sourceTables"], source_refs) - _helpers._set_sub_prop( - configuration, - ["copy", "destinationTable"], - { - "projectId": self.destination.project, - "datasetId": self.destination.dataset_id, - "tableId": self.destination.table_id, - }, - ) - - return { - "jobReference": self._properties["jobReference"], - "configuration": configuration, - } - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - - @classmethod - def from_api_repr(cls, resource, client): - """Factory: construct a job given its API representation - - .. note: - - This method assumes that the project found in the resource matches - the client's project. - - Args: - resource (Dict): dataset job representation returned from the API - - client (google.cloud.bigquery.client.Client): - Client which holds credentials and project - configuration for the dataset. - - Returns: - google.cloud.bigquery.job.CopyJob: Job parsed from ``resource``. - """ - job_id, config_resource = cls._get_resource_config(resource) - config = CopyJobConfig.from_api_repr(config_resource) - # Copy required fields to the job. 
- copy_resource = config_resource["copy"] - destination = TableReference.from_api_repr(copy_resource["destinationTable"]) - sources = [] - source_configs = copy_resource.get("sourceTables") - if source_configs is None: - single = copy_resource.get("sourceTable") - if single is None: - raise KeyError("Resource missing 'sourceTables' / 'sourceTable'") - source_configs = [single] - for source_config in source_configs: - table_ref = TableReference.from_api_repr(source_config) - sources.append(table_ref) - job = cls(job_id, sources, destination, client=client, job_config=config) - job._set_properties(resource) - return job - - -class ExtractJobConfig(_JobConfig): - """Configuration options for extract jobs. - - All properties in this class are optional. Values which are :data:`None` -> - server defaults. Set properties on the constructed configuration by using - the property name as the name of a keyword argument. - """ - - def __init__(self, **kwargs): - super(ExtractJobConfig, self).__init__("extract", **kwargs) - - @property - def compression(self): - """google.cloud.bigquery.job.Compression: Compression type to use for - exported files. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.compression - """ - return self._get_sub_prop("compression") - - @compression.setter - def compression(self, value): - self._set_sub_prop("compression", value) - - @property - def destination_format(self): - """google.cloud.bigquery.job.DestinationFormat: Exported file format. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.destination_format - """ - return self._get_sub_prop("destinationFormat") - - @destination_format.setter - def destination_format(self, value): - self._set_sub_prop("destinationFormat", value) - - @property - def field_delimiter(self): - """str: Delimiter to use between fields in the exported data. 
- - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.field_delimiter - """ - return self._get_sub_prop("fieldDelimiter") - - @field_delimiter.setter - def field_delimiter(self, value): - self._set_sub_prop("fieldDelimiter", value) - - @property - def print_header(self): - """bool: Print a header row in the exported data. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.print_header - """ - return self._get_sub_prop("printHeader") - - @print_header.setter - def print_header(self, value): - self._set_sub_prop("printHeader", value) - - @property - def use_avro_logical_types(self): - """bool: For loads of Avro data, governs whether Avro logical types are - converted to their corresponding BigQuery types (e.g. TIMESTAMP) rather than - raw types (e.g. INTEGER). - """ - return self._get_sub_prop("useAvroLogicalTypes") - - @use_avro_logical_types.setter - def use_avro_logical_types(self, value): - self._set_sub_prop("useAvroLogicalTypes", bool(value)) - - -class ExtractJob(_AsyncJob): - """Asynchronous job: extract data from a table into Cloud Storage. - - Args: - job_id (str): the job's ID. - - source (Union[ \ - google.cloud.bigquery.table.TableReference, \ - google.cloud.bigquery.model.ModelReference \ - ]): - Table or Model from which data is to be loaded or extracted. - - destination_uris (List[str]): - URIs describing where the extracted data will be written in Cloud - Storage, using the format ``gs:///``. - - client (google.cloud.bigquery.client.Client): - A client which holds credentials and project configuration. - - job_config (Optional[google.cloud.bigquery.job.ExtractJobConfig]): - Extra configuration options for the extract job. 
- """ - - _JOB_TYPE = "extract" - - def __init__(self, job_id, source, destination_uris, client, job_config=None): - super(ExtractJob, self).__init__(job_id, client) - - if job_config is None: - job_config = ExtractJobConfig() - - self.source = source - self.destination_uris = destination_uris - self._configuration = job_config - - @property - def compression(self): - """See - :attr:`google.cloud.bigquery.job.ExtractJobConfig.compression`. - """ - return self._configuration.compression - - @property - def destination_format(self): - """See - :attr:`google.cloud.bigquery.job.ExtractJobConfig.destination_format`. - """ - return self._configuration.destination_format - - @property - def field_delimiter(self): - """See - :attr:`google.cloud.bigquery.job.ExtractJobConfig.field_delimiter`. - """ - return self._configuration.field_delimiter - - @property - def print_header(self): - """See - :attr:`google.cloud.bigquery.job.ExtractJobConfig.print_header`. - """ - return self._configuration.print_header - - @property - def destination_uri_file_counts(self): - """Return file counts from job statistics, if present. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics4.FIELDS.destination_uri_file_counts - - Returns: - List[int]: - A list of integer counts, each representing the number of files - per destination URI or URI pattern specified in the extract - configuration. These values will be in the same order as the URIs - specified in the 'destinationUris' field. Returns None if job is - not yet complete. 
- """ - counts = self._job_statistics().get("destinationUriFileCounts") - if counts is not None: - return [int(count) for count in counts] - return None - - def to_api_repr(self): - """Generate a resource for :meth:`_begin`.""" - - configuration = self._configuration.to_api_repr() - source_ref = { - "projectId": self.source.project, - "datasetId": self.source.dataset_id, - } - - source = "sourceTable" - if isinstance(self.source, TableReference): - source_ref["tableId"] = self.source.table_id - else: - source_ref["modelId"] = self.source.model_id - source = "sourceModel" - - _helpers._set_sub_prop(configuration, ["extract", source], source_ref) - _helpers._set_sub_prop( - configuration, ["extract", "destinationUris"], self.destination_uris - ) - - return { - "jobReference": self._properties["jobReference"], - "configuration": configuration, - } - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - - @classmethod - def from_api_repr(cls, resource, client): - """Factory: construct a job given its API representation - - .. note: - - This method assumes that the project found in the resource matches - the client's project. - - Args: - resource (Dict): dataset job representation returned from the API - - client (google.cloud.bigquery.client.Client): - Client which holds credentials and project - configuration for the dataset. - - Returns: - google.cloud.bigquery.job.ExtractJob: Job parsed from ``resource``. 
- """ - job_id, config_resource = cls._get_resource_config(resource) - config = ExtractJobConfig.from_api_repr(config_resource) - source_config = _helpers._get_sub_prop( - config_resource, ["extract", "sourceTable"] - ) - if source_config: - dataset = DatasetReference( - source_config["projectId"], source_config["datasetId"] - ) - source = dataset.table(source_config["tableId"]) - else: - source_config = _helpers._get_sub_prop( - config_resource, ["extract", "sourceModel"] - ) - dataset = DatasetReference( - source_config["projectId"], source_config["datasetId"] - ) - source = dataset.model(source_config["modelId"]) - - destination_uris = _helpers._get_sub_prop( - config_resource, ["extract", "destinationUris"] - ) - - job = cls(job_id, source, destination_uris, client=client, job_config=config) - job._set_properties(resource) - return job - - -def _from_api_repr_query_parameters(resource): - return [_query_param_from_api_repr(mapping) for mapping in resource] - - -def _to_api_repr_query_parameters(value): - return [query_parameter.to_api_repr() for query_parameter in value] - - -def _from_api_repr_udf_resources(resource): - udf_resources = [] - for udf_mapping in resource: - for udf_type, udf_value in udf_mapping.items(): - udf_resources.append(UDFResource(udf_type, udf_value)) - return udf_resources - - -def _to_api_repr_udf_resources(value): - return [{udf_resource.udf_type: udf_resource.value} for udf_resource in value] - - -def _from_api_repr_table_defs(resource): - return {k: ExternalConfig.from_api_repr(v) for k, v in resource.items()} - - -def _to_api_repr_table_defs(value): - return {k: ExternalConfig.to_api_repr(v) for k, v in value.items()} - - -class QueryJobConfig(_JobConfig): - """Configuration options for query jobs. - - All properties in this class are optional. Values which are :data:`None` -> - server defaults. Set properties on the constructed configuration by using - the property name as the name of a keyword argument. 
- """ - - def __init__(self, **kwargs): - super(QueryJobConfig, self).__init__("query", **kwargs) - - @property - def destination_encryption_configuration(self): - """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom - encryption configuration for the destination table. - - Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` - if using default encryption. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.destination_encryption_configuration - """ - prop = self._get_sub_prop("destinationEncryptionConfiguration") - if prop is not None: - prop = EncryptionConfiguration.from_api_repr(prop) - return prop - - @destination_encryption_configuration.setter - def destination_encryption_configuration(self, value): - api_repr = value - if value is not None: - api_repr = value.to_api_repr() - self._set_sub_prop("destinationEncryptionConfiguration", api_repr) - - @property - def allow_large_results(self): - """bool: Allow large query results tables (legacy SQL, only) - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.allow_large_results - """ - return self._get_sub_prop("allowLargeResults") - - @allow_large_results.setter - def allow_large_results(self, value): - self._set_sub_prop("allowLargeResults", value) - - @property - def create_disposition(self): - """google.cloud.bigquery.job.CreateDisposition: Specifies behavior - for creating tables. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.create_disposition - """ - return self._get_sub_prop("createDisposition") - - @create_disposition.setter - def create_disposition(self, value): - self._set_sub_prop("createDisposition", value) - - @property - def default_dataset(self): - """google.cloud.bigquery.dataset.DatasetReference: the default dataset - to use for unqualified table names in the query or :data:`None` if not - set. 
- - The ``default_dataset`` setter accepts: - - - a :class:`~google.cloud.bigquery.dataset.Dataset`, or - - a :class:`~google.cloud.bigquery.dataset.DatasetReference`, or - - a :class:`str` of the fully-qualified dataset ID in standard SQL - format. The value must included a project ID and dataset ID - separated by ``.``. For example: ``your-project.your_dataset``. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.default_dataset - """ - prop = self._get_sub_prop("defaultDataset") - if prop is not None: - prop = DatasetReference.from_api_repr(prop) - return prop - - @default_dataset.setter - def default_dataset(self, value): - if value is None: - self._set_sub_prop("defaultDataset", None) - return - - if isinstance(value, six.string_types): - value = DatasetReference.from_string(value) - - if isinstance(value, (Dataset, DatasetListItem)): - value = value.reference - - resource = value.to_api_repr() - self._set_sub_prop("defaultDataset", resource) - - @property - def destination(self): - """google.cloud.bigquery.table.TableReference: table where results are - written or :data:`None` if not set. - - The ``destination`` setter accepts: - - - a :class:`~google.cloud.bigquery.table.Table`, or - - a :class:`~google.cloud.bigquery.table.TableReference`, or - - a :class:`str` of the fully-qualified table ID in standard SQL - format. The value must included a project ID, dataset ID, and table - ID, each separated by ``.``. For example: - ``your-project.your_dataset.your_table``. 
- - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.destination_table - """ - prop = self._get_sub_prop("destinationTable") - if prop is not None: - prop = TableReference.from_api_repr(prop) - return prop - - @destination.setter - def destination(self, value): - if value is None: - self._set_sub_prop("destinationTable", None) - return - - value = _table_arg_to_table_ref(value) - resource = value.to_api_repr() - self._set_sub_prop("destinationTable", resource) - - @property - def dry_run(self): - """bool: :data:`True` if this query should be a dry run to estimate - costs. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfiguration.FIELDS.dry_run - """ - return self._properties.get("dryRun") - - @dry_run.setter - def dry_run(self, value): - self._properties["dryRun"] = value - - @property - def flatten_results(self): - """bool: Flatten nested/repeated fields in results. (Legacy SQL only) - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.flatten_results - """ - return self._get_sub_prop("flattenResults") - - @flatten_results.setter - def flatten_results(self, value): - self._set_sub_prop("flattenResults", value) - - @property - def maximum_billing_tier(self): - """int: Deprecated. Changes the billing tier to allow high-compute - queries. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.maximum_billing_tier - """ - return self._get_sub_prop("maximumBillingTier") - - @maximum_billing_tier.setter - def maximum_billing_tier(self, value): - self._set_sub_prop("maximumBillingTier", value) - - @property - def maximum_bytes_billed(self): - """int: Maximum bytes to be billed for this job or :data:`None` if not set. 
- - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.maximum_bytes_billed - """ - return _helpers._int_or_none(self._get_sub_prop("maximumBytesBilled")) - - @maximum_bytes_billed.setter - def maximum_bytes_billed(self, value): - self._set_sub_prop("maximumBytesBilled", str(value)) - - @property - def priority(self): - """google.cloud.bigquery.job.QueryPriority: Priority of the query. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.priority - """ - return self._get_sub_prop("priority") - - @priority.setter - def priority(self, value): - self._set_sub_prop("priority", value) - - @property - def query_parameters(self): - """List[Union[google.cloud.bigquery.query.ArrayQueryParameter, \ - google.cloud.bigquery.query.ScalarQueryParameter, \ - google.cloud.bigquery.query.StructQueryParameter]]: list of parameters - for parameterized query (empty by default) - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.query_parameters - """ - prop = self._get_sub_prop("queryParameters", default=[]) - return _from_api_repr_query_parameters(prop) - - @query_parameters.setter - def query_parameters(self, values): - self._set_sub_prop("queryParameters", _to_api_repr_query_parameters(values)) - - @property - def range_partitioning(self): - """Optional[google.cloud.bigquery.table.RangePartitioning]: - Configures range-based partitioning for destination table. - - .. note:: - **Beta**. The integer range partitioning feature is in a - pre-release state and might change or have limited support. - - Only specify at most one of - :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or - :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. - - Raises: - ValueError: - If the value is not - :class:`~google.cloud.bigquery.table.RangePartitioning` or - :data:`None`. 
- """ - resource = self._get_sub_prop("rangePartitioning") - if resource is not None: - return RangePartitioning(_properties=resource) - - @range_partitioning.setter - def range_partitioning(self, value): - resource = value - if isinstance(value, RangePartitioning): - resource = value._properties - elif value is not None: - raise ValueError( - "Expected value to be RangePartitioning or None, got {}.".format(value) - ) - self._set_sub_prop("rangePartitioning", resource) - - @property - def udf_resources(self): - """List[google.cloud.bigquery.query.UDFResource]: user - defined function resources (empty by default) - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.user_defined_function_resources - """ - prop = self._get_sub_prop("userDefinedFunctionResources", default=[]) - return _from_api_repr_udf_resources(prop) - - @udf_resources.setter - def udf_resources(self, values): - self._set_sub_prop( - "userDefinedFunctionResources", _to_api_repr_udf_resources(values) - ) - - @property - def use_legacy_sql(self): - """bool: Use legacy SQL syntax. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.use_legacy_sql - """ - return self._get_sub_prop("useLegacySql") - - @use_legacy_sql.setter - def use_legacy_sql(self, value): - self._set_sub_prop("useLegacySql", value) - - @property - def use_query_cache(self): - """bool: Look for the query result in the cache. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.use_query_cache - """ - return self._get_sub_prop("useQueryCache") - - @use_query_cache.setter - def use_query_cache(self, value): - self._set_sub_prop("useQueryCache", value) - - @property - def write_disposition(self): - """google.cloud.bigquery.job.WriteDisposition: Action that occurs if - the destination table already exists. 
- - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.write_disposition - """ - return self._get_sub_prop("writeDisposition") - - @write_disposition.setter - def write_disposition(self, value): - self._set_sub_prop("writeDisposition", value) - - @property - def table_definitions(self): - """Dict[str, google.cloud.bigquery.external_config.ExternalConfig]: - Definitions for external tables or :data:`None` if not set. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.external_table_definitions - """ - prop = self._get_sub_prop("tableDefinitions") - if prop is not None: - prop = _from_api_repr_table_defs(prop) - return prop - - @table_definitions.setter - def table_definitions(self, values): - self._set_sub_prop("tableDefinitions", _to_api_repr_table_defs(values)) - - @property - def time_partitioning(self): - """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies - time-based partitioning for the destination table. - - Only specify at most one of - :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or - :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. - - Raises: - ValueError: - If the value is not - :class:`~google.cloud.bigquery.table.TimePartitioning` or - :data:`None`. - """ - prop = self._get_sub_prop("timePartitioning") - if prop is not None: - prop = TimePartitioning.from_api_repr(prop) - return prop - - @time_partitioning.setter - def time_partitioning(self, value): - api_repr = value - if value is not None: - api_repr = value.to_api_repr() - self._set_sub_prop("timePartitioning", api_repr) - - @property - def clustering_fields(self): - """Optional[List[str]]: Fields defining clustering for the table - - (Defaults to :data:`None`). - - Clustering fields are immutable after table creation. - - .. note:: - - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. 
- """ - prop = self._get_sub_prop("clustering") - if prop is not None: - return list(prop.get("fields", ())) - - @clustering_fields.setter - def clustering_fields(self, value): - """Optional[List[str]]: Fields defining clustering for the table - - (Defaults to :data:`None`). - """ - if value is not None: - self._set_sub_prop("clustering", {"fields": value}) - else: - self._del_sub_prop("clustering") - - @property - def schema_update_options(self): - """List[google.cloud.bigquery.job.SchemaUpdateOption]: Specifies - updates to the destination table schema to allow as a side effect of - the query job. - """ - return self._get_sub_prop("schemaUpdateOptions") - - @schema_update_options.setter - def schema_update_options(self, values): - self._set_sub_prop("schemaUpdateOptions", values) - - def to_api_repr(self): - """Build an API representation of the query job config. - - Returns: - Dict: A dictionary in the format used by the BigQuery API. - """ - resource = copy.deepcopy(self._properties) - - # Query parameters have an addition property associated with them - # to indicate if the query is using named or positional parameters. - query_parameters = resource["query"].get("queryParameters") - if query_parameters: - if query_parameters[0].get("name") is None: - resource["query"]["parameterMode"] = "POSITIONAL" - else: - resource["query"]["parameterMode"] = "NAMED" - - return resource - - -class QueryJob(_AsyncJob): - """Asynchronous job: query tables. - - Args: - job_id (str): the job's ID, within the project belonging to ``client``. - - query (str): SQL query string. - - client (google.cloud.bigquery.client.Client): - A client which holds credentials and project configuration - for the dataset (which requires a project). - - job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]): - Extra configuration options for the query job. 
- """ - - _JOB_TYPE = "query" - _UDF_KEY = "userDefinedFunctionResources" - - def __init__(self, job_id, query, client, job_config=None): - super(QueryJob, self).__init__(job_id, client) - - if job_config is None: - job_config = QueryJobConfig() - if job_config.use_legacy_sql is None: - job_config.use_legacy_sql = False - - _helpers._set_sub_prop( - self._properties, ["configuration", "query", "query"], query - ) - - self._configuration = job_config - self._query_results = None - self._done_timeout = None - self._transport_timeout = None - - @property - def allow_large_results(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.allow_large_results`. - """ - return self._configuration.allow_large_results - - @property - def create_disposition(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.create_disposition`. - """ - return self._configuration.create_disposition - - @property - def default_dataset(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.default_dataset`. - """ - return self._configuration.default_dataset - - @property - def destination(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.destination`. - """ - return self._configuration.destination - - @property - def destination_encryption_configuration(self): - """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom - encryption configuration for the destination table. - - Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` - if using default encryption. - - See - :attr:`google.cloud.bigquery.job.QueryJobConfig.destination_encryption_configuration`. - """ - return self._configuration.destination_encryption_configuration - - @property - def dry_run(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.dry_run`. - """ - return self._configuration.dry_run - - @property - def flatten_results(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.flatten_results`. 
- """ - return self._configuration.flatten_results - - @property - def priority(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.priority`. - """ - return self._configuration.priority - - @property - def query(self): - """str: The query text used in this query job. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.query - """ - return _helpers._get_sub_prop( - self._properties, ["configuration", "query", "query"] - ) - - @property - def query_parameters(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.query_parameters`. - """ - return self._configuration.query_parameters - - @property - def udf_resources(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.udf_resources`. - """ - return self._configuration.udf_resources - - @property - def use_legacy_sql(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.use_legacy_sql`. - """ - return self._configuration.use_legacy_sql - - @property - def use_query_cache(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.use_query_cache`. - """ - return self._configuration.use_query_cache - - @property - def write_disposition(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.write_disposition`. - """ - return self._configuration.write_disposition - - @property - def maximum_billing_tier(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.maximum_billing_tier`. - """ - return self._configuration.maximum_billing_tier - - @property - def maximum_bytes_billed(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.maximum_bytes_billed`. - """ - return self._configuration.maximum_bytes_billed - - @property - def range_partitioning(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.range_partitioning`. 
- """ - return self._configuration.range_partitioning - - @property - def table_definitions(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.table_definitions`. - """ - return self._configuration.table_definitions - - @property - def time_partitioning(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.time_partitioning`. - """ - return self._configuration.time_partitioning - - @property - def clustering_fields(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.clustering_fields`. - """ - return self._configuration.clustering_fields - - @property - def schema_update_options(self): - """See - :attr:`google.cloud.bigquery.job.QueryJobConfig.schema_update_options`. - """ - return self._configuration.schema_update_options - - def to_api_repr(self): - """Generate a resource for :meth:`_begin`.""" - configuration = self._configuration.to_api_repr() - - resource = { - "jobReference": self._properties["jobReference"], - "configuration": configuration, - } - configuration["query"]["query"] = self.query - - return resource - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - - @classmethod - def from_api_repr(cls, resource, client): - """Factory: construct a job given its API representation - - Args: - resource (Dict): dataset job representation returned from the API - - client (google.cloud.bigquery.client.Client): - Client which holds credentials and project - configuration for the dataset. - - Returns: - google.cloud.bigquery.job.QueryJob: Job parsed from ``resource``. - """ - job_id, config = cls._get_resource_config(resource) - query = _helpers._get_sub_prop(config, ["query", "query"]) - job = cls(job_id, query, client=client) - job._set_properties(resource) - return job - - @property - def query_plan(self): - """Return query plan from job statistics, if present. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.query_plan - - Returns: - List[QueryPlanEntry]: - mappings describing the query plan, or an empty list - if the query has not yet completed. - """ - plan_entries = self._job_statistics().get("queryPlan", ()) - return [QueryPlanEntry.from_api_repr(entry) for entry in plan_entries] - - @property - def timeline(self): - """List(TimelineEntry): Return the query execution timeline - from job statistics. - """ - raw = self._job_statistics().get("timeline", ()) - return [TimelineEntry.from_api_repr(entry) for entry in raw] - - @property - def total_bytes_processed(self): - """Return total bytes processed from job statistics, if present. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.total_bytes_processed - - Returns: - Optional[int]: - Total bytes processed by the job, or None if job is not - yet complete. - """ - result = self._job_statistics().get("totalBytesProcessed") - if result is not None: - result = int(result) - return result - - @property - def total_bytes_billed(self): - """Return total bytes billed from job statistics, if present. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.total_bytes_billed - - Returns: - Optional[int]: - Total bytes processed by the job, or None if job is not - yet complete. - """ - result = self._job_statistics().get("totalBytesBilled") - if result is not None: - result = int(result) - return result - - @property - def billing_tier(self): - """Return billing tier from job statistics, if present. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.billing_tier - - Returns: - Optional[int]: - Billing tier used by the job, or None if job is not - yet complete. 
- """ - return self._job_statistics().get("billingTier") - - @property - def cache_hit(self): - """Return whether or not query results were served from cache. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.cache_hit - - Returns: - Optional[bool]: - whether the query results were returned from cache, or None - if job is not yet complete. - """ - return self._job_statistics().get("cacheHit") - - @property - def ddl_operation_performed(self): - """Optional[str]: Return the DDL operation performed. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.ddl_operation_performed - - """ - return self._job_statistics().get("ddlOperationPerformed") - - @property - def ddl_target_routine(self): - """Optional[google.cloud.bigquery.routine.RoutineReference]: Return the DDL target routine, present - for CREATE/DROP FUNCTION/PROCEDURE queries. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.ddl_target_routine - """ - prop = self._job_statistics().get("ddlTargetRoutine") - if prop is not None: - prop = RoutineReference.from_api_repr(prop) - return prop - - @property - def ddl_target_table(self): - """Optional[google.cloud.bigquery.table.TableReference]: Return the DDL target table, present - for CREATE/DROP TABLE/VIEW queries. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.ddl_target_table - """ - prop = self._job_statistics().get("ddlTargetTable") - if prop is not None: - prop = TableReference.from_api_repr(prop) - return prop - - @property - def num_dml_affected_rows(self): - """Return the number of DML rows affected by the job. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.num_dml_affected_rows - - Returns: - Optional[int]: - number of DML rows affected by the job, or None if job is not - yet complete. 
- """ - result = self._job_statistics().get("numDmlAffectedRows") - if result is not None: - result = int(result) - return result - - @property - def slot_millis(self): - """Union[int, None]: Slot-milliseconds used by this query job.""" - return _helpers._int_or_none(self._job_statistics().get("totalSlotMs")) - - @property - def statement_type(self): - """Return statement type from job statistics, if present. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type - - Returns: - Optional[str]: - type of statement used by the job, or None if job is not - yet complete. - """ - return self._job_statistics().get("statementType") - - @property - def referenced_tables(self): - """Return referenced tables from job statistics, if present. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.referenced_tables - - Returns: - List[Dict]: - mappings describing the query plan, or an empty list - if the query has not yet completed. - """ - tables = [] - datasets_by_project_name = {} - - for table in self._job_statistics().get("referencedTables", ()): - - t_project = table["projectId"] - - ds_id = table["datasetId"] - t_dataset = datasets_by_project_name.get((t_project, ds_id)) - if t_dataset is None: - t_dataset = DatasetReference(t_project, ds_id) - datasets_by_project_name[(t_project, ds_id)] = t_dataset - - t_name = table["tableId"] - tables.append(t_dataset.table(t_name)) - - return tables - - @property - def undeclared_query_parameters(self): - """Return undeclared query parameters from job statistics, if present. 
- - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.undeclared_query_parameters - - Returns: - List[Union[ \ - google.cloud.bigquery.query.ArrayQueryParameter, \ - google.cloud.bigquery.query.ScalarQueryParameter, \ - google.cloud.bigquery.query.StructQueryParameter \ - ]]: - Undeclared parameters, or an empty list if the query has - not yet completed. - """ - parameters = [] - undeclared = self._job_statistics().get("undeclaredQueryParameters", ()) - - for parameter in undeclared: - p_type = parameter["parameterType"] - - if "arrayType" in p_type: - klass = ArrayQueryParameter - elif "structTypes" in p_type: - klass = StructQueryParameter - else: - klass = ScalarQueryParameter - - parameters.append(klass.from_api_repr(parameter)) - - return parameters - - @property - def estimated_bytes_processed(self): - """Return the estimated number of bytes processed by the query. - - See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.estimated_bytes_processed - - Returns: - Optional[int]: - number of DML rows affected by the job, or None if job is not - yet complete. - """ - result = self._job_statistics().get("estimatedBytesProcessed") - if result is not None: - result = int(result) - return result - - def done(self, retry=DEFAULT_RETRY, timeout=None): - """Refresh the job and checks if it is complete. - - Args: - retry (Optional[google.api_core.retry.Retry]): - How to retry the call that retrieves query results. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - - Returns: - bool: True if the job is complete, False otherwise. - """ - # Since the API to getQueryResults can hang up to the timeout value - # (default of 10 seconds), set the timeout parameter to ensure that - # the timeout from the futures API is respected. 
See: - # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4135 - timeout_ms = None - if self._done_timeout is not None: - # Subtract a buffer for context switching, network latency, etc. - api_timeout = self._done_timeout - _TIMEOUT_BUFFER_SECS - api_timeout = max(min(api_timeout, 10), 0) - self._done_timeout -= api_timeout - self._done_timeout = max(0, self._done_timeout) - timeout_ms = int(api_timeout * 1000) - - # If an explicit timeout is not given, fall back to the transport timeout - # stored in _blocking_poll() in the process of polling for job completion. - transport_timeout = timeout if timeout is not None else self._transport_timeout - - # Do not refresh if the state is already done, as the job will not - # change once complete. - if self.state != _DONE_STATE: - self._query_results = self._client._get_query_results( - self.job_id, - retry, - project=self.project, - timeout_ms=timeout_ms, - location=self.location, - timeout=transport_timeout, - ) - - # Only reload the job once we know the query is complete. - # This will ensure that fields such as the destination table are - # correctly populated. - if self._query_results.complete: - self.reload(retry=retry, timeout=transport_timeout) - - return self.state == _DONE_STATE - - def _blocking_poll(self, timeout=None): - self._done_timeout = timeout - self._transport_timeout = timeout - super(QueryJob, self)._blocking_poll(timeout=timeout) - - @staticmethod - def _format_for_exception(query, job_id): - """Format a query for the output in exception message. - - Args: - query (str): The SQL query to format. - job_id (str): The ID of the job that ran the query. - - Returns: - str: A formatted query text. 
- """ - template = "\n\n(job ID: {job_id})\n\n{header}\n\n{ruler}\n{body}\n{ruler}" - - lines = query.splitlines() - max_line_len = max(len(line) for line in lines) - - header = "-----Query Job SQL Follows-----" - header = "{:^{total_width}}".format(header, total_width=max_line_len + 5) - - # Print out a "ruler" above and below the SQL so we can judge columns. - # Left pad for the line numbers (4 digits plus ":"). - ruler = " |" + " . |" * (max_line_len // 10) - - # Put line numbers next to the SQL. - body = "\n".join( - "{:4}:{}".format(n, line) for n, line in enumerate(lines, start=1) - ) - - return template.format(job_id=job_id, header=header, ruler=ruler, body=body) - - def _begin(self, client=None, retry=DEFAULT_RETRY, timeout=None): - """API call: begin the job via a POST request - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert - - Args: - client (Optional[google.cloud.bigquery.client.Client]): - The client to use. If not passed, falls back to the ``client`` - associated with the job object or``NoneType``. - retry (Optional[google.api_core.retry.Retry]): - How to retry the RPC. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - - Raises: - ValueError: If the job has already begun. - """ - - try: - super(QueryJob, self)._begin(client=client, retry=retry, timeout=timeout) - except exceptions.GoogleCloudError as exc: - exc.message += self._format_for_exception(self.query, self.job_id) - exc.query_job = self - raise - - def result( - self, - page_size=None, - max_results=None, - retry=DEFAULT_RETRY, - timeout=None, - start_index=None, - ): - """Start the job and wait for it to complete and get the result. - - Args: - page_size (Optional[int]): - The maximum number of rows in each page of results from this - request. Non-positive values are ignored. - max_results (Optional[int]): - The maximum total number of rows from this request. 
- retry (Optional[google.api_core.retry.Retry]): - How to retry the call that retrieves rows. - timeout (Optional[float]): - The number of seconds to wait for the underlying HTTP transport - before using ``retry``. - If multiple requests are made under the hood, ``timeout`` - applies to each individual request. - start_index (Optional[int]): - The zero-based index of the starting row to read. - - Returns: - google.cloud.bigquery.table.RowIterator: - Iterator of row data - :class:`~google.cloud.bigquery.table.Row`-s. During each - page, the iterator will have the ``total_rows`` attribute - set, which counts the total number of rows **in the result - set** (this is distinct from the total number of rows in the - current page: ``iterator.page.num_items``). - - If the query is a special query that produces no results, e.g. - a DDL query, an ``_EmptyRowIterator`` instance is returned. - - Raises: - google.cloud.exceptions.GoogleCloudError: - If the job failed. - concurrent.futures.TimeoutError: - If the job did not complete in the given timeout. - """ - try: - super(QueryJob, self).result(retry=retry, timeout=timeout) - - # Return an iterator instead of returning the job. - if not self._query_results: - self._query_results = self._client._get_query_results( - self.job_id, - retry, - project=self.project, - location=self.location, - timeout=timeout, - ) - except exceptions.GoogleCloudError as exc: - exc.message += self._format_for_exception(self.query, self.job_id) - exc.query_job = self - raise - except requests.exceptions.Timeout as exc: - six.raise_from(concurrent.futures.TimeoutError, exc) - - # If the query job is complete but there are no query results, this was - # special job, such as a DDL query. Return an empty result set to - # indicate success and avoid calling tabledata.list on a table which - # can't be read (such as a view table). 
- if self._query_results.total_rows is None: - return _EmptyRowIterator() - - schema = self._query_results.schema - dest_table_ref = self.destination - dest_table = Table(dest_table_ref, schema=schema) - dest_table._properties["numRows"] = self._query_results.total_rows - rows = self._client.list_rows( - dest_table, - page_size=page_size, - max_results=max_results, - start_index=start_index, - retry=retry, - timeout=timeout, - ) - rows._preserve_order = _contains_order_by(self.query) - return rows - - # If changing the signature of this method, make sure to apply the same - # changes to table.RowIterator.to_arrow() - def to_arrow( - self, - progress_bar_type=None, - bqstorage_client=None, - create_bqstorage_client=True, - ): - """[Beta] Create a class:`pyarrow.Table` by loading all pages of a - table or query. - - Args: - progress_bar_type (Optional[str]): - If set, use the `tqdm `_ library to - display a progress bar while the data downloads. Install the - ``tqdm`` package to use this feature. - - Possible values of ``progress_bar_type`` include: - - ``None`` - No progress bar. - ``'tqdm'`` - Use the :func:`tqdm.tqdm` function to print a progress bar - to :data:`sys.stderr`. - ``'tqdm_notebook'`` - Use the :func:`tqdm.tqdm_notebook` function to display a - progress bar as a Jupyter notebook widget. - ``'tqdm_gui'`` - Use the :func:`tqdm.tqdm_gui` function to display a - progress bar as a graphical dialog box. - bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): - A BigQuery Storage API client. If supplied, use the faster - BigQuery Storage API to fetch rows from BigQuery. This API - is a billable API. - - This method requires the ``pyarrow`` and - ``google-cloud-bigquery-storage`` libraries. - - Reading from a specific partition or snapshot is not - currently supported by this method. - create_bqstorage_client (Optional[bool]): - If ``True`` (default), create a BigQuery Storage API client - using the default API settings. 
The BigQuery Storage API - is a faster way to fetch rows from BigQuery. See the - ``bqstorage_client`` parameter for more information. - - This argument does nothing if ``bqstorage_client`` is supplied. - - ..versionadded:: 1.24.0 - - Returns: - pyarrow.Table - A :class:`pyarrow.Table` populated with row data and column - headers from the query results. The column headers are derived - from the destination table's schema. - - Raises: - ValueError: - If the :mod:`pyarrow` library cannot be imported. - - ..versionadded:: 1.17.0 - """ - return self.result().to_arrow( - progress_bar_type=progress_bar_type, - bqstorage_client=bqstorage_client, - create_bqstorage_client=create_bqstorage_client, - ) - - # If changing the signature of this method, make sure to apply the same - # changes to table.RowIterator.to_dataframe() - def to_dataframe( - self, - bqstorage_client=None, - dtypes=None, - progress_bar_type=None, - create_bqstorage_client=True, - date_as_object=True, - ): - """Return a pandas DataFrame from a QueryJob - - Args: - bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): - A BigQuery Storage API client. If supplied, use the faster - BigQuery Storage API to fetch rows from BigQuery. This - API is a billable API. - - This method requires the ``fastavro`` and - ``google-cloud-bigquery-storage`` libraries. - - Reading from a specific partition or snapshot is not - currently supported by this method. - - dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): - A dictionary of column names pandas ``dtype``s. The provided - ``dtype`` is used when constructing the series for the column - specified. Otherwise, the default pandas behavior is used. - - progress_bar_type (Optional[str]): - If set, use the `tqdm `_ library to - display a progress bar while the data downloads. Install the - ``tqdm`` package to use this feature. - - See - :func:`~google.cloud.bigquery.table.RowIterator.to_dataframe` - for details. 
- - ..versionadded:: 1.11.0 - create_bqstorage_client (Optional[bool]): - If ``True`` (default), create a BigQuery Storage API client - using the default API settings. The BigQuery Storage API - is a faster way to fetch rows from BigQuery. See the - ``bqstorage_client`` parameter for more information. - - This argument does nothing if ``bqstorage_client`` is supplied. - - ..versionadded:: 1.24.0 - - date_as_object (Optional[bool]): - If ``True`` (default), cast dates to objects. If ``False``, convert - to datetime64[ns] dtype. - - ..versionadded:: 1.26.0 - - Returns: - A :class:`~pandas.DataFrame` populated with row data and column - headers from the query results. The column headers are derived - from the destination table's schema. - - Raises: - ValueError: If the `pandas` library cannot be imported. - """ - return self.result().to_dataframe( - bqstorage_client=bqstorage_client, - dtypes=dtypes, - progress_bar_type=progress_bar_type, - create_bqstorage_client=create_bqstorage_client, - date_as_object=date_as_object, - ) - - def __iter__(self): - return iter(self.result()) - - -class QueryPlanEntryStep(object): - """Map a single step in a query plan entry. - - Args: - kind (str): step type. - - substeps (List): names of substeps. - """ - - def __init__(self, kind, substeps): - self.kind = kind - self.substeps = list(substeps) - - @classmethod - def from_api_repr(cls, resource): - """Factory: construct instance from the JSON repr. - - Args: - resource (Dict): JSON representation of the entry. - - Returns: - QueryPlanEntryStep: new instance built from the resource. - """ - return cls(kind=resource.get("kind"), substeps=resource.get("substeps", ())) - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return self.kind == other.kind and self.substeps == other.substeps - - -class QueryPlanEntry(object): - """QueryPlanEntry represents a single stage of a query execution plan. 
- - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#ExplainQueryStage - for the underlying API representation within query statistics. - """ - - def __init__(self): - self._properties = {} - - @classmethod - def from_api_repr(cls, resource): - """Factory: construct instance from the JSON repr. - - Args: - resource(Dict[str: object]): - ExplainQueryStage representation returned from API. - - Returns: - google.cloud.bigquery.QueryPlanEntry: - Query plan entry parsed from ``resource``. - """ - entry = cls() - entry._properties = resource - return entry - - @property - def name(self): - """Optional[str]: Human-readable name of the stage.""" - return self._properties.get("name") - - @property - def entry_id(self): - """Optional[str]: Unique ID for the stage within the plan.""" - return self._properties.get("id") - - @property - def start(self): - """Optional[Datetime]: Datetime when the stage started.""" - if self._properties.get("startMs") is None: - return None - return _helpers._datetime_from_microseconds( - int(self._properties.get("startMs")) * 1000.0 - ) - - @property - def end(self): - """Optional[Datetime]: Datetime when the stage ended.""" - if self._properties.get("endMs") is None: - return None - return _helpers._datetime_from_microseconds( - int(self._properties.get("endMs")) * 1000.0 - ) - - @property - def input_stages(self): - """List(int): Entry IDs for stages that were inputs for this stage.""" - if self._properties.get("inputStages") is None: - return [] - return [ - _helpers._int_or_none(entry) - for entry in self._properties.get("inputStages") - ] - - @property - def parallel_inputs(self): - """Optional[int]: Number of parallel input segments within - the stage. 
- """ - return _helpers._int_or_none(self._properties.get("parallelInputs")) - - @property - def completed_parallel_inputs(self): - """Optional[int]: Number of parallel input segments completed.""" - return _helpers._int_or_none(self._properties.get("completedParallelInputs")) - - @property - def wait_ms_avg(self): - """Optional[int]: Milliseconds the average worker spent waiting to - be scheduled. - """ - return _helpers._int_or_none(self._properties.get("waitMsAvg")) - - @property - def wait_ms_max(self): - """Optional[int]: Milliseconds the slowest worker spent waiting to - be scheduled. - """ - return _helpers._int_or_none(self._properties.get("waitMsMax")) - - @property - def wait_ratio_avg(self): - """Optional[float]: Ratio of time the average worker spent waiting - to be scheduled, relative to the longest time spent by any worker in - any stage of the overall plan. - """ - return self._properties.get("waitRatioAvg") - - @property - def wait_ratio_max(self): - """Optional[float]: Ratio of time the slowest worker spent waiting - to be scheduled, relative to the longest time spent by any worker in - any stage of the overall plan. - """ - return self._properties.get("waitRatioMax") - - @property - def read_ms_avg(self): - """Optional[int]: Milliseconds the average worker spent reading - input. - """ - return _helpers._int_or_none(self._properties.get("readMsAvg")) - - @property - def read_ms_max(self): - """Optional[int]: Milliseconds the slowest worker spent reading - input. - """ - return _helpers._int_or_none(self._properties.get("readMsMax")) - - @property - def read_ratio_avg(self): - """Optional[float]: Ratio of time the average worker spent reading - input, relative to the longest time spent by any worker in any stage - of the overall plan. 
- """ - return self._properties.get("readRatioAvg") - - @property - def read_ratio_max(self): - """Optional[float]: Ratio of time the slowest worker spent reading - to be scheduled, relative to the longest time spent by any worker in - any stage of the overall plan. - """ - return self._properties.get("readRatioMax") - - @property - def compute_ms_avg(self): - """Optional[int]: Milliseconds the average worker spent on CPU-bound - processing. - """ - return _helpers._int_or_none(self._properties.get("computeMsAvg")) - - @property - def compute_ms_max(self): - """Optional[int]: Milliseconds the slowest worker spent on CPU-bound - processing. - """ - return _helpers._int_or_none(self._properties.get("computeMsMax")) - - @property - def compute_ratio_avg(self): - """Optional[float]: Ratio of time the average worker spent on - CPU-bound processing, relative to the longest time spent by any - worker in any stage of the overall plan. - """ - return self._properties.get("computeRatioAvg") - - @property - def compute_ratio_max(self): - """Optional[float]: Ratio of time the slowest worker spent on - CPU-bound processing, relative to the longest time spent by any - worker in any stage of the overall plan. - """ - return self._properties.get("computeRatioMax") - - @property - def write_ms_avg(self): - """Optional[int]: Milliseconds the average worker spent writing - output data. - """ - return _helpers._int_or_none(self._properties.get("writeMsAvg")) - - @property - def write_ms_max(self): - """Optional[int]: Milliseconds the slowest worker spent writing - output data. - """ - return _helpers._int_or_none(self._properties.get("writeMsMax")) - - @property - def write_ratio_avg(self): - """Optional[float]: Ratio of time the average worker spent writing - output data, relative to the longest time spent by any worker in any - stage of the overall plan. 
- """ - return self._properties.get("writeRatioAvg") - - @property - def write_ratio_max(self): - """Optional[float]: Ratio of time the slowest worker spent writing - output data, relative to the longest time spent by any worker in any - stage of the overall plan. - """ - return self._properties.get("writeRatioMax") - - @property - def records_read(self): - """Optional[int]: Number of records read by this stage.""" - return _helpers._int_or_none(self._properties.get("recordsRead")) - - @property - def records_written(self): - """Optional[int]: Number of records written by this stage.""" - return _helpers._int_or_none(self._properties.get("recordsWritten")) - - @property - def status(self): - """Optional[str]: status of this stage.""" - return self._properties.get("status") - - @property - def shuffle_output_bytes(self): - """Optional[int]: Number of bytes written by this stage to - intermediate shuffle. - """ - return _helpers._int_or_none(self._properties.get("shuffleOutputBytes")) - - @property - def shuffle_output_bytes_spilled(self): - """Optional[int]: Number of bytes written by this stage to - intermediate shuffle and spilled to disk. - """ - return _helpers._int_or_none(self._properties.get("shuffleOutputBytesSpilled")) - - @property - def steps(self): - """List(QueryPlanEntryStep): List of step operations performed by - each worker in the stage. - """ - return [ - QueryPlanEntryStep.from_api_repr(step) - for step in self._properties.get("steps", []) - ] - - -class TimelineEntry(object): - """TimelineEntry represents progress of a query job at a particular - point in time. - - See - https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#querytimelinesample - for the underlying API representation within query statistics. - """ - - def __init__(self): - self._properties = {} - - @classmethod - def from_api_repr(cls, resource): - """Factory: construct instance from the JSON repr. 
- - Args: - resource(Dict[str: object]): - QueryTimelineSample representation returned from API. - - Returns: - google.cloud.bigquery.TimelineEntry: - Timeline sample parsed from ``resource``. - """ - entry = cls() - entry._properties = resource - return entry - - @property - def elapsed_ms(self): - """Optional[int]: Milliseconds elapsed since start of query - execution.""" - return _helpers._int_or_none(self._properties.get("elapsedMs")) - - @property - def active_units(self): - """Optional[int]: Current number of input units being processed - by workers, reported as largest value since the last sample.""" - return _helpers._int_or_none(self._properties.get("activeUnits")) - - @property - def pending_units(self): - """Optional[int]: Current number of input units remaining for - query stages active at this sample time.""" - return _helpers._int_or_none(self._properties.get("pendingUnits")) - - @property - def completed_units(self): - """Optional[int]: Current number of input units completed by - this query.""" - return _helpers._int_or_none(self._properties.get("completedUnits")) - - @property - def slot_millis(self): - """Optional[int]: Cumulative slot-milliseconds consumed by - this query.""" - return _helpers._int_or_none(self._properties.get("totalSlotMs")) - - -class UnknownJob(_AsyncJob): - """A job whose type cannot be determined.""" - - @classmethod - def from_api_repr(cls, resource, client): - """Construct an UnknownJob from the JSON representation. - - Args: - resource (Dict): JSON representation of a job. - client (google.cloud.bigquery.client.Client): - Client connected to BigQuery API. - - Returns: - UnknownJob: Job corresponding to the resource. 
- """ - job_ref_properties = resource.get("jobReference", {"projectId": client.project}) - job_ref = _JobReference._from_api_repr(job_ref_properties) - job = cls(job_ref, client) - # Populate the job reference with the project, even if it has been - # redacted, because we know it should equal that of the request. - resource["jobReference"] = job_ref_properties - job._properties = resource - return job - - -class ScriptStackFrame(object): - """Stack frame showing the line/column/procedure name where the current - evaluation happened. - - Args: - resource (Map[str, Any]): JSON representation of object. - """ - - def __init__(self, resource): - self._properties = resource - - @property - def procedure_id(self): - """Optional[str]: Name of the active procedure. - - Omitted if in a top-level script. - """ - return self._properties.get("procedureId") - - @property - def text(self): - """str: Text of the current statement/expression.""" - return self._properties.get("text") - - @property - def start_line(self): - """int: One-based start line.""" - return _helpers._int_or_none(self._properties.get("startLine")) - - @property - def start_column(self): - """int: One-based start column.""" - return _helpers._int_or_none(self._properties.get("startColumn")) - - @property - def end_line(self): - """int: One-based end line.""" - return _helpers._int_or_none(self._properties.get("endLine")) - - @property - def end_column(self): - """int: One-based end column.""" - return _helpers._int_or_none(self._properties.get("endColumn")) - - -class ScriptStatistics(object): - """Statistics for a child job of a script. - - Args: - resource (Map[str, Any]): JSON representation of object. - """ - - def __init__(self, resource): - self._properties = resource - - @property - def stack_frames(self): - """List[ScriptStackFrame]: Stack trace where the current evaluation - happened. - - Shows line/column/procedure name of each frame on the stack at the - point where the current evaluation happened. 
- - The leaf frame is first, the primary script is last. - """ - return [ - ScriptStackFrame(frame) for frame in self._properties.get("stackFrames", []) - ] - - @property - def evaluation_kind(self): - """str: Indicates the type of child job. - - Possible values include ``STATEMENT`` and ``EXPRESSION``. - """ - return self._properties.get("evaluationKind") diff --git a/google/cloud/bigquery/job/__init__.py b/google/cloud/bigquery/job/__init__.py new file mode 100644 index 000000000..f51311b0b --- /dev/null +++ b/google/cloud/bigquery/job/__init__.py @@ -0,0 +1,87 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Define API Jobs.""" + +from google.cloud.bigquery.job.base import _AsyncJob +from google.cloud.bigquery.job.base import _error_result_to_exception +from google.cloud.bigquery.job.base import _DONE_STATE +from google.cloud.bigquery.job.base import _JobConfig +from google.cloud.bigquery.job.base import _JobReference +from google.cloud.bigquery.job.base import ReservationUsage +from google.cloud.bigquery.job.base import ScriptStatistics +from google.cloud.bigquery.job.base import ScriptStackFrame +from google.cloud.bigquery.job.base import TransactionInfo +from google.cloud.bigquery.job.base import UnknownJob +from google.cloud.bigquery.job.copy_ import CopyJob +from google.cloud.bigquery.job.copy_ import CopyJobConfig +from google.cloud.bigquery.job.copy_ import OperationType +from google.cloud.bigquery.job.extract import ExtractJob +from google.cloud.bigquery.job.extract import ExtractJobConfig +from google.cloud.bigquery.job.load import LoadJob +from google.cloud.bigquery.job.load import LoadJobConfig +from google.cloud.bigquery.job.query import _contains_order_by +from google.cloud.bigquery.job.query import DmlStats +from google.cloud.bigquery.job.query import QueryJob +from google.cloud.bigquery.job.query import QueryJobConfig +from google.cloud.bigquery.job.query import QueryPlanEntry +from google.cloud.bigquery.job.query import QueryPlanEntryStep +from google.cloud.bigquery.job.query import ScriptOptions +from google.cloud.bigquery.job.query import TimelineEntry +from google.cloud.bigquery.enums import Compression +from google.cloud.bigquery.enums import CreateDisposition +from google.cloud.bigquery.enums import DestinationFormat +from google.cloud.bigquery.enums import Encoding +from google.cloud.bigquery.enums import QueryPriority +from google.cloud.bigquery.enums import SchemaUpdateOption +from google.cloud.bigquery.enums import SourceFormat +from google.cloud.bigquery.enums import WriteDisposition + + +# Include classes previously in job.py for 
backwards compatibility. +__all__ = [ + "_AsyncJob", + "_error_result_to_exception", + "_DONE_STATE", + "_JobConfig", + "_JobReference", + "ReservationUsage", + "ScriptStatistics", + "ScriptStackFrame", + "UnknownJob", + "CopyJob", + "CopyJobConfig", + "OperationType", + "ExtractJob", + "ExtractJobConfig", + "LoadJob", + "LoadJobConfig", + "_contains_order_by", + "DmlStats", + "QueryJob", + "QueryJobConfig", + "QueryPlanEntry", + "QueryPlanEntryStep", + "ScriptOptions", + "TimelineEntry", + "Compression", + "CreateDisposition", + "DestinationFormat", + "Encoding", + "QueryPriority", + "SchemaUpdateOption", + "SourceFormat", + "TransactionInfo", + "WriteDisposition", +] diff --git a/google/cloud/bigquery/job/base.py b/google/cloud/bigquery/job/base.py new file mode 100644 index 000000000..e5fc592a6 --- /dev/null +++ b/google/cloud/bigquery/job/base.py @@ -0,0 +1,1002 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Base classes and helpers for job classes.""" + +from collections import namedtuple +import copy +import http +import threading +import typing +from typing import Dict, Optional + +from google.api_core import exceptions +import google.api_core.future.polling + +from google.cloud.bigquery import _helpers +from google.cloud.bigquery.retry import DEFAULT_RETRY + +if typing.TYPE_CHECKING: # pragma: NO COVER + from google.api_core import retry as retries + + +_DONE_STATE = "DONE" +_STOPPED_REASON = "stopped" +_ERROR_REASON_TO_EXCEPTION = { + "accessDenied": http.client.FORBIDDEN, + "backendError": http.client.INTERNAL_SERVER_ERROR, + "billingNotEnabled": http.client.FORBIDDEN, + "billingTierLimitExceeded": http.client.BAD_REQUEST, + "blocked": http.client.FORBIDDEN, + "duplicate": http.client.CONFLICT, + "internalError": http.client.INTERNAL_SERVER_ERROR, + "invalid": http.client.BAD_REQUEST, + "invalidQuery": http.client.BAD_REQUEST, + "notFound": http.client.NOT_FOUND, + "notImplemented": http.client.NOT_IMPLEMENTED, + "quotaExceeded": http.client.FORBIDDEN, + "rateLimitExceeded": http.client.FORBIDDEN, + "resourceInUse": http.client.BAD_REQUEST, + "resourcesExceeded": http.client.BAD_REQUEST, + "responseTooLarge": http.client.FORBIDDEN, + "stopped": http.client.OK, + "tableUnavailable": http.client.BAD_REQUEST, +} + + +def _error_result_to_exception(error_result): + """Maps BigQuery error reasons to an exception. + + The reasons and their matching HTTP status codes are documented on + the `troubleshooting errors`_ page. + + .. _troubleshooting errors: https://cloud.google.com/bigquery\ + /troubleshooting-errors + + Args: + error_result (Mapping[str, str]): The error result from BigQuery. + + Returns: + google.cloud.exceptions.GoogleAPICallError: The mapped exception. 
+ """ + reason = error_result.get("reason") + status_code = _ERROR_REASON_TO_EXCEPTION.get( + reason, http.client.INTERNAL_SERVER_ERROR + ) + return exceptions.from_http_status( + status_code, error_result.get("message", ""), errors=[error_result] + ) + + +ReservationUsage = namedtuple("ReservationUsage", "name slot_ms") +ReservationUsage.__doc__ = "Job resource usage for a reservation." +ReservationUsage.name.__doc__ = ( + 'Reservation name or "unreserved" for on-demand resources usage.' +) +ReservationUsage.slot_ms.__doc__ = ( + "Total slot milliseconds used by the reservation for a particular job." +) + + +class TransactionInfo(typing.NamedTuple): + """[Alpha] Information of a multi-statement transaction. + + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#TransactionInfo + + .. versionadded:: 2.24.0 + """ + + transaction_id: str + """Output only. ID of the transaction.""" + + @classmethod + def from_api_repr(cls, transaction_info: Dict[str, str]) -> "TransactionInfo": + return cls(transaction_info["transactionId"]) + + +class _JobReference(object): + """A reference to a job. + + Args: + job_id (str): ID of the job to run. + project (str): ID of the project where the job runs. + location (str): Location of where the job runs. + """ + + def __init__(self, job_id, project, location): + self._properties = {"jobId": job_id, "projectId": project} + # The location field must not be populated if it is None. 
+ if location: + self._properties["location"] = location + + @property + def job_id(self): + """str: ID of the job.""" + return self._properties.get("jobId") + + @property + def project(self): + """str: ID of the project where the job runs.""" + return self._properties.get("projectId") + + @property + def location(self): + """str: Location where the job runs.""" + return self._properties.get("location") + + def _to_api_repr(self): + """Returns the API resource representation of the job reference.""" + return copy.deepcopy(self._properties) + + @classmethod + def _from_api_repr(cls, resource): + """Returns a job reference for an API resource representation.""" + job_id = resource.get("jobId") + project = resource.get("projectId") + location = resource.get("location") + job_ref = cls(job_id, project, location) + return job_ref + + +class _AsyncJob(google.api_core.future.polling.PollingFuture): + """Base class for asynchronous jobs. + + Args: + job_id (Union[str, _JobReference]): + Job's ID in the project associated with the client or a + fully-qualified job reference. + client (google.cloud.bigquery.client.Client): + Client which holds credentials and project configuration. + """ + + def __init__(self, job_id, client): + super(_AsyncJob, self).__init__() + + # The job reference can be either a plain job ID or the full resource. + # Populate the properties dictionary consistently depending on what has + # been passed in. + job_ref = job_id + if not isinstance(job_id, _JobReference): + job_ref = _JobReference(job_id, client.project, None) + self._properties = {"jobReference": job_ref._to_api_repr()} + + self._client = client + self._result_set = False + self._completion_lock = threading.Lock() + + @property + def job_id(self): + """str: ID of the job.""" + return _helpers._get_sub_prop(self._properties, ["jobReference", "jobId"]) + + @property + def parent_job_id(self): + """Return the ID of the parent job. 
def _require_client(self, client):
    """Return the client to use for an API call.

    Args:
        client (Optional[google.cloud.bigquery.client.Client]):
            The client to use. If not passed, falls back to the
            ``client`` bound to this job.

    Returns:
        google.cloud.bigquery.client.Client:
            The client passed in or the currently bound client.
    """
    # Only ``None`` triggers the fallback; any explicit client wins.
    if client is None:
        client = self._client
    return client
@property
def user_email(self):
    """E-mail address of user who submitted the job.

    Returns:
        Optional[str]:
            the e-mail address (None until set from the server).
    """
    # Note the snake_case key: this value is read from "user_email",
    # not from a camelCase "userEmail" entry.
    return self._properties.get("user_email")
+ """ + millis = _helpers._get_sub_prop(self._properties, ["statistics", "endTime"]) + if millis is not None: + return _helpers._datetime_from_microseconds(millis * 1000.0) + + def _job_statistics(self): + """Helper for job-type specific statistics-based properties.""" + statistics = self._properties.get("statistics", {}) + return statistics.get(self._JOB_TYPE, {}) + + @property + def reservation_usage(self): + """Job resource usage breakdown by reservation. + + Returns: + List[google.cloud.bigquery.job.ReservationUsage]: + Reservation usage stats. Can be empty if not set from the server. + """ + usage_stats_raw = _helpers._get_sub_prop( + self._properties, ["statistics", "reservationUsage"], default=() + ) + return [ + ReservationUsage(name=usage["name"], slot_ms=int(usage["slotMs"])) + for usage in usage_stats_raw + ] + + @property + def transaction_info(self) -> Optional[TransactionInfo]: + """Information of the multi-statement transaction if this job is part of one. + + .. versionadded:: 2.24.0 + """ + info = self._properties.get("statistics", {}).get("transactionInfo") + if info is None: + return None + else: + return TransactionInfo.from_api_repr(info) + + @property + def error_result(self): + """Error information about the job as a whole. + + Returns: + Optional[Mapping]: the error information (None until set from the server). + """ + status = self._properties.get("status") + if status is not None: + return status.get("errorResult") + + @property + def errors(self): + """Information about individual errors generated by the job. + + Returns: + Optional[List[Mapping]]: + the error information (None until set from the server). + """ + status = self._properties.get("status") + if status is not None: + return status.get("errors") + + @property + def state(self): + """Status of the job. + + Returns: + Optional[str]: + the state (None until set from the server). 
def _set_properties(self, api_response):
    """Update properties from resource in body of ``api_response``

    The timestamp statistics (``creationTime``, ``startTime`` and
    ``endTime``) are converted to ``float``. The conversion is applied to a
    copy of the ``statistics`` mapping, so ``api_response`` itself is left
    unmodified.

    Args:
        api_response (Dict): response returned from an API call.
    """
    cleaned = api_response.copy()

    # ``dict.copy()`` is shallow: converting in place would write the floats
    # into the caller's own ``statistics`` dict, so work on a copy of it.
    statistics = cleaned.get("statistics")
    if statistics is not None:
        statistics = dict(statistics)
        for key in ("creationTime", "startTime", "endTime"):
            if key in statistics:
                statistics[key] = float(statistics[key])
        cleaned["statistics"] = statistics

    # Save configuration to keep reference same in self._configuration.
    cleaned_config = cleaned.pop("configuration", {})
    configuration = self._properties.pop("configuration", {})
    self._properties.clear()
    self._properties.update(cleaned)
    self._properties["configuration"] = configuration
    self._properties["configuration"].update(cleaned_config)

    # For Future interface
    self._set_future_result()
+ """ + if "jobReference" not in resource or "jobId" not in resource["jobReference"]: + raise KeyError( + "Resource lacks required identity information: " + '["jobReference"]["jobId"]' + ) + if ( + "configuration" not in resource + or cls._JOB_TYPE not in resource["configuration"] + ): + raise KeyError( + "Resource lacks required configuration: " + '["configuration"]["%s"]' % cls._JOB_TYPE + ) + + def to_api_repr(self): + """Generate a resource for the job.""" + return copy.deepcopy(self._properties) + + _build_resource = to_api_repr # backward-compatibility alias + + def _begin(self, client=None, retry=DEFAULT_RETRY, timeout=None): + """API call: begin the job via a POST request + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + + Args: + client (Optional[google.cloud.bigquery.client.Client]): + The client to use. If not passed, falls back to the ``client`` + associated with the job object or``NoneType`` + retry (Optional[google.api_core.retry.Retry]): + How to retry the RPC. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + + Raises: + ValueError: + If the job has already begun. + """ + if self.state is not None: + raise ValueError("Job already begun.") + + client = self._require_client(client) + path = "/projects/%s/jobs" % (self.project,) + + # jobs.insert is idempotent because we ensure that every new + # job has an ID. 
+ span_attributes = {"path": path} + api_response = client._call_api( + retry, + span_name="BigQuery.job.begin", + span_attributes=span_attributes, + job_ref=self, + method="POST", + path=path, + data=self.to_api_repr(), + timeout=timeout, + ) + self._set_properties(api_response) + + def exists( + self, client=None, retry: "retries.Retry" = DEFAULT_RETRY, timeout: float = None + ) -> bool: + """API call: test for the existence of the job via a GET request + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get + + Args: + client (Optional[google.cloud.bigquery.client.Client]): + the client to use. If not passed, falls back to the + ``client`` stored on the current dataset. + + retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + + Returns: + bool: Boolean indicating existence of the job. + """ + client = self._require_client(client) + + extra_params = {"fields": "id"} + if self.location: + extra_params["location"] = self.location + + try: + span_attributes = {"path": self.path} + + client._call_api( + retry, + span_name="BigQuery.job.exists", + span_attributes=span_attributes, + job_ref=self, + method="GET", + path=self.path, + query_params=extra_params, + timeout=timeout, + ) + except exceptions.NotFound: + return False + else: + return True + + def reload( + self, client=None, retry: "retries.Retry" = DEFAULT_RETRY, timeout: float = None + ): + """API call: refresh job properties via a GET request. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get + + Args: + client (Optional[google.cloud.bigquery.client.Client]): + the client to use. If not passed, falls back to the + ``client`` stored on the current dataset. + + retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. 
+ timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + """ + client = self._require_client(client) + + extra_params = {} + if self.location: + extra_params["location"] = self.location + span_attributes = {"path": self.path} + + api_response = client._call_api( + retry, + span_name="BigQuery.job.reload", + span_attributes=span_attributes, + job_ref=self, + method="GET", + path=self.path, + query_params=extra_params, + timeout=timeout, + ) + self._set_properties(api_response) + + def cancel( + self, client=None, retry: "retries.Retry" = DEFAULT_RETRY, timeout: float = None + ) -> bool: + """API call: cancel job via a POST request + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/cancel + + Args: + client (Optional[google.cloud.bigquery.client.Client]): + the client to use. If not passed, falls back to the + ``client`` stored on the current dataset. + retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry`` + + Returns: + bool: Boolean indicating that the cancel request was sent. + """ + client = self._require_client(client) + + extra_params = {} + if self.location: + extra_params["location"] = self.location + + path = "{}/cancel".format(self.path) + span_attributes = {"path": path} + + api_response = client._call_api( + retry, + span_name="BigQuery.job.cancel", + span_attributes=span_attributes, + job_ref=self, + method="POST", + path=path, + query_params=extra_params, + timeout=timeout, + ) + self._set_properties(api_response["job"]) + # The Future interface requires that we return True if the *attempt* + # to cancel was successful. + return True + + # The following methods implement the PollingFuture interface. Note that + # the methods above are from the pre-Future interface and are left for + # compatibility. 
The only "overloaded" method is :meth:`cancel`, which + # satisfies both interfaces. + + def _set_future_result(self): + """Set the result or exception from the job if it is complete.""" + # This must be done in a lock to prevent the polling thread + # and main thread from both executing the completion logic + # at the same time. + with self._completion_lock: + # If the operation isn't complete or if the result has already been + # set, do not call set_result/set_exception again. + # Note: self._result_set is set to True in set_result and + # set_exception, in case those methods are invoked directly. + if not self.done(reload=False) or self._result_set: + return + + if self.error_result is not None: + exception = _error_result_to_exception(self.error_result) + self.set_exception(exception) + else: + self.set_result(self) + + def done( + self, + retry: "retries.Retry" = DEFAULT_RETRY, + timeout: float = None, + reload: bool = True, + ) -> bool: + """Checks if the job is complete. + + Args: + retry (Optional[google.api_core.retry.Retry]): + How to retry the RPC. If the job state is ``DONE``, retrying is aborted + early, as the job will not change anymore. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + reload (Optional[bool]): + If ``True``, make an API call to refresh the job state of + unfinished jobs before checking. Default ``True``. + + Returns: + bool: True if the job is complete, False otherwise. + """ + # Do not refresh is the state is already done, as the job will not + # change once complete. + if self.state != _DONE_STATE and reload: + self.reload(retry=retry, timeout=timeout) + return self.state == _DONE_STATE + + def result( + self, retry: "retries.Retry" = DEFAULT_RETRY, timeout: float = None + ) -> "_AsyncJob": + """Start the job and wait for it to complete and get the result. + + Args: + retry (Optional[google.api_core.retry.Retry]): + How to retry the RPC. 
class _JobConfig(object):
    """Abstract base class for job configuration objects.

    Args:
        job_type (str): The key to use for the job configuration.
        **kwargs: Initial property values. Each name must be an attribute
            defined on the concrete class, otherwise ``AttributeError`` is
            raised.
    """

    def __init__(self, job_type, **kwargs):
        self._job_type = job_type
        self._properties = {job_type: {}}
        for prop, val in kwargs.items():
            setattr(self, prop, val)

    def __setattr__(self, name, value):
        """Override to be able to raise error if an unknown property is being set"""
        # Private names are always allowed; public names must exist on the
        # class so that typos in property names fail loudly.
        if not name.startswith("_") and not hasattr(type(self), name):
            raise AttributeError(
                "Property {} is unknown for {}.".format(name, type(self))
            )
        super(_JobConfig, self).__setattr__(name, value)

    @property
    def labels(self):
        """Dict[str, str]: Labels for the job.

        This method always returns a dict. Once a job has been created on the
        server, its labels cannot be modified anymore.

        Raises:
            ValueError: If ``value`` type is invalid.
        """
        return self._properties.setdefault("labels", {})

    @labels.setter
    def labels(self, value):
        if not isinstance(value, dict):
            raise ValueError("Pass a dict")
        self._properties["labels"] = value

    def _get_sub_prop(self, key, default=None):
        """Get a value in the ``self._properties[self._job_type]`` dictionary.

        Most job properties are inside the dictionary related to the job type
        (e.g. 'copy', 'extract', 'load', 'query'). Use this method to access
        those properties::

            self._get_sub_prop('destinationTable')

        This is equivalent to using the ``_helpers._get_sub_prop`` function::

            _helpers._get_sub_prop(
                self._properties, ['query', 'destinationTable'])

        Args:
            key (str):
                Key for the value to get in the
                ``self._properties[self._job_type]`` dictionary.
            default (Optional[object]):
                Default value to return if the key is not found.
                Defaults to :data:`None`.

        Returns:
            object: The value if present or the default.
        """
        return _helpers._get_sub_prop(
            self._properties, [self._job_type, key], default=default
        )

    def _set_sub_prop(self, key, value):
        """Set a value in the ``self._properties[self._job_type]`` dictionary.

        Most job properties are inside the dictionary related to the job type
        (e.g. 'copy', 'extract', 'load', 'query'). Use this method to set
        those properties::

            self._set_sub_prop('useLegacySql', False)

        This is equivalent to using the ``_helpers._set_sub_prop`` function::

            _helpers._set_sub_prop(
                self._properties, ['query', 'useLegacySql'], False)

        Args:
            key (str):
                Key to set in the ``self._properties[self._job_type]``
                dictionary.
            value (object): Value to set.
        """
        _helpers._set_sub_prop(self._properties, [self._job_type, key], value)

    def _del_sub_prop(self, key):
        """Remove ``key`` from the ``self._properties[self._job_type]`` dict.

        Most job properties are inside the dictionary related to the job type
        (e.g. 'copy', 'extract', 'load', 'query'). Use this method to clear
        those properties::

            self._del_sub_prop('useLegacySql')

        This is equivalent to using the ``_helpers._del_sub_prop`` function::

            _helpers._del_sub_prop(
                self._properties, ['query', 'useLegacySql'])

        Args:
            key (str):
                Key to remove in the ``self._properties[self._job_type]``
                dictionary.
        """
        _helpers._del_sub_prop(self._properties, [self._job_type, key])

    def to_api_repr(self) -> dict:
        """Build an API representation of the job config.

        Returns:
            Dict: A dictionary in the format used by the BigQuery API.
        """
        return copy.deepcopy(self._properties)

    def _fill_from_default(self, default_job_config):
        """Merge this job config with a default job config.

        The keys in this object take precedence over the keys in the default
        config. The merge is done at the top-level as well as for keys one
        level below the job type.

        The merged config owns deep copies of every value it takes from
        either input, so changing it afterwards does not affect ``self`` or
        ``default_job_config``.

        Args:
            default_job_config (google.cloud.bigquery.job._JobConfig):
                The default job config that will be used to fill in self.

        Returns:
            google.cloud.bigquery.job._JobConfig: A new (merged) job config.

        Raises:
            TypeError: If the two configs are for different job types.
        """
        if self._job_type != default_job_config._job_type:
            raise TypeError(
                "attempted to merge two incompatible job types: "
                + repr(self._job_type)
                + ", "
                + repr(default_job_config._job_type)
            )

        # cls is one of the job config subclasses that provides the job_type argument to
        # this base class on instantiation, thus missing-parameter warning is a false
        # positive here.
        new_job_config = self.__class__()  # pytype: disable=missing-parameter

        merged = copy.deepcopy(default_job_config._properties)
        job_type_overrides = {}
        for key, value in self._properties.items():
            if key == self._job_type:
                job_type_overrides = value or {}
            else:
                # Copy rather than alias, so e.g. the labels dict is not
                # shared between ``self`` and the merged config.
                merged[key] = copy.deepcopy(value)

        # Either config may lack the job-type section (``from_api_repr``
        # stores the resource as given), so create it on demand.
        merged.setdefault(self._job_type, {}).update(
            copy.deepcopy(job_type_overrides)
        )
        new_job_config._properties = merged

        return new_job_config

    @classmethod
    def from_api_repr(cls, resource: dict) -> "_JobConfig":
        """Factory: construct a job configuration given its API representation

        ``resource`` is stored as-is (not copied) as the properties of the
        new configuration.

        Args:
            resource (Dict):
                A job configuration in the same representation as is returned
                from the API.

        Returns:
            google.cloud.bigquery.job._JobConfig: Configuration parsed from ``resource``.
        """
        # cls is one of the job config subclasses that provides the job_type argument to
        # this base class on instantiation, thus missing-parameter warning is a false
        # positive here.
        job_config = cls()  # pytype: disable=missing-parameter
        job_config._properties = resource
        return job_config
class ScriptStatistics(object):
    """Statistics for a child job of a script.

    Args:
        resource (Map[str, Any]): JSON representation of object.
    """

    def __init__(self, resource):
        # The resource is kept by reference, not copied.
        self._properties = resource

    @property
    def stack_frames(self):
        """List[ScriptStackFrame]: Stack trace where the current evaluation
        happened.

        Shows line/column/procedure name of each frame on the stack at the
        point where the current evaluation happened.

        The leaf frame is first, the primary script is last.
        """
        frames = []
        for raw_frame in self._properties.get("stackFrames", []):
            frames.append(ScriptStackFrame(raw_frame))
        return frames

    @property
    def evaluation_kind(self):
        """str: Indicates the type of child job.

        Possible values include ``STATEMENT`` and ``EXPRESSION``.
        """
        kind = self._properties.get("evaluationKind")
        return kind
+ """ + job_ref_properties = resource.get("jobReference", {"projectId": client.project}) + job_ref = _JobReference._from_api_repr(job_ref_properties) + job = cls(job_ref, client) + # Populate the job reference with the project, even if it has been + # redacted, because we know it should equal that of the request. + resource["jobReference"] = job_ref_properties + job._properties = resource + return job diff --git a/google/cloud/bigquery/job/copy_.py b/google/cloud/bigquery/job/copy_.py new file mode 100644 index 000000000..c6ee98944 --- /dev/null +++ b/google/cloud/bigquery/job/copy_.py @@ -0,0 +1,261 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Classes for copy jobs.""" + +from typing import Optional + +from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration +from google.cloud.bigquery import _helpers +from google.cloud.bigquery.table import TableReference + +from google.cloud.bigquery.job.base import _AsyncJob +from google.cloud.bigquery.job.base import _JobConfig +from google.cloud.bigquery.job.base import _JobReference + + +class OperationType: + """Different operation types supported in table copy job. 
+ + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#operationtype + """ + + OPERATION_TYPE_UNSPECIFIED = "OPERATION_TYPE_UNSPECIFIED" + """Unspecified operation type.""" + + COPY = "COPY" + """The source and destination table have the same table type.""" + + SNAPSHOT = "SNAPSHOT" + """The source table type is TABLE and the destination table type is SNAPSHOT.""" + + RESTORE = "RESTORE" + """The source table type is SNAPSHOT and the destination table type is TABLE.""" + + +class CopyJobConfig(_JobConfig): + """Configuration options for copy jobs. + + All properties in this class are optional. Values which are :data:`None` -> + server defaults. Set properties on the constructed configuration by using + the property name as the name of a keyword argument. + """ + + def __init__(self, **kwargs): + super(CopyJobConfig, self).__init__("copy", **kwargs) + + @property + def create_disposition(self): + """google.cloud.bigquery.job.CreateDisposition: Specifies behavior + for creating tables. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.create_disposition + """ + return self._get_sub_prop("createDisposition") + + @create_disposition.setter + def create_disposition(self, value): + self._set_sub_prop("createDisposition", value) + + @property + def write_disposition(self): + """google.cloud.bigquery.job.WriteDisposition: Action that occurs if + the destination table already exists. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.write_disposition + """ + return self._get_sub_prop("writeDisposition") + + @write_disposition.setter + def write_disposition(self, value): + self._set_sub_prop("writeDisposition", value) + + @property + def destination_encryption_configuration(self): + """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom + encryption configuration for the destination table. 
+ + Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` + if using default encryption. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.destination_encryption_configuration + """ + prop = self._get_sub_prop("destinationEncryptionConfiguration") + if prop is not None: + prop = EncryptionConfiguration.from_api_repr(prop) + return prop + + @destination_encryption_configuration.setter + def destination_encryption_configuration(self, value): + api_repr = value + if value is not None: + api_repr = value.to_api_repr() + self._set_sub_prop("destinationEncryptionConfiguration", api_repr) + + @property + def operation_type(self) -> str: + """The operation to perform with this copy job. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.operation_type + """ + return self._get_sub_prop( + "operationType", OperationType.OPERATION_TYPE_UNSPECIFIED + ) + + @operation_type.setter + def operation_type(self, value: Optional[str]): + if value is None: + value = OperationType.OPERATION_TYPE_UNSPECIFIED + self._set_sub_prop("operationType", value) + + +class CopyJob(_AsyncJob): + """Asynchronous job: copy data into a table from other tables. + + Args: + job_id (str): the job's ID, within the project belonging to ``client``. + + sources (List[google.cloud.bigquery.table.TableReference]): Table from which data is to be loaded. + + destination (google.cloud.bigquery.table.TableReference): Table into which data is to be loaded. + + client (google.cloud.bigquery.client.Client): + A client which holds credentials and project configuration + for the dataset (which requires a project). + + job_config (Optional[google.cloud.bigquery.job.CopyJobConfig]): + Extra configuration options for the copy job. 
+ """ + + _JOB_TYPE = "copy" + + def __init__(self, job_id, sources, destination, client, job_config=None): + super(CopyJob, self).__init__(job_id, client) + + if not job_config: + job_config = CopyJobConfig() + + self._configuration = job_config + self._properties["configuration"] = job_config._properties + + if destination: + _helpers._set_sub_prop( + self._properties, + ["configuration", "copy", "destinationTable"], + destination.to_api_repr(), + ) + + if sources: + source_resources = [source.to_api_repr() for source in sources] + _helpers._set_sub_prop( + self._properties, + ["configuration", "copy", "sourceTables"], + source_resources, + ) + + @property + def destination(self): + """google.cloud.bigquery.table.TableReference: Table into which data + is to be loaded. + """ + return TableReference.from_api_repr( + _helpers._get_sub_prop( + self._properties, ["configuration", "copy", "destinationTable"] + ) + ) + + @property + def sources(self): + """List[google.cloud.bigquery.table.TableReference]): Table(s) from + which data is to be loaded. + """ + source_configs = _helpers._get_sub_prop( + self._properties, ["configuration", "copy", "sourceTables"] + ) + if source_configs is None: + single = _helpers._get_sub_prop( + self._properties, ["configuration", "copy", "sourceTable"] + ) + if single is None: + raise KeyError("Resource missing 'sourceTables' / 'sourceTable'") + source_configs = [single] + + sources = [] + for source_config in source_configs: + table_ref = TableReference.from_api_repr(source_config) + sources.append(table_ref) + return sources + + @property + def create_disposition(self): + """See + :attr:`google.cloud.bigquery.job.CopyJobConfig.create_disposition`. + """ + return self._configuration.create_disposition + + @property + def write_disposition(self): + """See + :attr:`google.cloud.bigquery.job.CopyJobConfig.write_disposition`. 
def to_api_repr(self):
    """Generate a resource for :meth:`_begin`."""
    # Only the reference and the configuration are sent; anything else held
    # in the properties (e.g. statistics) is left out. The values are the
    # stored objects themselves, not copies.
    resource = {}
    for key in ("jobReference", "configuration"):
        resource[key] = self._properties[key]
    return resource
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Classes for extract (export) jobs."""
+
+from google.cloud.bigquery import _helpers
+from google.cloud.bigquery.model import ModelReference
+from google.cloud.bigquery.table import Table
+from google.cloud.bigquery.table import TableListItem
+from google.cloud.bigquery.table import TableReference
+from google.cloud.bigquery.job.base import _AsyncJob
+from google.cloud.bigquery.job.base import _JobConfig
+from google.cloud.bigquery.job.base import _JobReference
+
+
+class ExtractJobConfig(_JobConfig):
+    """Configuration options for extract jobs.
+
+    All properties in this class are optional. Values which are :data:`None` ->
+    server defaults. Set properties on the constructed configuration by using
+    the property name as the name of a keyword argument.
+    """
+
+    def __init__(self, **kwargs):
+        super(ExtractJobConfig, self).__init__("extract", **kwargs)
+
+    @property
+    def compression(self):
+        """google.cloud.bigquery.job.Compression: Compression type to use for
+        exported files.
+
+        See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.compression
+        """
+        return self._get_sub_prop("compression")
+
+    @compression.setter
+    def compression(self, value):
+        self._set_sub_prop("compression", value)
+
+    @property
+    def destination_format(self):
+        """google.cloud.bigquery.job.DestinationFormat: Exported file format.
+
+        See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.destination_format
+        """
+        return self._get_sub_prop("destinationFormat")
+
+    @destination_format.setter
+    def destination_format(self, value):
+        self._set_sub_prop("destinationFormat", value)
+
+    @property
+    def field_delimiter(self):
+        """str: Delimiter to use between fields in the exported data.
+
+        See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.field_delimiter
+        """
+        return self._get_sub_prop("fieldDelimiter")
+
+    @field_delimiter.setter
+    def field_delimiter(self, value):
+        self._set_sub_prop("fieldDelimiter", value)
+
+    @property
+    def print_header(self):
+        """bool: Print a header row in the exported data.
+
+        See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.print_header
+        """
+        return self._get_sub_prop("printHeader")
+
+    @print_header.setter
+    def print_header(self, value):
+        self._set_sub_prop("printHeader", value)
+
+    @property
+    def use_avro_logical_types(self):
+        """bool: For extracts to Avro, governs whether applicable column types are
+        exported using Avro logical types instead of raw types. The setter
+        stores ``bool(value)``, so :data:`None` is recorded as ``False``.
+        """
+        return self._get_sub_prop("useAvroLogicalTypes")
+
+    @use_avro_logical_types.setter
+    def use_avro_logical_types(self, value):
+        self._set_sub_prop("useAvroLogicalTypes", bool(value))
+
+
+class ExtractJob(_AsyncJob):
+    """Asynchronous job: extract data from a table into Cloud Storage.
+
+    Args:
+        job_id (str): the job's ID.
+
+        source (Union[ \
+            google.cloud.bigquery.table.TableReference, \
+            google.cloud.bigquery.model.ModelReference \
+        ]):
+            Table or Model from which data is to be loaded or extracted.
+
+        destination_uris (List[str]):
+            URIs describing where the extracted data will be written in Cloud
+            Storage, using the format ``gs://<bucket_name>/<object_name_or_glob>``.
+
+        client (google.cloud.bigquery.client.Client):
+            A client which holds credentials and project configuration.
+
+        job_config (Optional[google.cloud.bigquery.job.ExtractJobConfig]):
+            Extra configuration options for the extract job.
+    """
+
+    _JOB_TYPE = "extract"
+
+    def __init__(self, job_id, source, destination_uris, client, job_config=None):
+        super(ExtractJob, self).__init__(job_id, client)
+
+        if job_config is None:
+            job_config = ExtractJobConfig()
+
+        self._properties["configuration"] = job_config._properties
+        self._configuration = job_config
+
+        if source:
+            source_ref = {"projectId": source.project, "datasetId": source.dataset_id}
+
+            if isinstance(source, (Table, TableListItem, TableReference)):
+                source_ref["tableId"] = source.table_id
+                source_key = "sourceTable"
+            else:
+                source_ref["modelId"] = source.model_id
+                source_key = "sourceModel"
+
+            _helpers._set_sub_prop(
+                self._properties, ["configuration", "extract", source_key], source_ref
+            )
+
+        if destination_uris:
+            _helpers._set_sub_prop(
+                self._properties,
+                ["configuration", "extract", "destinationUris"],
+                destination_uris,
+            )
+
+    @property
+    def source(self):
+        """Union[ \
+            google.cloud.bigquery.table.TableReference, \
+            google.cloud.bigquery.model.ModelReference \
+        ]: Table or Model from which data is to be loaded or extracted.
+        """
+        source_config = _helpers._get_sub_prop(
+            self._properties, ["configuration", "extract", "sourceTable"]
+        )
+        if source_config:
+            return TableReference.from_api_repr(source_config)
+        else:
+            source_config = _helpers._get_sub_prop(
+                self._properties, ["configuration", "extract", "sourceModel"]
+            )
+            return ModelReference.from_api_repr(source_config)
+
+    @property
+    def destination_uris(self):
+        """List[str]: URIs describing where the extracted data will be
+        written in Cloud Storage, using the format
+        ``gs://<bucket_name>/<object_name_or_glob>``.
+        """
+        return _helpers._get_sub_prop(
+            self._properties, ["configuration", "extract", "destinationUris"]
+        )
+
+    @property
+    def compression(self):
+        """See
+        :attr:`google.cloud.bigquery.job.ExtractJobConfig.compression`.
+        """
+        return self._configuration.compression
+
+    @property
+    def destination_format(self):
+        """See
+        :attr:`google.cloud.bigquery.job.ExtractJobConfig.destination_format`.
+        """
+        return self._configuration.destination_format
+
+    @property
+    def field_delimiter(self):
+        """See
+        :attr:`google.cloud.bigquery.job.ExtractJobConfig.field_delimiter`.
+        """
+        return self._configuration.field_delimiter
+
+    @property
+    def print_header(self):
+        """See
+        :attr:`google.cloud.bigquery.job.ExtractJobConfig.print_header`.
+        """
+        return self._configuration.print_header
+
+    @property
+    def destination_uri_file_counts(self):
+        """Return file counts from job statistics, if present.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics4.FIELDS.destination_uri_file_counts
+
+        Returns:
+            Optional[List[int]]:
+                A list of integer counts, each representing the number of files
+                per destination URI or URI pattern specified in the extract
+                configuration. These values will be in the same order as the URIs
+                specified in the 'destinationUris' field. Returns None if job is
+                not yet complete.
+        """
+        counts = self._job_statistics().get("destinationUriFileCounts")
+        if counts is not None:
+            return [int(count) for count in counts]
+        return None
+
+    def to_api_repr(self):
+        """Generate a resource for :meth:`_begin`."""
+        # Exclude statistics, if set.
+        return {
+            "jobReference": self._properties["jobReference"],
+            "configuration": self._properties["configuration"],
+        }
+
+    @classmethod
+    def from_api_repr(cls, resource: dict, client) -> "ExtractJob":
+        """Factory: construct a job given its API representation
+
+        .. note::
+
+            This method assumes that the project found in the resource matches
+            the client's project.
+
+        Args:
+            resource (Dict): extract job representation returned from the API
+
+            client (google.cloud.bigquery.client.Client):
+                Client which holds credentials and project
+                configuration for the dataset.
+
+        Returns:
+            google.cloud.bigquery.job.ExtractJob: Job parsed from ``resource``.
+        """
+        cls._check_resource_config(resource)
+        job_ref = _JobReference._from_api_repr(resource["jobReference"])
+        job = cls(job_ref, None, None, client=client)
+        job._set_properties(resource)
+        return job
diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py
new file mode 100644
index 000000000..aee055c1c
--- /dev/null
+++ b/google/cloud/bigquery/job/load.py
@@ -0,0 +1,822 @@
+# Copyright 2015 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Classes for load jobs."""
+
+from typing import FrozenSet, List, Iterable, Optional
+
+from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
+from google.cloud.bigquery.external_config import HivePartitioningOptions
+from google.cloud.bigquery.format_options import ParquetOptions
+from google.cloud.bigquery import _helpers
+from google.cloud.bigquery.schema import SchemaField
+from google.cloud.bigquery.schema import _to_schema_fields
+from google.cloud.bigquery.table import RangePartitioning
+from google.cloud.bigquery.table import TableReference
+from google.cloud.bigquery.table import TimePartitioning
+from google.cloud.bigquery.job.base import _AsyncJob
+from google.cloud.bigquery.job.base import _JobConfig
+from google.cloud.bigquery.job.base import _JobReference
+
+
+class LoadJobConfig(_JobConfig):
+    """Configuration options for load jobs.
+
+    All properties in this class are optional. Values which are :data:`None` ->
+    server defaults. Set properties on the constructed configuration by using
+    the property name as the name of a keyword argument.
+    """
+
+    def __init__(self, **kwargs):
+        super(LoadJobConfig, self).__init__("load", **kwargs)
+
+    @property
+    def allow_jagged_rows(self):
+        """Optional[bool]: Allow missing trailing optional columns (CSV only).
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.allow_jagged_rows
+        """
+        return self._get_sub_prop("allowJaggedRows")
+
+    @allow_jagged_rows.setter
+    def allow_jagged_rows(self, value):
+        self._set_sub_prop("allowJaggedRows", value)
+
+    @property
+    def allow_quoted_newlines(self):
+        """Optional[bool]: Allow quoted data containing newline characters (CSV only).
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.allow_quoted_newlines
+        """
+        return self._get_sub_prop("allowQuotedNewlines")
+
+    @allow_quoted_newlines.setter
+    def allow_quoted_newlines(self, value):
+        self._set_sub_prop("allowQuotedNewlines", value)
+
+    @property
+    def autodetect(self):
+        """Optional[bool]: Automatically infer the schema from a sample of the data.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.autodetect
+        """
+        return self._get_sub_prop("autodetect")
+
+    @autodetect.setter
+    def autodetect(self, value):
+        self._set_sub_prop("autodetect", value)
+
+    @property
+    def clustering_fields(self):
+        """Optional[List[str]]: Fields defining clustering for the table
+
+        (Defaults to :data:`None`).
+
+        Clustering fields are immutable after table creation.
+
+        .. note::
+
+           BigQuery supports clustering for both partitioned and
+           non-partitioned tables.
+        """
+        prop = self._get_sub_prop("clustering")
+        if prop is not None:
+            return list(prop.get("fields", ()))
+
+    @clustering_fields.setter
+    def clustering_fields(self, value):
+        """Optional[List[str]]: Fields defining clustering for the table
+
+        (Defaults to :data:`None`).
+        """
+        if value is not None:
+            self._set_sub_prop("clustering", {"fields": value})
+        else:
+            self._del_sub_prop("clustering")
+
+    @property
+    def create_disposition(self):
+        """Optional[google.cloud.bigquery.job.CreateDisposition]: Specifies behavior
+        for creating tables.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.create_disposition
+        """
+        return self._get_sub_prop("createDisposition")
+
+    @create_disposition.setter
+    def create_disposition(self, value):
+        self._set_sub_prop("createDisposition", value)
+
+    @property
+    def decimal_target_types(self) -> Optional[FrozenSet[str]]:
+        """Possible SQL data types to which the source decimal values are converted.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.decimal_target_types
+
+        .. versionadded:: 2.21.0
+        """
+        prop = self._get_sub_prop("decimalTargetTypes")
+        if prop is not None:
+            prop = frozenset(prop)
+        return prop
+
+    @decimal_target_types.setter
+    def decimal_target_types(self, value: Optional[Iterable[str]]):
+        if value is not None:
+            self._set_sub_prop("decimalTargetTypes", list(value))
+        else:
+            self._del_sub_prop("decimalTargetTypes")
+
+    @property
+    def destination_encryption_configuration(self):
+        """Optional[google.cloud.bigquery.encryption_configuration.EncryptionConfiguration]: Custom
+        encryption configuration for the destination table.
+
+        Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
+        if using default encryption.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_encryption_configuration
+        """
+        prop = self._get_sub_prop("destinationEncryptionConfiguration")
+        if prop is not None:
+            prop = EncryptionConfiguration.from_api_repr(prop)
+        return prop
+
+    @destination_encryption_configuration.setter
+    def destination_encryption_configuration(self, value):
+        api_repr = value
+        if value is not None:
+            api_repr = value.to_api_repr()
+            self._set_sub_prop("destinationEncryptionConfiguration", api_repr)
+        else:
+            self._del_sub_prop("destinationEncryptionConfiguration")
+
+    @property
+    def destination_table_description(self):
+        """Optional[str]: Description of the destination table.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description
+        """
+        prop = self._get_sub_prop("destinationTableProperties")
+        if prop is not None:
+            return prop.get("description")  # absent if only friendlyName is set
+
+    @destination_table_description.setter
+    def destination_table_description(self, value):
+        keys = [self._job_type, "destinationTableProperties", "description"]
+        if value is not None:
+            _helpers._set_sub_prop(self._properties, keys, value)
+        else:
+            _helpers._del_sub_prop(self._properties, keys)
+
+    @property
+    def destination_table_friendly_name(self):
+        """Optional[str]: Name given to destination table.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.friendly_name
+        """
+        prop = self._get_sub_prop("destinationTableProperties")
+        if prop is not None:
+            return prop.get("friendlyName")  # absent if only description is set
+
+    @destination_table_friendly_name.setter
+    def destination_table_friendly_name(self, value):
+        keys = [self._job_type, "destinationTableProperties", "friendlyName"]
+        if value is not None:
+            _helpers._set_sub_prop(self._properties, keys, value)
+        else:
+            _helpers._del_sub_prop(self._properties, keys)
+
+    @property
+    def encoding(self):
+        """Optional[google.cloud.bigquery.job.Encoding]: The character encoding of the
+        data.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.encoding
+        """
+        return self._get_sub_prop("encoding")
+
+    @encoding.setter
+    def encoding(self, value):
+        self._set_sub_prop("encoding", value)
+
+    @property
+    def field_delimiter(self):
+        """Optional[str]: The separator for fields in a CSV file.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.field_delimiter
+        """
+        return self._get_sub_prop("fieldDelimiter")
+
+    @field_delimiter.setter
+    def field_delimiter(self, value):
+        self._set_sub_prop("fieldDelimiter", value)
+
+    @property
+    def hive_partitioning(self):
+        """Optional[:class:`~.external_config.HivePartitioningOptions`]: [Beta] When set, \
+        it configures hive partitioning support.
+
+        .. note::
+            **Experimental**. This feature is experimental and might change or
+            have limited support.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.hive_partitioning_options
+        """
+        prop = self._get_sub_prop("hivePartitioningOptions")
+        if prop is None:
+            return None
+        return HivePartitioningOptions.from_api_repr(prop)
+
+    @hive_partitioning.setter
+    def hive_partitioning(self, value):
+        if value is not None:
+            if isinstance(value, HivePartitioningOptions):
+                value = value.to_api_repr()
+            else:
+                raise TypeError("Expected a HivePartitioningOptions instance or None.")
+
+        self._set_sub_prop("hivePartitioningOptions", value)
+
+    @property
+    def ignore_unknown_values(self):
+        """Optional[bool]: Ignore extra values not represented in the table schema.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.ignore_unknown_values
+        """
+        return self._get_sub_prop("ignoreUnknownValues")
+
+    @ignore_unknown_values.setter
+    def ignore_unknown_values(self, value):
+        self._set_sub_prop("ignoreUnknownValues", value)
+
+    @property
+    def max_bad_records(self):
+        """Optional[int]: Number of invalid rows to ignore.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.max_bad_records
+        """
+        return _helpers._int_or_none(self._get_sub_prop("maxBadRecords"))
+
+    @max_bad_records.setter
+    def max_bad_records(self, value):
+        self._set_sub_prop("maxBadRecords", value)
+
+    @property
+    def null_marker(self):
+        """Optional[str]: Represents a null value (CSV only).
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_marker
+        """
+        return self._get_sub_prop("nullMarker")
+
+    @null_marker.setter
+    def null_marker(self, value):
+        self._set_sub_prop("nullMarker", value)
+
+    @property
+    def projection_fields(self) -> Optional[List[str]]:
+        """Optional[List[str]]: If
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_format` is set to
+        "DATASTORE_BACKUP", indicates which entity properties to load into
+        BigQuery from a Cloud Datastore backup.
+
+        Property names are case sensitive and must be top-level properties. If
+        no properties are specified, BigQuery loads all properties. If any
+        named property isn't found in the Cloud Datastore backup, an invalid
+        error is returned in the job result.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.projection_fields
+        """
+        return self._get_sub_prop("projectionFields")
+
+    @projection_fields.setter
+    def projection_fields(self, value: Optional[List[str]]):
+        self._set_sub_prop("projectionFields", value)
+
+    @property
+    def quote_character(self):
+        """Optional[str]: Character used to quote data sections (CSV only).
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.quote
+        """
+        return self._get_sub_prop("quote")
+
+    @quote_character.setter
+    def quote_character(self, value):
+        self._set_sub_prop("quote", value)
+
+    @property
+    def range_partitioning(self):
+        """Optional[google.cloud.bigquery.table.RangePartitioning]:
+        Configures range-based partitioning for destination table.
+
+        .. note::
+            **Beta**. The integer range partitioning feature is in a
+            pre-release state and might change or have limited support.
+
+        Only specify at most one of
+        :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or
+        :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
+
+        Raises:
+            ValueError:
+                If the value is not
+                :class:`~google.cloud.bigquery.table.RangePartitioning` or
+                :data:`None`.
+        """
+        resource = self._get_sub_prop("rangePartitioning")
+        if resource is not None:
+            return RangePartitioning(_properties=resource)
+
+    @range_partitioning.setter
+    def range_partitioning(self, value):
+        resource = value
+        if isinstance(value, RangePartitioning):
+            resource = value._properties
+        elif value is not None:
+            raise ValueError(
+                "Expected value to be RangePartitioning or None, got {}.".format(value)
+            )
+        self._set_sub_prop("rangePartitioning", resource)
+
+    @property
+    def schema(self):
+        """Optional[Sequence[Union[ \
+            :class:`~google.cloud.bigquery.schema.SchemaField`, \
+            Mapping[str, Any] \
+        ]]]: Schema of the destination table.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.schema
+        """
+        schema = _helpers._get_sub_prop(self._properties, ["load", "schema", "fields"])
+        if schema is None:
+            return
+        return [SchemaField.from_api_repr(field) for field in schema]
+
+    @schema.setter
+    def schema(self, value):
+        if value is None:
+            self._del_sub_prop("schema")
+            return
+
+        value = _to_schema_fields(value)
+
+        _helpers._set_sub_prop(
+            self._properties,
+            ["load", "schema", "fields"],
+            [field.to_api_repr() for field in value],
+        )
+
+    @property
+    def schema_update_options(self):
+        """Optional[List[google.cloud.bigquery.job.SchemaUpdateOption]]: Specifies
+        updates to the destination table schema to allow as a side effect of
+        the load job.
+        """
+        return self._get_sub_prop("schemaUpdateOptions")
+
+    @schema_update_options.setter
+    def schema_update_options(self, values):
+        self._set_sub_prop("schemaUpdateOptions", values)
+
+    @property
+    def skip_leading_rows(self):
+        """Optional[int]: Number of rows to skip when reading data (CSV only).
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.skip_leading_rows
+        """
+        return _helpers._int_or_none(self._get_sub_prop("skipLeadingRows"))
+
+    @skip_leading_rows.setter
+    def skip_leading_rows(self, value):  # None must stay None, not become "None"
+        self._set_sub_prop("skipLeadingRows", None if value is None else str(value))
+
+    @property
+    def source_format(self):
+        """Optional[google.cloud.bigquery.job.SourceFormat]: File format of the data.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_format
+        """
+        return self._get_sub_prop("sourceFormat")
+
+    @source_format.setter
+    def source_format(self, value):
+        self._set_sub_prop("sourceFormat", value)
+
+    @property
+    def time_partitioning(self):
+        """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
+        partitioning for the destination table.
+
+        Only specify at most one of
+        :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or
+        :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
+        """
+        prop = self._get_sub_prop("timePartitioning")
+        if prop is not None:
+            prop = TimePartitioning.from_api_repr(prop)
+        return prop
+
+    @time_partitioning.setter
+    def time_partitioning(self, value):
+        api_repr = value
+        if value is not None:
+            api_repr = value.to_api_repr()
+            self._set_sub_prop("timePartitioning", api_repr)
+        else:
+            self._del_sub_prop("timePartitioning")
+
+    @property
+    def use_avro_logical_types(self):
+        """Optional[bool]: For loads of Avro data, governs whether Avro logical types are
+        converted to their corresponding BigQuery types (e.g. TIMESTAMP) rather than
+        raw types (e.g. INTEGER).
+        """
+        return self._get_sub_prop("useAvroLogicalTypes")
+
+    @use_avro_logical_types.setter
+    def use_avro_logical_types(self, value):
+        self._set_sub_prop("useAvroLogicalTypes", bool(value))
+
+    @property
+    def write_disposition(self):
+        """Optional[google.cloud.bigquery.job.WriteDisposition]: Action that occurs if
+        the destination table already exists.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.write_disposition
+        """
+        return self._get_sub_prop("writeDisposition")
+
+    @write_disposition.setter
+    def write_disposition(self, value):
+        self._set_sub_prop("writeDisposition", value)
+
+    @property
+    def parquet_options(self):
+        """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Additional
+            properties to set if ``sourceFormat`` is set to PARQUET.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.parquet_options
+        """
+        prop = self._get_sub_prop("parquetOptions")
+        if prop is not None:
+            prop = ParquetOptions.from_api_repr(prop)
+        return prop
+
+    @parquet_options.setter
+    def parquet_options(self, value):
+        if value is not None:
+            self._set_sub_prop("parquetOptions", value.to_api_repr())
+        else:
+            self._del_sub_prop("parquetOptions")
+
+
+class LoadJob(_AsyncJob):
+    """Asynchronous job for loading data into a table.
+
+    Can load from Google Cloud Storage URIs or from a file.
+
+    Args:
+        job_id (str): the job's ID
+
+        source_uris (Optional[Sequence[str]]):
+            URIs of one or more data files to be loaded. See
+            https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_uris
+            for supported URI formats. Pass None for jobs that load from a file.
+
+        destination (google.cloud.bigquery.table.TableReference): reference to table into which data is to be loaded.
+
+        client (google.cloud.bigquery.client.Client):
+            A client which holds credentials and project configuration
+            for the dataset (which requires a project).
+    """
+
+    _JOB_TYPE = "load"
+
+    def __init__(self, job_id, source_uris, destination, client, job_config=None):
+        super(LoadJob, self).__init__(job_id, client)
+
+        if not job_config:
+            job_config = LoadJobConfig()
+
+        self._configuration = job_config
+        self._properties["configuration"] = job_config._properties
+
+        if source_uris is not None:
+            _helpers._set_sub_prop(
+                self._properties, ["configuration", "load", "sourceUris"], source_uris
+            )
+
+        if destination is not None:
+            _helpers._set_sub_prop(
+                self._properties,
+                ["configuration", "load", "destinationTable"],
+                destination.to_api_repr(),
+            )
+
+    @property
+    def destination(self):
+        """google.cloud.bigquery.table.TableReference: table where loaded rows are written
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_table
+        """
+        dest_config = _helpers._get_sub_prop(
+            self._properties, ["configuration", "load", "destinationTable"]
+        )
+        return TableReference.from_api_repr(dest_config)
+
+    @property
+    def source_uris(self):
+        """Optional[Sequence[str]]: URIs of data files to be loaded. See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_uris
+        for supported URI formats. None for jobs that load from a file.
+        """
+        return _helpers._get_sub_prop(
+            self._properties, ["configuration", "load", "sourceUris"]
+        )
+
+    @property
+    def allow_jagged_rows(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.allow_jagged_rows`.
+        """
+        return self._configuration.allow_jagged_rows
+
+    @property
+    def allow_quoted_newlines(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.allow_quoted_newlines`.
+        """
+        return self._configuration.allow_quoted_newlines
+
+    @property
+    def autodetect(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.autodetect`.
+        """
+        return self._configuration.autodetect
+
+    @property
+    def create_disposition(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.create_disposition`.
+        """
+        return self._configuration.create_disposition
+
+    @property
+    def encoding(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.encoding`.
+        """
+        return self._configuration.encoding
+
+    @property
+    def field_delimiter(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.field_delimiter`.
+        """
+        return self._configuration.field_delimiter
+
+    @property
+    def ignore_unknown_values(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.ignore_unknown_values`.
+        """
+        return self._configuration.ignore_unknown_values
+
+    @property
+    def max_bad_records(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.max_bad_records`.
+        """
+        return self._configuration.max_bad_records
+
+    @property
+    def null_marker(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.null_marker`.
+        """
+        return self._configuration.null_marker
+
+    @property
+    def quote_character(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.quote_character`.
+        """
+        return self._configuration.quote_character
+
+    @property
+    def skip_leading_rows(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.skip_leading_rows`.
+        """
+        return self._configuration.skip_leading_rows
+
+    @property
+    def source_format(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_format`.
+        """
+        return self._configuration.source_format
+
+    @property
+    def write_disposition(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.write_disposition`.
+        """
+        return self._configuration.write_disposition
+
+    @property
+    def schema(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.schema`.
+        """
+        return self._configuration.schema
+
+    @property
+    def destination_encryption_configuration(self):
+        """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom
+        encryption configuration for the destination table.
+
+        Custom encryption configuration (e.g., Cloud KMS keys)
+        or :data:`None` if using default encryption.
+
+        See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.destination_encryption_configuration`.
+        """
+        return self._configuration.destination_encryption_configuration
+
+    @property
+    def destination_table_description(self):
+        """Optional[str]: Description of the destination table.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description
+        """
+        return self._configuration.destination_table_description
+
+    @property
+    def destination_table_friendly_name(self):
+        """Optional[str]: Name given to destination table.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.friendly_name
+        """
+        return self._configuration.destination_table_friendly_name
+
+    @property
+    def range_partitioning(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
+        """
+        return self._configuration.range_partitioning
+
+    @property
+    def time_partitioning(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.time_partitioning`.
+        """
+        return self._configuration.time_partitioning
+
+    @property
+    def use_avro_logical_types(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.use_avro_logical_types`.
+        """
+        return self._configuration.use_avro_logical_types
+
+    @property
+    def clustering_fields(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.clustering_fields`.
+        """
+        return self._configuration.clustering_fields
+
+    @property
+    def schema_update_options(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.schema_update_options`.
+        """
+        return self._configuration.schema_update_options
+
+    @property
+    def input_file_bytes(self):
+        """Count of bytes loaded from source files.
+
+        Returns:
+            Optional[int]: the count (None until set from the server).
+
+        Raises:
+            ValueError: for invalid value types.
+        """
+        return _helpers._int_or_none(
+            _helpers._get_sub_prop(
+                self._properties, ["statistics", "load", "inputFileBytes"]
+            )
+        )
+
+    @property
+    def input_files(self):
+        """Count of source files.
+
+        Returns:
+            Optional[int]: the count (None until set from the server).
+        """
+        return _helpers._int_or_none(
+            _helpers._get_sub_prop(
+                self._properties, ["statistics", "load", "inputFiles"]
+            )
+        )
+
+    @property
+    def output_bytes(self):
+        """Count of bytes saved to destination table.
+
+        Returns:
+            Optional[int]: the count (None until set from the server).
+        """
+        return _helpers._int_or_none(
+            _helpers._get_sub_prop(
+                self._properties, ["statistics", "load", "outputBytes"]
+            )
+        )
+
+    @property
+    def output_rows(self):
+        """Count of rows saved to destination table.
+
+        Returns:
+            Optional[int]: the count (None until set from the server).
+        """
+        return _helpers._int_or_none(
+            _helpers._get_sub_prop(
+                self._properties, ["statistics", "load", "outputRows"]
+            )
+        )
+
+    def to_api_repr(self):
+        """Generate a resource for :meth:`_begin`."""
+        # Exclude statistics, if set.
+        return {
+            "jobReference": self._properties["jobReference"],
+            "configuration": self._properties["configuration"],
+        }
+
+    @classmethod
+    def from_api_repr(cls, resource: dict, client) -> "LoadJob":
+        """Factory: construct a job given its API representation
+
+        .. note::
+
+            This method assumes that the project found in the resource matches
+            the client's project.
+
+        Args:
+            resource (Dict): load job representation returned from the API
+
+            client (google.cloud.bigquery.client.Client):
+                Client which holds credentials and project
+                configuration for the dataset.
+
+        Returns:
+            google.cloud.bigquery.job.LoadJob: Job parsed from ``resource``.
+        """
+        cls._check_resource_config(resource)
+        job_ref = _JobReference._from_api_repr(resource["jobReference"])
+        job = cls(job_ref, None, None, client)
+        job._set_properties(resource)
+        return job
diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py
new file mode 100644
index 000000000..0cb4798be
--- /dev/null
+++ b/google/cloud/bigquery/job/query.py
@@ -0,0 +1,1997 @@
+# Copyright 2015 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Classes for query jobs.""" + +import concurrent.futures +import copy +import re +import typing +from typing import Any, Dict, Optional, Union + +from google.api_core import exceptions +from google.api_core.future import polling as polling_future +import requests + +from google.cloud.bigquery.dataset import Dataset +from google.cloud.bigquery.dataset import DatasetListItem +from google.cloud.bigquery.dataset import DatasetReference +from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration +from google.cloud.bigquery.enums import KeyResultStatementKind +from google.cloud.bigquery.external_config import ExternalConfig +from google.cloud.bigquery import _helpers +from google.cloud.bigquery.query import _query_param_from_api_repr +from google.cloud.bigquery.query import ArrayQueryParameter +from google.cloud.bigquery.query import ScalarQueryParameter +from google.cloud.bigquery.query import StructQueryParameter +from google.cloud.bigquery.query import UDFResource +from google.cloud.bigquery.retry import DEFAULT_RETRY, DEFAULT_JOB_RETRY +from google.cloud.bigquery.routine import RoutineReference +from google.cloud.bigquery.table import _EmptyRowIterator +from google.cloud.bigquery.table import RangePartitioning +from google.cloud.bigquery.table import _table_arg_to_table_ref +from google.cloud.bigquery.table import TableReference +from google.cloud.bigquery.table import TimePartitioning +from google.cloud.bigquery._tqdm_helpers import wait_for_query + +from google.cloud.bigquery.job.base import _AsyncJob +from google.cloud.bigquery.job.base import _JobConfig +from google.cloud.bigquery.job.base import _JobReference + +if typing.TYPE_CHECKING: # pragma: NO COVER + # Assumption: type checks are only used by library developers and CI environments + # that have all optional dependencies installed, thus no conditional imports. 
class DmlStats(typing.NamedTuple):
    """Row-count statistics for DML statements.

    Every field defaults to ``0``.

    https://cloud.google.com/bigquery/docs/reference/rest/v2/DmlStats
    """

    inserted_row_count: int = 0
    """Number of inserted rows. Populated by DML INSERT and MERGE statements."""

    deleted_row_count: int = 0
    """Number of deleted rows. Populated by DML DELETE, MERGE and TRUNCATE
    statements.
    """

    updated_row_count: int = 0
    """Number of updated rows. Populated by DML UPDATE and MERGE statements."""

    @classmethod
    def from_api_repr(cls, stats: Dict[str, str]) -> "DmlStats":
        """Build an instance from the API's ``dmlStats`` mapping.

        Args:
            stats: Mapping keyed by the API (camelCase) field names. Keys
                that are absent fall back to the matching field default.

        Returns:
            DmlStats: The counts, each converted with ``int()``.
        """
        # NOTE: The order here must match the order in which the fields
        # are declared on the class, because values are passed positionally
        # and paired with the positional defaults.
        api_names = ("insertedRowCount", "deletedRowCount", "updatedRowCount")
        fallbacks = cls.__new__.__defaults__

        counts = []
        for position, api_name in enumerate(api_names):
            raw = stats.get(api_name, fallbacks[position])
            counts.append(int(raw))
        return cls(*counts)
+ """ + entry = cls() + entry._properties = copy.deepcopy(resource) + return entry + + def to_api_repr(self) -> Dict[str, Any]: + """Construct the API resource representation.""" + return copy.deepcopy(self._properties) + + @property + def statement_timeout_ms(self) -> Union[int, None]: + """Timeout period for each statement in a script.""" + return _helpers._int_or_none(self._properties.get("statementTimeoutMs")) + + @statement_timeout_ms.setter + def statement_timeout_ms(self, value: Union[int, None]): + if value is not None: + value = str(value) + self._properties["statementTimeoutMs"] = value + + @property + def statement_byte_budget(self) -> Union[int, None]: + """Limit on the number of bytes billed per statement. + + Exceeding this budget results in an error. + """ + return _helpers._int_or_none(self._properties.get("statementByteBudget")) + + @statement_byte_budget.setter + def statement_byte_budget(self, value: Union[int, None]): + if value is not None: + value = str(value) + self._properties["statementByteBudget"] = value + + @property + def key_result_statement(self) -> Union[KeyResultStatementKind, None]: + """Determines which statement in the script represents the "key result". + + This is used to populate the schema and query results of the script job. + Default is ``KeyResultStatementKind.LAST``. + """ + return self._properties.get("keyResultStatement") + + @key_result_statement.setter + def key_result_statement(self, value: Union[KeyResultStatementKind, None]): + self._properties["keyResultStatement"] = value + + +class QueryJobConfig(_JobConfig): + """Configuration options for query jobs. + + All properties in this class are optional. Values which are :data:`None` -> + server defaults. Set properties on the constructed configuration by using + the property name as the name of a keyword argument. 
@property
def default_dataset(self):
    """google.cloud.bigquery.dataset.DatasetReference: the default dataset
    to use for unqualified table names in the query or :data:`None` if not
    set.

    The ``default_dataset`` setter accepts:

    - a :class:`~google.cloud.bigquery.dataset.Dataset`, or
    - a :class:`~google.cloud.bigquery.dataset.DatasetListItem`, or
    - a :class:`~google.cloud.bigquery.dataset.DatasetReference`, or
    - a :class:`str` of the fully-qualified dataset ID in standard SQL
      format. The value must include a project ID and dataset ID
      separated by ``.``. For example: ``your-project.your_dataset``, or
    - :data:`None`, which stores :data:`None` for the property.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.default_dataset
    """
    prop = self._get_sub_prop("defaultDataset")
    if prop is not None:
        # Stored as an API mapping; hand back a reference object.
        prop = DatasetReference.from_api_repr(prop)
    return prop

@default_dataset.setter
def default_dataset(self, value):
    if value is None:
        self._set_sub_prop("defaultDataset", None)
        return

    # Strings are parsed into a reference first.
    if isinstance(value, str):
        value = DatasetReference.from_string(value)

    # Full dataset objects are reduced to their reference.
    if isinstance(value, (Dataset, DatasetListItem)):
        value = value.reference

    # Whatever remains is expected to provide ``to_api_repr()``.
    resource = value.to_api_repr()
    self._set_sub_prop("defaultDataset", resource)
@property
def maximum_bytes_billed(self):
    """Optional[int]: Maximum bytes to be billed for this job or
    :data:`None` if not set.

    The setter stores the value in the job resource as a string; the
    getter converts the stored value back with ``_helpers._int_or_none``.
    Assigning :data:`None` clears the limit.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.maximum_bytes_billed
    """
    return _helpers._int_or_none(self._get_sub_prop("maximumBytesBilled"))

@maximum_bytes_billed.setter
def maximum_bytes_billed(self, value):
    # ``str(None)`` would store the literal text "None", which the getter
    # cannot convert back to an int. Keep ``None`` as ``None``, matching
    # how the ScriptOptions setters in this module treat ``None``.
    api_value = None if value is None else str(value)
    self._set_sub_prop("maximumBytesBilled", api_value)
+ """ + resource = self._get_sub_prop("rangePartitioning") + if resource is not None: + return RangePartitioning(_properties=resource) + + @range_partitioning.setter + def range_partitioning(self, value): + resource = value + if isinstance(value, RangePartitioning): + resource = value._properties + elif value is not None: + raise ValueError( + "Expected value to be RangePartitioning or None, got {}.".format(value) + ) + self._set_sub_prop("rangePartitioning", resource) + + @property + def udf_resources(self): + """List[google.cloud.bigquery.query.UDFResource]: user + defined function resources (empty by default) + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.user_defined_function_resources + """ + prop = self._get_sub_prop("userDefinedFunctionResources", default=[]) + return _from_api_repr_udf_resources(prop) + + @udf_resources.setter + def udf_resources(self, values): + self._set_sub_prop( + "userDefinedFunctionResources", _to_api_repr_udf_resources(values) + ) + + @property + def use_legacy_sql(self): + """bool: Use legacy SQL syntax. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.use_legacy_sql + """ + return self._get_sub_prop("useLegacySql") + + @use_legacy_sql.setter + def use_legacy_sql(self, value): + self._set_sub_prop("useLegacySql", value) + + @property + def use_query_cache(self): + """bool: Look for the query result in the cache. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.use_query_cache + """ + return self._get_sub_prop("useQueryCache") + + @use_query_cache.setter + def use_query_cache(self, value): + self._set_sub_prop("useQueryCache", value) + + @property + def write_disposition(self): + """google.cloud.bigquery.job.WriteDisposition: Action that occurs if + the destination table already exists. 
@property
def time_partitioning(self):
    """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies
    time-based partitioning for the destination table.

    Only specify at most one of
    :attr:`~google.cloud.bigquery.job.QueryJobConfig.time_partitioning` or
    :attr:`~google.cloud.bigquery.job.QueryJobConfig.range_partitioning`.

    The setter accepts :data:`None` or an object providing
    ``to_api_repr()``. It performs no type check of its own.
    """
    prop = self._get_sub_prop("timePartitioning")
    if prop is not None:
        # Stored as an API mapping; rebuild the partitioning object.
        prop = TimePartitioning.from_api_repr(prop)
    return prop

@time_partitioning.setter
def time_partitioning(self, value):
    # ``None`` is stored as-is; anything else is converted to its API form.
    api_repr = value
    if value is not None:
        api_repr = value.to_api_repr()
    self._set_sub_prop("timePartitioning", api_repr)
def to_api_repr(self) -> dict:
    """Build an API representation of the query job config.

    Works on a deep copy, so the stored properties are not modified.

    Returns:
        Dict: A dictionary in the format used by the BigQuery API.
    """
    api_repr = copy.deepcopy(self._properties)
    query_section = api_repr["query"]

    params = query_section.get("queryParameters")
    if not params:
        # No parameters: there is no parameter mode to report.
        return api_repr

    # The mode is derived from the first parameter only: a parameter
    # without a name marks the query as positional.
    first_is_named = params[0].get("name") is not None
    query_section["parameterMode"] = "NAMED" if first_is_named else "POSITIONAL"
    return api_repr
def __init__(self, job_id, query, client, job_config=None):
    """Set up the job's configuration and polling state.

    Args:
        job_id: Forwarded to the base-class initializer.
        query: SQL text. Stored in the job resource only when truthy.
        client: Forwarded to the base-class initializer.
        job_config: Optional configuration. When given, it is used
            directly rather than copied: ``use_legacy_sql`` is set to
            ``False`` on it if it was ``None``, and its ``_properties``
            mapping becomes this job's ``configuration`` entry.
    """
    super(QueryJob, self).__init__(job_id, client)

    if job_config is None:
        job_config = QueryJobConfig()
    # Default to standard SQL unless the caller chose explicitly. This
    # assignment is made on the caller's object when one was passed.
    if job_config.use_legacy_sql is None:
        job_config.use_legacy_sql = False

    # Same mapping object as the config's, not a copy.
    self._properties["configuration"] = job_config._properties
    self._configuration = job_config

    if query:
        # NOTE(review): because the mapping above is shared, this
        # presumably also writes the query text into
        # ``job_config._properties`` — confirm.
        _helpers._set_sub_prop(
            self._properties, ["configuration", "query", "query"], query
        )

    self._query_results = None
    # Both are overwritten by ``_blocking_poll`` with its ``timeout``.
    self._done_timeout = None
    self._transport_timeout = None
+ + Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` + if using default encryption. + + See + :attr:`google.cloud.bigquery.job.QueryJobConfig.destination_encryption_configuration`. + """ + return self._configuration.destination_encryption_configuration + + @property + def dry_run(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.dry_run`. + """ + return self._configuration.dry_run + + @property + def flatten_results(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.flatten_results`. + """ + return self._configuration.flatten_results + + @property + def priority(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.priority`. + """ + return self._configuration.priority + + @property + def query(self): + """str: The query text used in this query job. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.query + """ + return _helpers._get_sub_prop( + self._properties, ["configuration", "query", "query"] + ) + + @property + def query_parameters(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.query_parameters`. + """ + return self._configuration.query_parameters + + @property + def udf_resources(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.udf_resources`. + """ + return self._configuration.udf_resources + + @property + def use_legacy_sql(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.use_legacy_sql`. + """ + return self._configuration.use_legacy_sql + + @property + def use_query_cache(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.use_query_cache`. + """ + return self._configuration.use_query_cache + + @property + def write_disposition(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.write_disposition`. 
+ """ + return self._configuration.write_disposition + + @property + def maximum_billing_tier(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.maximum_billing_tier`. + """ + return self._configuration.maximum_billing_tier + + @property + def maximum_bytes_billed(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.maximum_bytes_billed`. + """ + return self._configuration.maximum_bytes_billed + + @property + def range_partitioning(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.range_partitioning`. + """ + return self._configuration.range_partitioning + + @property + def table_definitions(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.table_definitions`. + """ + return self._configuration.table_definitions + + @property + def time_partitioning(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.time_partitioning`. + """ + return self._configuration.time_partitioning + + @property + def clustering_fields(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.clustering_fields`. + """ + return self._configuration.clustering_fields + + @property + def schema_update_options(self): + """See + :attr:`google.cloud.bigquery.job.QueryJobConfig.schema_update_options`. + """ + return self._configuration.schema_update_options + + def to_api_repr(self): + """Generate a resource for :meth:`_begin`.""" + # Use to_api_repr to allow for some configuration properties to be set + # automatically. + configuration = self._configuration.to_api_repr() + return { + "jobReference": self._properties["jobReference"], + "configuration": configuration, + } + + @classmethod + def from_api_repr(cls, resource: dict, client) -> "QueryJob": + """Factory: construct a job given its API representation + + Args: + resource (Dict): dataset job representation returned from the API + + client (google.cloud.bigquery.client.Client): + Client which holds credentials and project + configuration for the dataset. 
@property
def total_bytes_billed(self):
    """Return total bytes billed from job statistics, if present.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.total_bytes_billed

    Returns:
        Optional[int]:
            Total bytes billed for the job, or None if the job statistics
            do not contain ``totalBytesBilled``.
    """
    billed = self._job_statistics().get("totalBytesBilled")
    # The stored value is passed through int() so callers always get an
    # integer, whatever form the statistics hold it in.
    return None if billed is None else int(billed)
@property
def referenced_tables(self):
    """Return referenced tables from job statistics, if present.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.referenced_tables

    Returns:
        List:
            One table object per ``referencedTables`` entry, each built
            with ``DatasetReference(project, dataset).table(table_id)``
            (presumably a ``TableReference`` — confirm), in the order
            reported. Empty if the job statistics have no
            ``referencedTables`` entry.
    """
    tables = []
    # (project ID, dataset ID) -> DatasetReference, so that tables from
    # the same dataset are built from one shared reference object.
    datasets = {}

    for table in self._job_statistics().get("referencedTables", ()):
        key = (table["projectId"], table["datasetId"])

        dataset = datasets.get(key)
        if dataset is None:
            dataset = DatasetReference(*key)
            datasets[key] = dataset

        tables.append(dataset.table(table["tableId"]))

    return tables
+ """ + result = self._job_statistics().get("estimatedBytesProcessed") + if result is not None: + result = int(result) + return result + + @property + def dml_stats(self) -> Optional[DmlStats]: + stats = self._job_statistics().get("dmlStats") + if stats is None: + return None + else: + return DmlStats.from_api_repr(stats) + + def _blocking_poll(self, timeout=None, **kwargs): + self._done_timeout = timeout + self._transport_timeout = timeout + super(QueryJob, self)._blocking_poll(timeout=timeout, **kwargs) + + @staticmethod + def _format_for_exception(query, job_id): + """Format a query for the output in exception message. + + Args: + query (str): The SQL query to format. + job_id (str): The ID of the job that ran the query. + + Returns: + str: A formatted query text. + """ + template = "\n\n(job ID: {job_id})\n\n{header}\n\n{ruler}\n{body}\n{ruler}" + + lines = query.splitlines() + max_line_len = max(len(line) for line in lines) + + header = "-----Query Job SQL Follows-----" + header = "{:^{total_width}}".format(header, total_width=max_line_len + 5) + + # Print out a "ruler" above and below the SQL so we can judge columns. + # Left pad for the line numbers (4 digits plus ":"). + ruler = " |" + " . |" * (max_line_len // 10) + + # Put line numbers next to the SQL. + body = "\n".join( + "{:4}:{}".format(n, line) for n, line in enumerate(lines, start=1) + ) + + return template.format(job_id=job_id, header=header, ruler=ruler, body=body) + + def _begin(self, client=None, retry=DEFAULT_RETRY, timeout=None): + """API call: begin the job via a POST request + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + + Args: + client (Optional[google.cloud.bigquery.client.Client]): + The client to use. If not passed, falls back to the ``client`` + associated with the job object or``NoneType``. + retry (Optional[google.api_core.retry.Retry]): + How to retry the RPC. 
+ timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + + Raises: + ValueError: If the job has already begun. + """ + + try: + super(QueryJob, self)._begin(client=client, retry=retry, timeout=timeout) + except exceptions.GoogleAPICallError as exc: + exc.message += self._format_for_exception(self.query, self.job_id) + exc.query_job = self + raise + + def _reload_query_results( + self, retry: "retries.Retry" = DEFAULT_RETRY, timeout: float = None + ): + """Refresh the cached query results. + + Args: + retry (Optional[google.api_core.retry.Retry]): + How to retry the call that retrieves query results. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + """ + if self._query_results and self._query_results.complete: + return + + # Since the API to getQueryResults can hang up to the timeout value + # (default of 10 seconds), set the timeout parameter to ensure that + # the timeout from the futures API is respected. See: + # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4135 + timeout_ms = None + if self._done_timeout is not None: + # Subtract a buffer for context switching, network latency, etc. + api_timeout = self._done_timeout - _TIMEOUT_BUFFER_SECS + api_timeout = max(min(api_timeout, 10), 0) + self._done_timeout -= api_timeout + self._done_timeout = max(0, self._done_timeout) + timeout_ms = int(api_timeout * 1000) + + # If an explicit timeout is not given, fall back to the transport timeout + # stored in _blocking_poll() in the process of polling for job completion. 
+ transport_timeout = timeout if timeout is not None else self._transport_timeout + + self._query_results = self._client._get_query_results( + self.job_id, + retry, + project=self.project, + timeout_ms=timeout_ms, + location=self.location, + timeout=transport_timeout, + ) + + def _done_or_raise(self, retry=DEFAULT_RETRY, timeout=None): + """Check if the query has finished running and raise if it's not. + + If the query has finished, also reload the job itself. + """ + # If an explicit timeout is not given, fall back to the transport timeout + # stored in _blocking_poll() in the process of polling for job completion. + transport_timeout = timeout if timeout is not None else self._transport_timeout + + try: + self._reload_query_results(retry=retry, timeout=transport_timeout) + except exceptions.GoogleAPIError as exc: + # Reloading also updates error details on self, thus no need for an + # explicit self.set_exception() call if reloading succeeds. + try: + self.reload(retry=retry, timeout=transport_timeout) + except exceptions.GoogleAPIError: + # Use the query results reload exception, as it generally contains + # much more useful error information. + self.set_exception(exc) + finally: + return + + # Only reload the job once we know the query is complete. + # This will ensure that fields such as the destination table are + # correctly populated. + if not self._query_results.complete: + raise polling_future._OperationNotComplete() + else: + try: + self.reload(retry=retry, timeout=transport_timeout) + except exceptions.GoogleAPIError as exc: + self.set_exception(exc) + + def result( + self, + page_size: int = None, + max_results: int = None, + retry: "retries.Retry" = DEFAULT_RETRY, + timeout: float = None, + start_index: int = None, + job_retry: "retries.Retry" = DEFAULT_JOB_RETRY, + ) -> Union["RowIterator", _EmptyRowIterator]: + """Start the job and wait for it to complete and get the result. 
+ + Args: + page_size (Optional[int]): + The maximum number of rows in each page of results from this + request. Non-positive values are ignored. + max_results (Optional[int]): + The maximum total number of rows from this request. + retry (Optional[google.api_core.retry.Retry]): + How to retry the call that retrieves rows. This only + applies to making RPC calls. It isn't used to retry + failed jobs. This has a reasonable default that + should only be overridden with care. If the job state + is ``DONE``, retrying is aborted early even if the + results are not available, as this will not change + anymore. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + If multiple requests are made under the hood, ``timeout`` + applies to each individual request. + start_index (Optional[int]): + The zero-based index of the starting row to read. + job_retry (Optional[google.api_core.retry.Retry]): + How to retry failed jobs. The default retries + rate-limit-exceeded errors. Passing ``None`` disables + job retry. + + Not all jobs can be retried. If ``job_id`` was + provided to the query that created this job, then the + job returned by the query will not be retryable, and + an exception will be raised if non-``None`` + non-default ``job_retry`` is also provided. + + Returns: + google.cloud.bigquery.table.RowIterator: + Iterator of row data + :class:`~google.cloud.bigquery.table.Row`-s. During each + page, the iterator will have the ``total_rows`` attribute + set, which counts the total number of rows **in the result + set** (this is distinct from the total number of rows in the + current page: ``iterator.page.num_items``). + + If the query is a special query that produces no results, e.g. + a DDL query, an ``_EmptyRowIterator`` instance is returned. + + Raises: + google.cloud.exceptions.GoogleAPICallError: + If the job failed and retries aren't successful. 
+ concurrent.futures.TimeoutError: + If the job did not complete in the given timeout. + TypeError: + If Non-``None`` and non-default ``job_retry`` is + provided and the job is not retryable. + """ + try: + retry_do_query = getattr(self, "_retry_do_query", None) + if retry_do_query is not None: + if job_retry is DEFAULT_JOB_RETRY: + job_retry = self._job_retry + else: + if job_retry is not None and job_retry is not DEFAULT_JOB_RETRY: + raise TypeError( + "`job_retry` was provided, but this job is" + " not retryable, because a custom `job_id` was" + " provided to the query that created this job." + ) + + first = True + + def do_get_result(): + nonlocal first + + if first: + first = False + else: + # Note that we won't get here if retry_do_query is + # None, because we won't use a retry. + + # The orinal job is failed. Create a new one. + job = retry_do_query() + + # If it's already failed, we might as well stop: + if job.done() and job.exception() is not None: + raise job.exception() + + # Become the new job: + self.__dict__.clear() + self.__dict__.update(job.__dict__) + + # This shouldn't be necessary, because once we have a good + # job, it should stay good,and we shouldn't have to retry. + # But let's be paranoid. :) + self._retry_do_query = retry_do_query + self._job_retry = job_retry + + super(QueryJob, self).result(retry=retry, timeout=timeout) + + # Since the job could already be "done" (e.g. got a finished job + # via client.get_job), the superclass call to done() might not + # set the self._query_results cache. 
+ self._reload_query_results(retry=retry, timeout=timeout) + + if retry_do_query is not None and job_retry is not None: + do_get_result = job_retry(do_get_result) + + do_get_result() + + except exceptions.GoogleAPICallError as exc: + exc.message += self._format_for_exception(self.query, self.job_id) + exc.query_job = self + raise + except requests.exceptions.Timeout as exc: + raise concurrent.futures.TimeoutError from exc + + # If the query job is complete but there are no query results, this was + # special job, such as a DDL query. Return an empty result set to + # indicate success and avoid calling tabledata.list on a table which + # can't be read (such as a view table). + if self._query_results.total_rows is None: + return _EmptyRowIterator() + + rows = self._client._list_rows_from_query_results( + self.job_id, + self.location, + self.project, + self._query_results.schema, + total_rows=self._query_results.total_rows, + destination=self.destination, + page_size=page_size, + max_results=max_results, + start_index=start_index, + retry=retry, + timeout=timeout, + ) + rows._preserve_order = _contains_order_by(self.query) + return rows + + # If changing the signature of this method, make sure to apply the same + # changes to table.RowIterator.to_arrow(), except for the max_results parameter + # that should only exist here in the QueryJob method. + def to_arrow( + self, + progress_bar_type: str = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + create_bqstorage_client: bool = True, + max_results: Optional[int] = None, + ) -> "pyarrow.Table": + """[Beta] Create a class:`pyarrow.Table` by loading all pages of a + table or query. + + Args: + progress_bar_type (Optional[str]): + If set, use the `tqdm `_ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + Possible values of ``progress_bar_type`` include: + + ``None`` + No progress bar. 
+ ``'tqdm'`` + Use the :func:`tqdm.tqdm` function to print a progress bar + to :data:`sys.stderr`. + ``'tqdm_notebook'`` + Use the :func:`tqdm.tqdm_notebook` function to display a + progress bar as a Jupyter notebook widget. + ``'tqdm_gui'`` + Use the :func:`tqdm.tqdm_gui` function to display a + progress bar as a graphical dialog box. + bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): + A BigQuery Storage API client. If supplied, use the faster + BigQuery Storage API to fetch rows from BigQuery. This API + is a billable API. + + This method requires the ``pyarrow`` and + ``google-cloud-bigquery-storage`` libraries. + + Reading from a specific partition or snapshot is not + currently supported by this method. + create_bqstorage_client (Optional[bool]): + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. The BigQuery Storage API + is a faster way to fetch rows from BigQuery. See the + ``bqstorage_client`` parameter for more information. + + This argument does nothing if ``bqstorage_client`` is supplied. + + .. versionadded:: 1.24.0 + + max_results (Optional[int]): + Maximum number of rows to include in the result. No limit by default. + + .. versionadded:: 2.21.0 + + Returns: + pyarrow.Table + A :class:`pyarrow.Table` populated with row data and column + headers from the query results. The column headers are derived + from the destination table's schema. + + Raises: + ValueError: + If the :mod:`pyarrow` library cannot be imported. + + .. 
versionadded:: 1.17.0 + """ + query_result = wait_for_query(self, progress_bar_type, max_results=max_results) + return query_result.to_arrow( + progress_bar_type=progress_bar_type, + bqstorage_client=bqstorage_client, + create_bqstorage_client=create_bqstorage_client, + ) + + # If changing the signature of this method, make sure to apply the same + # changes to table.RowIterator.to_dataframe(), except for the max_results parameter + # that should only exist here in the QueryJob method. + def to_dataframe( + self, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + progress_bar_type: str = None, + create_bqstorage_client: bool = True, + date_as_object: bool = True, + max_results: Optional[int] = None, + geography_as_object: bool = False, + ) -> "pandas.DataFrame": + """Return a pandas DataFrame from a QueryJob + + Args: + bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): + A BigQuery Storage API client. If supplied, use the faster + BigQuery Storage API to fetch rows from BigQuery. This + API is a billable API. + + This method requires the ``fastavro`` and + ``google-cloud-bigquery-storage`` libraries. + + Reading from a specific partition or snapshot is not + currently supported by this method. + + dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): + A dictionary of column names pandas ``dtype``s. The provided + ``dtype`` is used when constructing the series for the column + specified. Otherwise, the default pandas behavior is used. + + progress_bar_type (Optional[str]): + If set, use the `tqdm `_ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + See + :func:`~google.cloud.bigquery.table.RowIterator.to_dataframe` + for details. + + .. versionadded:: 1.11.0 + create_bqstorage_client (Optional[bool]): + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. 
The BigQuery Storage API + is a faster way to fetch rows from BigQuery. See the + ``bqstorage_client`` parameter for more information. + + This argument does nothing if ``bqstorage_client`` is supplied. + + .. versionadded:: 1.24.0 + + date_as_object (Optional[bool]): + If ``True`` (default), cast dates to objects. If ``False``, convert + to datetime64[ns] dtype. + + .. versionadded:: 1.26.0 + + max_results (Optional[int]): + Maximum number of rows to include in the result. No limit by default. + + .. versionadded:: 2.21.0 + + geography_as_object (Optional[bool]): + If ``True``, convert GEOGRAPHY data to :mod:`shapely` + geometry objects. If ``False`` (default), don't cast + geography data to :mod:`shapely` geometry objects. + + .. versionadded:: 2.24.0 + + Returns: + pandas.DataFrame: + A :class:`~pandas.DataFrame` populated with row data + and column headers from the query results. The column + headers are derived from the destination table's + schema. + + Raises: + ValueError: + If the :mod:`pandas` library cannot be imported, or + the :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. Also if + `geography_as_object` is `True`, but the + :mod:`shapely` library cannot be imported. + """ + query_result = wait_for_query(self, progress_bar_type, max_results=max_results) + return query_result.to_dataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_as_object=geography_as_object, + ) + + # If changing the signature of this method, make sure to apply the same + # changes to table.RowIterator.to_dataframe(), except for the max_results parameter + # that should only exist here in the QueryJob method. 
+ def to_geodataframe( + self, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + progress_bar_type: str = None, + create_bqstorage_client: bool = True, + date_as_object: bool = True, + max_results: Optional[int] = None, + geography_column: Optional[str] = None, + ) -> "geopandas.GeoDataFrame": + """Return a GeoPandas GeoDataFrame from a QueryJob + + Args: + bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): + A BigQuery Storage API client. If supplied, use the faster + BigQuery Storage API to fetch rows from BigQuery. This + API is a billable API. + + This method requires the ``fastavro`` and + ``google-cloud-bigquery-storage`` libraries. + + Reading from a specific partition or snapshot is not + currently supported by this method. + + dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): + A dictionary of column names pandas ``dtype``s. The provided + ``dtype`` is used when constructing the series for the column + specified. Otherwise, the default pandas behavior is used. + + progress_bar_type (Optional[str]): + If set, use the `tqdm `_ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + See + :func:`~google.cloud.bigquery.table.RowIterator.to_dataframe` + for details. + + .. versionadded:: 1.11.0 + create_bqstorage_client (Optional[bool]): + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. The BigQuery Storage API + is a faster way to fetch rows from BigQuery. See the + ``bqstorage_client`` parameter for more information. + + This argument does nothing if ``bqstorage_client`` is supplied. + + .. versionadded:: 1.24.0 + + date_as_object (Optional[bool]): + If ``True`` (default), cast dates to objects. If ``False``, convert + to datetime64[ns] dtype. + + .. versionadded:: 1.26.0 + + max_results (Optional[int]): + Maximum number of rows to include in the result. 
No limit by default. + + .. versionadded:: 2.21.0 + + geography_column (Optional[str]): + If there are more than one GEOGRAPHY column, + identifies which one to use to construct a GeoPandas + GeoDataFrame. This option can be ommitted if there's + only one GEOGRAPHY column. + + Returns: + geopandas.GeoDataFrame: + A :class:`geopandas.GeoDataFrame` populated with row + data and column headers from the query results. The + column headers are derived from the destination + table's schema. + + Raises: + ValueError: + If the :mod:`geopandas` library cannot be imported, or the + :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. + + .. versionadded:: 2.24.0 + """ + query_result = wait_for_query(self, progress_bar_type, max_results=max_results) + return query_result.to_geodataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_column=geography_column, + ) + + def __iter__(self): + return iter(self.result()) + + +class QueryPlanEntryStep(object): + """Map a single step in a query plan entry. + + Args: + kind (str): step type. + substeps (List): names of substeps. + """ + + def __init__(self, kind, substeps): + self.kind = kind + self.substeps = list(substeps) + + @classmethod + def from_api_repr(cls, resource: dict) -> "QueryPlanEntryStep": + """Factory: construct instance from the JSON repr. + + Args: + resource (Dict): JSON representation of the entry. + + Returns: + google.cloud.bigquery.job.QueryPlanEntryStep: + New instance built from the resource. 
+ """ + return cls(kind=resource.get("kind"), substeps=resource.get("substeps", ())) + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return self.kind == other.kind and self.substeps == other.substeps + + +class QueryPlanEntry(object): + """QueryPlanEntry represents a single stage of a query execution plan. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#ExplainQueryStage + for the underlying API representation within query statistics. + """ + + def __init__(self): + self._properties = {} + + @classmethod + def from_api_repr(cls, resource: dict) -> "QueryPlanEntry": + """Factory: construct instance from the JSON repr. + + Args: + resource(Dict[str: object]): + ExplainQueryStage representation returned from API. + + Returns: + google.cloud.bigquery.job.QueryPlanEntry: + Query plan entry parsed from ``resource``. + """ + entry = cls() + entry._properties = resource + return entry + + @property + def name(self): + """Optional[str]: Human-readable name of the stage.""" + return self._properties.get("name") + + @property + def entry_id(self): + """Optional[str]: Unique ID for the stage within the plan.""" + return self._properties.get("id") + + @property + def start(self): + """Optional[Datetime]: Datetime when the stage started.""" + if self._properties.get("startMs") is None: + return None + return _helpers._datetime_from_microseconds( + int(self._properties.get("startMs")) * 1000.0 + ) + + @property + def end(self): + """Optional[Datetime]: Datetime when the stage ended.""" + if self._properties.get("endMs") is None: + return None + return _helpers._datetime_from_microseconds( + int(self._properties.get("endMs")) * 1000.0 + ) + + @property + def input_stages(self): + """List(int): Entry IDs for stages that were inputs for this stage.""" + if self._properties.get("inputStages") is None: + return [] + return [ + _helpers._int_or_none(entry) + for entry in self._properties.get("inputStages") + ] + 
+ @property + def parallel_inputs(self): + """Optional[int]: Number of parallel input segments within + the stage. + """ + return _helpers._int_or_none(self._properties.get("parallelInputs")) + + @property + def completed_parallel_inputs(self): + """Optional[int]: Number of parallel input segments completed.""" + return _helpers._int_or_none(self._properties.get("completedParallelInputs")) + + @property + def wait_ms_avg(self): + """Optional[int]: Milliseconds the average worker spent waiting to + be scheduled. + """ + return _helpers._int_or_none(self._properties.get("waitMsAvg")) + + @property + def wait_ms_max(self): + """Optional[int]: Milliseconds the slowest worker spent waiting to + be scheduled. + """ + return _helpers._int_or_none(self._properties.get("waitMsMax")) + + @property + def wait_ratio_avg(self): + """Optional[float]: Ratio of time the average worker spent waiting + to be scheduled, relative to the longest time spent by any worker in + any stage of the overall plan. + """ + return self._properties.get("waitRatioAvg") + + @property + def wait_ratio_max(self): + """Optional[float]: Ratio of time the slowest worker spent waiting + to be scheduled, relative to the longest time spent by any worker in + any stage of the overall plan. + """ + return self._properties.get("waitRatioMax") + + @property + def read_ms_avg(self): + """Optional[int]: Milliseconds the average worker spent reading + input. + """ + return _helpers._int_or_none(self._properties.get("readMsAvg")) + + @property + def read_ms_max(self): + """Optional[int]: Milliseconds the slowest worker spent reading + input. + """ + return _helpers._int_or_none(self._properties.get("readMsMax")) + + @property + def read_ratio_avg(self): + """Optional[float]: Ratio of time the average worker spent reading + input, relative to the longest time spent by any worker in any stage + of the overall plan. 
+ """ + return self._properties.get("readRatioAvg") + + @property + def read_ratio_max(self): + """Optional[float]: Ratio of time the slowest worker spent reading + to be scheduled, relative to the longest time spent by any worker in + any stage of the overall plan. + """ + return self._properties.get("readRatioMax") + + @property + def compute_ms_avg(self): + """Optional[int]: Milliseconds the average worker spent on CPU-bound + processing. + """ + return _helpers._int_or_none(self._properties.get("computeMsAvg")) + + @property + def compute_ms_max(self): + """Optional[int]: Milliseconds the slowest worker spent on CPU-bound + processing. + """ + return _helpers._int_or_none(self._properties.get("computeMsMax")) + + @property + def compute_ratio_avg(self): + """Optional[float]: Ratio of time the average worker spent on + CPU-bound processing, relative to the longest time spent by any + worker in any stage of the overall plan. + """ + return self._properties.get("computeRatioAvg") + + @property + def compute_ratio_max(self): + """Optional[float]: Ratio of time the slowest worker spent on + CPU-bound processing, relative to the longest time spent by any + worker in any stage of the overall plan. + """ + return self._properties.get("computeRatioMax") + + @property + def write_ms_avg(self): + """Optional[int]: Milliseconds the average worker spent writing + output data. + """ + return _helpers._int_or_none(self._properties.get("writeMsAvg")) + + @property + def write_ms_max(self): + """Optional[int]: Milliseconds the slowest worker spent writing + output data. + """ + return _helpers._int_or_none(self._properties.get("writeMsMax")) + + @property + def write_ratio_avg(self): + """Optional[float]: Ratio of time the average worker spent writing + output data, relative to the longest time spent by any worker in any + stage of the overall plan. 
+ """ + return self._properties.get("writeRatioAvg") + + @property + def write_ratio_max(self): + """Optional[float]: Ratio of time the slowest worker spent writing + output data, relative to the longest time spent by any worker in any + stage of the overall plan. + """ + return self._properties.get("writeRatioMax") + + @property + def records_read(self): + """Optional[int]: Number of records read by this stage.""" + return _helpers._int_or_none(self._properties.get("recordsRead")) + + @property + def records_written(self): + """Optional[int]: Number of records written by this stage.""" + return _helpers._int_or_none(self._properties.get("recordsWritten")) + + @property + def status(self): + """Optional[str]: status of this stage.""" + return self._properties.get("status") + + @property + def shuffle_output_bytes(self): + """Optional[int]: Number of bytes written by this stage to + intermediate shuffle. + """ + return _helpers._int_or_none(self._properties.get("shuffleOutputBytes")) + + @property + def shuffle_output_bytes_spilled(self): + """Optional[int]: Number of bytes written by this stage to + intermediate shuffle and spilled to disk. + """ + return _helpers._int_or_none(self._properties.get("shuffleOutputBytesSpilled")) + + @property + def steps(self): + """List(QueryPlanEntryStep): List of step operations performed by + each worker in the stage. + """ + return [ + QueryPlanEntryStep.from_api_repr(step) + for step in self._properties.get("steps", []) + ] + + +class TimelineEntry(object): + """TimelineEntry represents progress of a query job at a particular + point in time. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#querytimelinesample + for the underlying API representation within query statistics. + """ + + def __init__(self): + self._properties = {} + + @classmethod + def from_api_repr(cls, resource): + """Factory: construct instance from the JSON repr. 
+ + Args: + resource(Dict[str: object]): + QueryTimelineSample representation returned from API. + + Returns: + google.cloud.bigquery.TimelineEntry: + Timeline sample parsed from ``resource``. + """ + entry = cls() + entry._properties = resource + return entry + + @property + def elapsed_ms(self): + """Optional[int]: Milliseconds elapsed since start of query + execution.""" + return _helpers._int_or_none(self._properties.get("elapsedMs")) + + @property + def active_units(self): + """Optional[int]: Current number of input units being processed + by workers, reported as largest value since the last sample.""" + return _helpers._int_or_none(self._properties.get("activeUnits")) + + @property + def pending_units(self): + """Optional[int]: Current number of input units remaining for + query stages active at this sample time.""" + return _helpers._int_or_none(self._properties.get("pendingUnits")) + + @property + def completed_units(self): + """Optional[int]: Current number of input units completed by + this query.""" + return _helpers._int_or_none(self._properties.get("completedUnits")) + + @property + def slot_millis(self): + """Optional[int]: Cumulative slot-milliseconds consumed by + this query.""" + return _helpers._int_or_none(self._properties.get("totalSlotMs")) diff --git a/google/cloud/bigquery/magics/line_arg_parser/lexer.py b/google/cloud/bigquery/magics/line_arg_parser/lexer.py index 17e1ffdae..cd809c389 100644 --- a/google/cloud/bigquery/magics/line_arg_parser/lexer.py +++ b/google/cloud/bigquery/magics/line_arg_parser/lexer.py @@ -49,127 +49,59 @@ # the value of an option other than "--params", we do not really care about its # structure, and thus do not want to use any of the "Python tokens" for pattern matching. # -# Since token definition order is important, an OrderedDict is needed with tightly -# controlled member definitions (i.e. passed as a sequence, and *not* via kwargs). +# Token definition order is important, thus an OrderedDict is used. 
In addition, PEP 468 +# guarantees us that the order of kwargs is preserved in Python 3.6+. token_types = OrderedDict( - [ - ( - "state_parse_pos_args", - OrderedDict( - [ - ( - "GOTO_PARSE_NON_PARAMS_OPTIONS", - r"(?P(?=--))", # double dash - starting the options list - ), - ( - "DEST_VAR", - r"(?P[^\d\W]\w*)", # essentially a Python ID - ), - ] - ), - ), - ( - "state_parse_non_params_options", - OrderedDict( - [ - ( - "GOTO_PARSE_PARAMS_OPTION", - r"(?P(?=--params(?:\s|=|--|$)))", # the --params option - ), - ("OPTION_SPEC", r"(?P--\w+)"), - ("OPTION_EQ", r"(?P=)"), - ("OPT_VAL", r"(?P\S+?(?=\s|--|$))"), - ] - ), - ), - ( - "state_parse_params_option", - OrderedDict( - [ - ( - "PY_STRING", - r"(?P(?:{})|(?:{}))".format( - r"'(?:[^'\\]|\.)*'", - r'"(?:[^"\\]|\.)*"', # single and double quoted strings - ), - ), - ("PARAMS_OPT_SPEC", r"(?P--params(?=\s|=|--|$))"), - ("PARAMS_OPT_EQ", r"(?P=)"), - ( - "GOTO_PARSE_NON_PARAMS_OPTIONS", - r"(?P(?=--\w+))", # found another option spec - ), - ("PY_BOOL", r"(?PTrue|False)"), - ("DOLLAR_PY_ID", r"(?P\$[^\d\W]\w*)"), - ( - "PY_NUMBER", - r"(?P-?[1-9]\d*(?:\.\d+)?(:?[e|E][+-]?\d+)?)", - ), - ("SQUOTE", r"(?P')"), - ("DQUOTE", r'(?P")'), - ("COLON", r"(?P:)"), - ("COMMA", r"(?P,)"), - ("LCURL", r"(?P\{)"), - ("RCURL", r"(?P})"), - ("LSQUARE", r"(?P\[)"), - ("RSQUARE", r"(?P])"), - ("LPAREN", r"(?P\()"), - ("RPAREN", r"(?P\))"), - ] - ), + state_parse_pos_args=OrderedDict( + GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P(?=--))", # double dash - starting the options list + DEST_VAR=r"(?P[^\d\W]\w*)", # essentially a Python ID + ), + state_parse_non_params_options=OrderedDict( + GOTO_PARSE_PARAMS_OPTION=r"(?P(?=--params(?:\s|=|--|$)))", # the --params option + OPTION_SPEC=r"(?P--\w+)", + OPTION_EQ=r"(?P=)", + OPT_VAL=r"(?P\S+?(?=\s|--|$))", + ), + state_parse_params_option=OrderedDict( + PY_STRING=r"(?P(?:{})|(?:{}))".format( # single and double quoted strings + r"'(?:[^'\\]|\.)*'", r'"(?:[^"\\]|\.)*"' ), - ( - "common", - OrderedDict( - [ 
- ("WS", r"(?P\s+)"), - ("EOL", r"(?P$)"), - ( - # anything not a whitespace or matched by something else - "UNKNOWN", - r"(?P\S+)", - ), - ] - ), - ), - ] + PARAMS_OPT_SPEC=r"(?P--params(?=\s|=|--|$))", + PARAMS_OPT_EQ=r"(?P=)", + GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P(?=--\w+))", # found another option spec + PY_BOOL=r"(?PTrue|False)", + DOLLAR_PY_ID=r"(?P\$[^\d\W]\w*)", + PY_NUMBER=r"(?P-?[1-9]\d*(?:\.\d+)?(:?[e|E][+-]?\d+)?)", + SQUOTE=r"(?P')", + DQUOTE=r'(?P")', + COLON=r"(?P:)", + COMMA=r"(?P,)", + LCURL=r"(?P\{)", + RCURL=r"(?P})", + LSQUARE=r"(?P\[)", + RSQUARE=r"(?P])", + LPAREN=r"(?P\()", + RPAREN=r"(?P\))", + ), + common=OrderedDict( + WS=r"(?P\s+)", + EOL=r"(?P$)", + UNKNOWN=r"(?P\S+)", # anything not a whitespace or matched by something else + ), ) -# The _generate_next_value_() enum hook is only available in Python 3.6+, thus we -# need to do some acrobatics to implement an "auto str enum" base class. Implementation -# based on the recipe provided by the very author of the Enum library: -# https://stackoverflow.com/a/32313954/5040035 -class StrEnumMeta(enum.EnumMeta): - @classmethod - def __prepare__(metacls, name, bases, **kwargs): - # Having deterministic enum members definition order is nice. - return OrderedDict() - - def __new__(metacls, name, bases, oldclassdict): - # Scan through the declared enum members and convert any value that is a plain - # empty tuple into a `str` of the name instead. 
- newclassdict = enum._EnumDict() - for key, val in oldclassdict.items(): - if val == (): - val = key - newclassdict[key] = val - return super(StrEnumMeta, metacls).__new__(metacls, name, bases, newclassdict) +class AutoStrEnum(str, enum.Enum): + """Base enum class for for name=value str enums.""" + def _generate_next_value_(name, start, count, last_values): + return name -# The @six.add_metaclass decorator does not work, Enum complains about _sunder_ names, -# and we cannot use class syntax directly, because the Python 3 version would cause -# a syntax error under Python 2. -AutoStrEnum = StrEnumMeta( - "AutoStrEnum", - (str, enum.Enum), - {"__doc__": "Base enum class for for name=value str enums."}, -) -TokenType = AutoStrEnum( +TokenType = AutoStrEnum( # pytype: disable=wrong-arg-types "TokenType", [ - (name, name) + (name, enum.auto()) for name in itertools.chain.from_iterable(token_types.values()) if not name.startswith("GOTO_") ], @@ -177,10 +109,10 @@ def __new__(metacls, name, bases, oldclassdict): class LexerState(AutoStrEnum): - PARSE_POS_ARGS = () # parsing positional arguments - PARSE_NON_PARAMS_OPTIONS = () # parsing options other than "--params" - PARSE_PARAMS_OPTION = () # parsing the "--params" option - STATE_END = () + PARSE_POS_ARGS = enum.auto() # parsing positional arguments + PARSE_NON_PARAMS_OPTIONS = enum.auto() # parsing options other than "--params" + PARSE_PARAMS_OPTION = enum.auto() # parsing the "--params" option + STATE_END = enum.auto() class Lexer(object): diff --git a/google/cloud/bigquery/magics/magics.py b/google/cloud/bigquery/magics/magics.py index 4842c7680..d368bbeaa 100644 --- a/google/cloud/bigquery/magics/magics.py +++ b/google/cloud/bigquery/magics/magics.py @@ -14,6 +14,15 @@ """IPython Magics +To use these magics, you must first register them. Run the ``%load_ext`` magic +in a Jupyter notebook cell. + +.. code:: + + %load_ext google.cloud.bigquery + +This makes the ``%%bigquery`` magic available. + .. 
function:: %%bigquery IPython cell magic to run a query and display the result as a DataFrame @@ -139,6 +148,7 @@ import re import ast +import copy import functools import sys import time @@ -152,9 +162,8 @@ except ImportError: # pragma: NO COVER raise ImportError("This module can only be loaded in IPython.") -import six - from google.api_core import client_info +from google.api_core import client_options from google.api_core.exceptions import NotFound import google.auth from google.cloud import bigquery @@ -178,11 +187,14 @@ def __init__(self): self._project = None self._connection = None self._default_query_job_config = bigquery.QueryJobConfig() + self._bigquery_client_options = client_options.ClientOptions() + self._bqstorage_client_options = client_options.ClientOptions() + self._progress_bar_type = "tqdm" @property def credentials(self): """google.auth.credentials.Credentials: Credentials to use for queries - performed through IPython magics + performed through IPython magics. Note: These credentials do not need to be explicitly defined if you are @@ -217,7 +229,7 @@ def credentials(self, value): @property def project(self): """str: Default project to use for queries performed through IPython - magics + magics. Note: The project does not need to be explicitly defined if you have an @@ -239,6 +251,54 @@ def project(self): def project(self, value): self._project = value + @property + def bigquery_client_options(self): + """google.api_core.client_options.ClientOptions: client options to be + used through IPython magics. + + Note:: + The client options do not need to be explicitly defined if no + special network connections are required. Normally you would be + using the https://bigquery.googleapis.com/ end point. 
+ + Example: + Manually setting the endpoint: + + >>> from google.cloud.bigquery import magics + >>> client_options = {} + >>> client_options['api_endpoint'] = "https://some.special.url" + >>> magics.context.bigquery_client_options = client_options + """ + return self._bigquery_client_options + + @bigquery_client_options.setter + def bigquery_client_options(self, value): + self._bigquery_client_options = value + + @property + def bqstorage_client_options(self): + """google.api_core.client_options.ClientOptions: client options to be + used through IPython magics for the storage client. + + Note:: + The client options do not need to be explicitly defined if no + special network connections are required. Normally you would be + using the https://bigquerystorage.googleapis.com/ end point. + + Example: + Manually setting the endpoint: + + >>> from google.cloud.bigquery import magics + >>> client_options = {} + >>> client_options['api_endpoint'] = "https://some.special.url" + >>> magics.context.bqstorage_client_options = client_options + """ + return self._bqstorage_client_options + + @bqstorage_client_options.setter + def bqstorage_client_options(self, value): + self._bqstorage_client_options = value + @property def default_query_job_config(self): """google.cloud.bigquery.job.QueryJobConfig: Default job @@ -261,6 +321,26 @@ def default_query_job_config(self): def default_query_job_config(self, value): self._default_query_job_config = value + @property + def progress_bar_type(self): + """str: Default progress bar type to use to display progress bar while + executing queries through IPython magics. + + Note:: + Install the ``tqdm`` package to use this feature. 
+ + Example: + Manually setting the progress_bar_type: + + >>> from google.cloud.bigquery import magics + >>> magics.context.progress_bar_type = "tqdm" + """ + return self._progress_bar_type + + @progress_bar_type.setter + def progress_bar_type(self, value): + self._progress_bar_type = value + context = Context() @@ -410,6 +490,24 @@ def _create_dataset_if_necessary(client, dataset_id): "Standard SQL if this argument is not used." ), ) +@magic_arguments.argument( + "--bigquery_api_endpoint", + type=str, + default=None, + help=( + "The desired API endpoint, e.g., bigquery.googlepis.com. Defaults to this " + "option's value in the context bigquery_client_options." + ), +) +@magic_arguments.argument( + "--bqstorage_api_endpoint", + type=str, + default=None, + help=( + "The desired API endpoint, e.g., bigquerystorage.googlepis.com. Defaults to " + "this option's value in the context bqstorage_client_options." + ), +) @magic_arguments.argument( "--use_bqstorage_api", action="store_true", @@ -454,6 +552,15 @@ def _create_dataset_if_necessary(client, dataset_id): "name (ex. $my_dict_var)." ), ) +@magic_arguments.argument( + "--progress_bar_type", + type=str, + default=None, + help=( + "Sets progress bar type to display a progress bar while executing the query." + "Defaults to use tqdm. Install the ``tqdm`` package to use this feature." + ), +) def _cell_magic(line, query): """Underlying function for bigquery cell magic @@ -477,16 +584,16 @@ def _cell_magic(line, query): "--params is not a correctly formatted JSON string or a JSON " "serializable dictionary" ) - six.raise_from(rebranded_error, exc) + raise rebranded_error from exc except lap.exceptions.DuplicateQueryParamsError as exc: rebranded_error = ValueError("Duplicate --params option.") - six.raise_from(rebranded_error, exc) + raise rebranded_error from exc except lap.exceptions.ParseError as exc: rebranded_error = ValueError( "Unrecognized input, are option values correct? 
" "Error details: {}".format(exc.args[0]) ) - six.raise_from(rebranded_error, exc) + raise rebranded_error from exc args = magic_arguments.parse_argstring(_cell_magic, rest_of_args) @@ -508,18 +615,37 @@ def _cell_magic(line, query): ) raise NameError(msg) - params = _helpers.to_query_parameters(ast.literal_eval(params_option_value)) + params = _helpers.to_query_parameters(ast.literal_eval(params_option_value), {}) project = args.project or context.project + + bigquery_client_options = copy.deepcopy(context.bigquery_client_options) + if args.bigquery_api_endpoint: + if isinstance(bigquery_client_options, dict): + bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint + else: + bigquery_client_options.api_endpoint = args.bigquery_api_endpoint + client = bigquery.Client( project=project, credentials=context.credentials, default_query_job_config=context.default_query_job_config, client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT), + client_options=bigquery_client_options, ) if context._connection: client._connection = context._connection - bqstorage_client = _make_bqstorage_client(use_bqstorage_api, context.credentials) + + bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options) + if args.bqstorage_api_endpoint: + if isinstance(bqstorage_client_options, dict): + bqstorage_client_options["api_endpoint"] = args.bqstorage_api_endpoint + else: + bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint + + bqstorage_client = _make_bqstorage_client( + client, use_bqstorage_api, bqstorage_client_options, + ) close_transports = functools.partial(_close_transports, client, bqstorage_client) @@ -545,7 +671,9 @@ def _cell_magic(line, query): _handle_error(ex, args.destination_var) return - result = rows.to_dataframe(bqstorage_client=bqstorage_client) + result = rows.to_dataframe( + bqstorage_client=bqstorage_client, create_bqstorage_client=False, + ) if args.destination_var: 
IPython.get_ipython().push({args.destination_var: result}) return @@ -598,12 +726,20 @@ def _cell_magic(line, query): ) return query_job + progress_bar = context.progress_bar_type or args.progress_bar_type + if max_results: result = query_job.result(max_results=max_results).to_dataframe( - bqstorage_client=bqstorage_client + bqstorage_client=None, + create_bqstorage_client=False, + progress_bar_type=progress_bar, ) else: - result = query_job.to_dataframe(bqstorage_client=bqstorage_client) + result = query_job.to_dataframe( + bqstorage_client=bqstorage_client, + create_bqstorage_client=False, + progress_bar_type=progress_bar, + ) if args.destination_var: IPython.get_ipython().push({args.destination_var: result}) @@ -632,12 +768,12 @@ def _split_args_line(line): return params_option_value, rest_of_args -def _make_bqstorage_client(use_bqstorage_api, credentials): +def _make_bqstorage_client(client, use_bqstorage_api, client_options): if not use_bqstorage_api: return None try: - from google.cloud import bigquery_storage_v1 + from google.cloud import bigquery_storage # noqa: F401 except ImportError as err: customized_error = ImportError( "The default BigQuery Storage API client cannot be used, install " @@ -645,7 +781,7 @@ def _make_bqstorage_client(use_bqstorage_api, credentials): "to use it. Alternatively, use the classic REST API by specifying " "the --use_rest_api magic option." ) - six.raise_from(customized_error, err) + raise customized_error from err try: from google.api_core.gapic_v1 import client_info as gapic_client_info @@ -653,10 +789,10 @@ def _make_bqstorage_client(use_bqstorage_api, credentials): customized_error = ImportError( "Install the grpcio package to use the BigQuery Storage API." 
) - six.raise_from(customized_error, err) + raise customized_error from err - return bigquery_storage_v1.BigQueryReadClient( - credentials=credentials, + return client._ensure_bqstorage_client( + client_options=client_options, client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT), ) @@ -670,10 +806,10 @@ def _close_transports(client, bqstorage_client): Args: client (:class:`~google.cloud.bigquery.client.Client`): bqstorage_client - (Optional[:class:`~google.cloud.bigquery_storage_v1.BigQueryReadClient`]): + (Optional[:class:`~google.cloud.bigquery_storage.BigQueryReadClient`]): A client for the BigQuery Storage API. """ client.close() if bqstorage_client is not None: - bqstorage_client.transport.channel.close() + bqstorage_client._transport.grpc_channel.close() diff --git a/google/cloud/bigquery/model.py b/google/cloud/bigquery/model.py index d3fe8a937..2d3f6660f 100644 --- a/google/cloud/bigquery/model.py +++ b/google/cloud/bigquery/model.py @@ -19,7 +19,6 @@ import copy from google.protobuf import json_format -import six import google.cloud._helpers from google.api_core import datetime_helpers @@ -55,7 +54,7 @@ class Model(object): def __init__(self, model_ref): # Use _proto on read-only properties to use it's built-in type # conversion. - self._proto = types.Model() + self._proto = types.Model()._pb # Use _properties on read-write properties to match the REST API # semantics. The BigQuery API makes a distinction between an unset @@ -63,7 +62,7 @@ def __init__(self, model_ref): # buffer classes do not. self._properties = {} - if isinstance(model_ref, six.string_types): + if isinstance(model_ref, str): model_ref = ModelReference.from_string(model_ref) if model_ref: @@ -151,13 +150,13 @@ def modified(self): @property def model_type(self): - """google.cloud.bigquery_v2.gapic.enums.Model.ModelType: Type of the + """google.cloud.bigquery_v2.types.Model.ModelType: Type of the model resource. Read-only. 
The value is one of elements of the - :class:`~google.cloud.bigquery_v2.gapic.enums.Model.ModelType` + :class:`~google.cloud.bigquery_v2.types.Model.ModelType` enumeration. """ return self._proto.model_type @@ -280,7 +279,7 @@ def encryption_configuration(self, value): self._properties["encryptionConfiguration"] = api_repr @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "Model": """Factory: construct a model resource given its API representation Args: @@ -305,9 +304,15 @@ def from_api_repr(cls, resource): start_time = datetime_helpers.from_microseconds(1e3 * float(start_time)) training_run["startTime"] = datetime_helpers.to_rfc3339(start_time) - this._proto = json_format.ParseDict( - resource, types.Model(), ignore_unknown_fields=True - ) + try: + this._proto = json_format.ParseDict( + resource, types.Model()._pb, ignore_unknown_fields=True + ) + except json_format.ParseError: + resource["modelType"] = "MODEL_TYPE_UNSPECIFIED" + this._proto = json_format.ParseDict( + resource, types.Model()._pb, ignore_unknown_fields=True + ) return this def _build_resource(self, filter_fields): @@ -317,6 +322,14 @@ def _build_resource(self, filter_fields): def __repr__(self): return "Model(reference={})".format(repr(self.reference)) + def to_api_repr(self) -> dict: + """Construct the API resource representation of this model. + + Returns: + Dict[str, object]: Model reference represented as an API resource + """ + return json_format.MessageToDict(self._proto) + class ModelReference(object): """ModelReferences are pointers to models. @@ -326,7 +339,7 @@ class ModelReference(object): """ def __init__(self): - self._proto = types.ModelReference() + self._proto = types.ModelReference()._pb self._properties = {} @property @@ -370,12 +383,15 @@ def from_api_repr(cls, resource): # field values. 
ref._properties = resource ref._proto = json_format.ParseDict( - resource, types.ModelReference(), ignore_unknown_fields=True + resource, types.ModelReference()._pb, ignore_unknown_fields=True ) + return ref @classmethod - def from_string(cls, model_id, default_project=None): + def from_string( + cls, model_id: str, default_project: str = None + ) -> "ModelReference": """Construct a model reference from model ID string. Args: @@ -403,7 +419,7 @@ def from_string(cls, model_id, default_project=None): {"projectId": proj, "datasetId": dset, "modelId": model} ) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this model reference. Returns: @@ -440,7 +456,7 @@ def _model_arg_to_model_ref(value, default_project=None): This function keeps ModelReference and other kinds of objects unchanged. """ - if isinstance(value, six.string_types): + if isinstance(value, str): return ModelReference.from_string(value, default_project=default_project) if isinstance(value, Model): return value.reference diff --git a/google/cloud/bigquery/opentelemetry_tracing.py b/google/cloud/bigquery/opentelemetry_tracing.py index f7375c346..57f258ac4 100644 --- a/google/cloud/bigquery/opentelemetry_tracing.py +++ b/google/cloud/bigquery/opentelemetry_tracing.py @@ -23,16 +23,11 @@ from opentelemetry.trace.status import Status HAS_OPENTELEMETRY = True + _warned_telemetry = True except ImportError: - logger.info( - "This service is instrumented using OpenTelemetry." - "OpenTelemetry could not be imported; please" - "add opentelemetry-api and opentelemetry-instrumentation" - "packages in order to get BigQuery Tracing data." - ) - HAS_OPENTELEMETRY = False + _warned_telemetry = False _default_attributes = { "db.system": "BigQuery" @@ -64,8 +59,18 @@ def create_span(name, attributes=None, client=None, job_ref=None): Raised if a span could not be yielded or issue with call to OpenTelemetry. 
""" + global _warned_telemetry final_attributes = _get_final_span_attributes(attributes, client, job_ref) if not HAS_OPENTELEMETRY: + if not _warned_telemetry: + logger.debug( + "This service is instrumented using OpenTelemetry. " + "OpenTelemetry could not be imported; please " + "add opentelemetry-api and opentelemetry-instrumentation " + "packages in order to get BigQuery Tracing data." + ) + _warned_telemetry = True + yield None return tracer = trace.get_tracer(__name__) diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index f2ed6337e..1f449f189 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -16,6 +16,9 @@ from collections import OrderedDict import copy +import datetime +import decimal +from typing import Optional, Union from google.cloud.bigquery.table import _parse_schema_resource from google.cloud.bigquery._helpers import _rows_from_json @@ -23,6 +26,11 @@ from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM +_SCALAR_VALUE_TYPE = Optional[ + Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date] +] + + class UDFResource(object): """Describe a single user-defined function (UDF) resource. @@ -48,12 +56,259 @@ def __ne__(self, other): return not self == other +class _AbstractQueryParameterType: + """Base class for representing query parameter types. + + https://cloud.google.com/bigquery/docs/reference/rest/v2/QueryParameter#queryparametertype + """ + + @classmethod + def from_api_repr(cls, resource): + """Factory: construct parameter type from JSON resource. + + Args: + resource (Dict): JSON mapping of parameter + + Returns: + google.cloud.bigquery.query.QueryParameterType: Instance + """ + raise NotImplementedError + + def to_api_repr(self): + """Construct JSON API representation for the parameter type. 
+ + Returns: + Dict: JSON mapping + """ + raise NotImplementedError + + +class ScalarQueryParameterType(_AbstractQueryParameterType): + """Type representation for scalar query parameters. + + Args: + type_ (str): + One of 'STRING', 'INT64', 'FLOAT64', 'NUMERIC', 'BOOL', 'TIMESTAMP', + 'DATETIME', or 'DATE'. + name (Optional[str]): + The name of the query parameter. Primarily used if the type is + one of the subfields in ``StructQueryParameterType`` instance. + description (Optional[str]): + The query parameter description. Primarily used if the type is + one of the subfields in ``StructQueryParameterType`` instance. + """ + + def __init__(self, type_, *, name=None, description=None): + self._type = type_ + self.name = name + self.description = description + + @classmethod + def from_api_repr(cls, resource): + """Factory: construct parameter type from JSON resource. + + Args: + resource (Dict): JSON mapping of parameter + + Returns: + google.cloud.bigquery.query.ScalarQueryParameterType: Instance + """ + type_ = resource["type"] + return cls(type_) + + def to_api_repr(self): + """Construct JSON API representation for the parameter type. + + Returns: + Dict: JSON mapping + """ + # Name and description are only used if the type is a field inside a struct + # type, but it's StructQueryParameterType's responsibilty to use these two + # attributes in the API representation when needed. Here we omit them. + return {"type": self._type} + + def with_name(self, new_name: Union[str, None]): + """Return a copy of the instance with ``name`` set to ``new_name``. + + Args: + name (Union[str, None]): + The new name of the query parameter type. If ``None``, the existing + name is cleared. + + Returns: + google.cloud.bigquery.query.ScalarQueryParameterType: + A new instance with updated name. 
+ """ + return type(self)(self._type, name=new_name, description=self.description) + + def __repr__(self): + name = f", name={self.name!r}" if self.name is not None else "" + description = ( + f", description={self.description!r}" + if self.description is not None + else "" + ) + return f"{self.__class__.__name__}({self._type!r}{name}{description})" + + +class ArrayQueryParameterType(_AbstractQueryParameterType): + """Type representation for array query parameters. + + Args: + array_type (Union[ScalarQueryParameterType, StructQueryParameterType]): + The type of array elements. + name (Optional[str]): + The name of the query parameter. Primarily used if the type is + one of the subfields in ``StructQueryParameterType`` instance. + description (Optional[str]): + The query parameter description. Primarily used if the type is + one of the subfields in ``StructQueryParameterType`` instance. + """ + + def __init__(self, array_type, *, name=None, description=None): + self._array_type = array_type + self.name = name + self.description = description + + @classmethod + def from_api_repr(cls, resource): + """Factory: construct parameter type from JSON resource. + + Args: + resource (Dict): JSON mapping of parameter + + Returns: + google.cloud.bigquery.query.ArrayQueryParameterType: Instance + """ + array_item_type = resource["arrayType"]["type"] + + if array_item_type in {"STRUCT", "RECORD"}: + klass = StructQueryParameterType + else: + klass = ScalarQueryParameterType + + item_type_instance = klass.from_api_repr(resource["arrayType"]) + return cls(item_type_instance) + + def to_api_repr(self): + """Construct JSON API representation for the parameter type. + + Returns: + Dict: JSON mapping + """ + # Name and description are only used if the type is a field inside a struct + # type, but it's StructQueryParameterType's responsibilty to use these two + # attributes in the API representation when needed. Here we omit them. 
+ return { + "type": "ARRAY", + "arrayType": self._array_type.to_api_repr(), + } + + def __repr__(self): + name = f", name={self.name!r}" if self.name is not None else "" + description = ( + f", description={self.description!r}" + if self.description is not None + else "" + ) + return f"{self.__class__.__name__}({self._array_type!r}{name}{description})" + + +class StructQueryParameterType(_AbstractQueryParameterType): + """Type representation for struct query parameters. + + Args: + fields (Iterable[Union[ \ + ArrayQueryParameterType, ScalarQueryParameterType, StructQueryParameterType \ + ]]): + An non-empty iterable describing the struct's field types. + name (Optional[str]): + The name of the query parameter. Primarily used if the type is + one of the subfields in ``StructQueryParameterType`` instance. + description (Optional[str]): + The query parameter description. Primarily used if the type is + one of the subfields in ``StructQueryParameterType`` instance. + """ + + def __init__(self, *fields, name=None, description=None): + if not fields: + raise ValueError("Struct type must have at least one field defined.") + + self._fields = fields # fields is a tuple (immutable), no shallow copy needed + self.name = name + self.description = description + + @property + def fields(self): + return self._fields # no copy needed, self._fields is an immutable sequence + + @classmethod + def from_api_repr(cls, resource): + """Factory: construct parameter type from JSON resource. 
+ + Args: + resource (Dict): JSON mapping of parameter + + Returns: + google.cloud.bigquery.query.StructQueryParameterType: Instance + """ + fields = [] + + for struct_field in resource["structTypes"]: + type_repr = struct_field["type"] + if type_repr["type"] in {"STRUCT", "RECORD"}: + klass = StructQueryParameterType + elif type_repr["type"] == "ARRAY": + klass = ArrayQueryParameterType + else: + klass = ScalarQueryParameterType + + type_instance = klass.from_api_repr(type_repr) + type_instance.name = struct_field.get("name") + type_instance.description = struct_field.get("description") + fields.append(type_instance) + + return cls(*fields) + + def to_api_repr(self): + """Construct JSON API representation for the parameter type. + + Returns: + Dict: JSON mapping + """ + fields = [] + + for field in self._fields: + item = {"type": field.to_api_repr()} + if field.name is not None: + item["name"] = field.name + if field.description is not None: + item["description"] = field.description + + fields.append(item) + + return { + "type": "STRUCT", + "structTypes": fields, + } + + def __repr__(self): + name = f", name={self.name!r}" if self.name is not None else "" + description = ( + f", description={self.description!r}" + if self.description is not None + else "" + ) + items = ", ".join(repr(field) for field in self._fields) + return f"{self.__class__.__name__}({items}{name}{description})" + + class _AbstractQueryParameter(object): """Base class for named / positional query parameters. """ @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "ScalarQueryParameter": """Factory: construct parameter from JSON resource. Args: @@ -64,7 +319,7 @@ def from_api_repr(cls, resource): """ raise NotImplementedError - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct JSON API representation for the parameter. 
Returns: @@ -77,35 +332,46 @@ class ScalarQueryParameter(_AbstractQueryParameter): """Named / positional query parameters for scalar values. Args: - name (Optional[str]): + name: Parameter name, used via ``@foo`` syntax. If None, the parameter can only be addressed via position (``?``). - type_ (str): - Name of parameter type. One of 'STRING', 'INT64', - 'FLOAT64', 'NUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or - 'DATE'. + type_: + Name of parameter type. See + :class:`google.cloud.bigquery.enums.SqlTypeNames` and + :class:`google.cloud.bigquery.enums.SqlParameterScalarTypes` for + supported types. - value (Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date]): + value: The scalar parameter value. """ - def __init__(self, name, type_, value): + def __init__( + self, + name: Optional[str], + type_: Optional[Union[str, ScalarQueryParameterType]], + value: _SCALAR_VALUE_TYPE, + ): self.name = name - self.type_ = type_ + if isinstance(type_, ScalarQueryParameterType): + self.type_ = type_._type + else: + self.type_ = type_ self.value = value @classmethod - def positional(cls, type_, value): + def positional( + cls, type_: Union[str, ScalarQueryParameterType], value: _SCALAR_VALUE_TYPE + ) -> "ScalarQueryParameter": """Factory for positional paramater. Args: - type_ (str): + type_: Name of parameter type. One of 'STRING', 'INT64', - 'FLOAT64', 'NUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or + 'FLOAT64', 'NUMERIC', 'BIGNUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or 'DATE'. - value (Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date]): + value: The scalar parameter value. Returns: @@ -114,7 +380,7 @@ def positional(cls, type_, value): return cls(None, type_, value) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "ScalarQueryParameter": """Factory: construct parameter from JSON resource. 
Args: @@ -136,7 +402,7 @@ def from_api_repr(cls, resource): return cls(name, type_, converted) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct JSON API representation for the parameter. Returns: @@ -184,28 +450,43 @@ class ArrayQueryParameter(_AbstractQueryParameter): Parameter name, used via ``@foo`` syntax. If None, the parameter can only be addressed via position (``?``). - array_type (str): - Name of type of array elements. One of `'STRING'`, `'INT64'`, - `'FLOAT64'`, `'NUMERIC'`, `'BOOL'`, `'TIMESTAMP'`, or `'DATE'`. + array_type (Union[str, ScalarQueryParameterType, StructQueryParameterType]): + The type of array elements. If given as a string, it must be one of + `'STRING'`, `'INT64'`, `'FLOAT64'`, `'NUMERIC'`, `'BIGNUMERIC'`, `'BOOL'`, + `'TIMESTAMP'`, `'DATE'`, or `'STRUCT'`/`'RECORD'`. + If the type is ``'STRUCT'``/``'RECORD'`` and ``values`` is empty, + the exact item type cannot be deduced, thus a ``StructQueryParameterType`` + instance needs to be passed in. - values (List[appropriate scalar type]): The parameter array values. + values (List[appropriate type]): The parameter array values. """ def __init__(self, name, array_type, values): self.name = name - self.array_type = array_type self.values = values + if isinstance(array_type, str): + if not values and array_type in {"RECORD", "STRUCT"}: + raise ValueError( + "Missing detailed struct item type info for an empty array, " + "please provide a StructQueryParameterType instance." + ) + self.array_type = array_type + @classmethod - def positional(cls, array_type, values): + def positional(cls, array_type: str, values: list) -> "ArrayQueryParameter": """Factory for positional parameters. Args: - array_type (str): - Name of type of array elements. One of `'STRING'`, `'INT64'`, - `'FLOAT64'`, `'NUMERIC'`, `'BOOL'`, `'TIMESTAMP'`, or `'DATE'`. + array_type (Union[str, ScalarQueryParameterType, StructQueryParameterType]): + The type of array elements. 
If given as a string, it must be one of + `'STRING'`, `'INT64'`, `'FLOAT64'`, `'NUMERIC'`, `'BIGNUMERIC'`, + `'BOOL'`, `'TIMESTAMP'`, `'DATE'`, or `'STRUCT'`/`'RECORD'`. + If the type is ``'STRUCT'``/``'RECORD'`` and ``values`` is empty, + the exact item type cannot be deduced, thus a ``StructQueryParameterType`` + instance needs to be passed in. - values (List[appropriate scalar type]): The parameter array values. + values (List[appropriate type]): The parameter array values. Returns: google.cloud.bigquery.query.ArrayQueryParameter: Instance without name @@ -242,7 +523,7 @@ def _from_api_repr_scalar(cls, resource): return cls(name, array_type, converted) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "ArrayQueryParameter": """Factory: construct parameter from JSON resource. Args: @@ -256,29 +537,47 @@ def from_api_repr(cls, resource): return cls._from_api_repr_struct(resource) return cls._from_api_repr_scalar(resource) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct JSON API representation for the parameter. Returns: Dict: JSON mapping """ values = self.values - if self.array_type == "RECORD" or self.array_type == "STRUCT": + + if self.array_type in {"RECORD", "STRUCT"} or isinstance( + self.array_type, StructQueryParameterType + ): reprs = [value.to_api_repr() for value in values] - a_type = reprs[0]["parameterType"] a_values = [repr_["parameterValue"] for repr_ in reprs] + + if reprs: + a_type = reprs[0]["parameterType"] + else: + # This assertion always evaluates to True because the + # constructor disallows STRUCT/RECORD type defined as a + # string with empty values. + assert isinstance(self.array_type, StructQueryParameterType) + a_type = self.array_type.to_api_repr() else: - a_type = {"type": self.array_type} - converter = _SCALAR_VALUE_TO_JSON_PARAM.get(self.array_type) + # Scalar array item type. 
+ if isinstance(self.array_type, str): + a_type = {"type": self.array_type} + else: + a_type = self.array_type.to_api_repr() + + converter = _SCALAR_VALUE_TO_JSON_PARAM.get(a_type["type"]) if converter is not None: values = [converter(value) for value in values] a_values = [{"value": value} for value in values] + resource = { "parameterType": {"type": "ARRAY", "arrayType": a_type}, "parameterValue": {"arrayValues": a_values}, } if self.name is not None: resource["name"] = self.name + return resource def _key(self): @@ -289,7 +588,14 @@ def _key(self): Returns: Tuple: The contents of this :class:`~google.cloud.bigquery.query.ArrayQueryParameter`. """ - return (self.name, self.array_type.upper(), self.values) + if isinstance(self.array_type, str): + item_type = self.array_type + elif isinstance(self.array_type, ScalarQueryParameterType): + item_type = self.array_type._type + else: + item_type = "STRUCT" + + return (self.name, item_type.upper(), self.values) def __eq__(self, other): if not isinstance(other, ArrayQueryParameter): @@ -350,7 +656,7 @@ def positional(cls, *sub_params): return cls(None, *sub_params) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "StructQueryParameter": """Factory: construct parameter from JSON resource. Args: @@ -390,7 +696,7 @@ def from_api_repr(cls, resource): instance.struct_values[key] = converted return instance - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct JSON API representation for the parameter. Returns: @@ -542,7 +848,7 @@ def total_rows(self): https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.total_rows Returns: - Optional[int}: Count generated on the server (None until set by the server). + Optional[int]: Count generated on the server (None until set by the server). 
""" total_rows = self._properties.get("totalRows") if total_rows is not None: @@ -585,7 +891,7 @@ def rows(self): Returns: Optional[List[google.cloud.bigquery.table.Row]]: - Fields describing the schema (None until set by the server). + Rows containing the results of the query. """ return _rows_from_json(self._properties.get("rows", ()), self.schema) diff --git a/google/cloud/bigquery/retry.py b/google/cloud/bigquery/retry.py index 4bc4b757f..830582322 100644 --- a/google/cloud/bigquery/retry.py +++ b/google/cloud/bigquery/retry.py @@ -14,6 +14,8 @@ from google.api_core import exceptions from google.api_core import retry +from google.auth import exceptions as auth_exceptions +import requests.exceptions _RETRYABLE_REASONS = frozenset( @@ -21,11 +23,18 @@ ) _UNSTRUCTURED_RETRYABLE_TYPES = ( + ConnectionError, exceptions.TooManyRequests, exceptions.InternalServerError, exceptions.BadGateway, + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + auth_exceptions.TransportError, ) +_DEFAULT_JOB_DEADLINE = 60.0 * 10.0 # seconds + def _should_retry(exc): """Predicate for determining when to retry. @@ -33,10 +42,7 @@ def _should_retry(exc): We retry if and only if the 'reason' is 'backendError' or 'rateLimitExceeded'. """ - if not hasattr(exc, "errors"): - return False - - if len(exc.errors) == 0: + if not hasattr(exc, "errors") or len(exc.errors) == 0: # Check for unstructured error returns, e.g. from GFE return isinstance(exc, _UNSTRUCTURED_RETRYABLE_TYPES) @@ -44,7 +50,7 @@ def _should_retry(exc): return reason in _RETRYABLE_REASONS -DEFAULT_RETRY = retry.Retry(predicate=_should_retry) +DEFAULT_RETRY = retry.Retry(predicate=_should_retry, deadline=600.0) """The default retry object. Any method with a ``retry`` parameter will be retried automatically, @@ -53,3 +59,28 @@ def _should_retry(exc): on ``DEFAULT_RETRY``. 
For example, to change the deadline to 30 seconds, pass ``retry=bigquery.DEFAULT_RETRY.with_deadline(30)``. """ + +DEFAULT_TIMEOUT = 5.0 * 60.0 +"""The default API timeout. + +This is the time to wait per request. To adjust the total wait time, set a +deadline on the retry object. +""" + +job_retry_reasons = "rateLimitExceeded", "backendError" + + +def _job_should_retry(exc): + if not hasattr(exc, "errors") or len(exc.errors) == 0: + return False + + reason = exc.errors[0]["reason"] + return reason in job_retry_reasons + + +DEFAULT_JOB_RETRY = retry.Retry( + predicate=_job_should_retry, deadline=_DEFAULT_JOB_DEADLINE +) +""" +The default job retry object. +""" diff --git a/google/cloud/bigquery/routine/__init__.py b/google/cloud/bigquery/routine/__init__.py new file mode 100644 index 000000000..7353073c8 --- /dev/null +++ b/google/cloud/bigquery/routine/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""User-Defined Routines.""" + + +from google.cloud.bigquery.enums import DeterminismLevel +from google.cloud.bigquery.routine.routine import Routine +from google.cloud.bigquery.routine.routine import RoutineArgument +from google.cloud.bigquery.routine.routine import RoutineReference +from google.cloud.bigquery.routine.routine import RoutineType + + +__all__ = ( + "DeterminismLevel", + "Routine", + "RoutineArgument", + "RoutineReference", + "RoutineType", +) diff --git a/google/cloud/bigquery/routine.py b/google/cloud/bigquery/routine/routine.py similarity index 84% rename from google/cloud/bigquery/routine.py rename to google/cloud/bigquery/routine/routine.py index 03423c01b..a776212c3 100644 --- a/google/cloud/bigquery/routine.py +++ b/google/cloud/bigquery/routine/routine.py @@ -17,11 +17,25 @@ """Define resources for the BigQuery Routines API.""" from google.protobuf import json_format -import six import google.cloud._helpers from google.cloud.bigquery import _helpers import google.cloud.bigquery_v2.types +from google.cloud.bigquery_v2.types import StandardSqlTableType + + +class RoutineType: + """The fine-grained type of the routine. + + https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#routinetype + + .. 
versionadded:: 2.22.0 + """ + + ROUTINE_TYPE_UNSPECIFIED = "ROUTINE_TYPE_UNSPECIFIED" + SCALAR_FUNCTION = "SCALAR_FUNCTION" + PROCEDURE = "PROCEDURE" + TABLE_VALUED_FUNCTION = "TABLE_VALUED_FUNCTION" class Routine(object): @@ -49,12 +63,14 @@ class Routine(object): "modified": "lastModifiedTime", "reference": "routineReference", "return_type": "returnType", + "return_table_type": "returnTableType", "type_": "routineType", "description": "description", + "determinism_level": "determinismLevel", } def __init__(self, routine_ref, **kwargs): - if isinstance(routine_ref, six.string_types): + if isinstance(routine_ref, str): routine_ref = RoutineReference.from_string(routine_ref) self._properties = {"routineReference": routine_ref.to_api_repr()} @@ -189,18 +205,50 @@ def return_type(self): resource = self._properties.get(self._PROPERTY_TO_API_FIELD["return_type"]) if not resource: return resource + output = google.cloud.bigquery_v2.types.StandardSqlDataType() - output = json_format.ParseDict(resource, output, ignore_unknown_fields=True) - return output + raw_protobuf = json_format.ParseDict( + resource, output._pb, ignore_unknown_fields=True + ) + return type(output).wrap(raw_protobuf) @return_type.setter def return_type(self, value): if value: - resource = json_format.MessageToDict(value) + resource = json_format.MessageToDict(value._pb) else: resource = None self._properties[self._PROPERTY_TO_API_FIELD["return_type"]] = resource + @property + def return_table_type(self) -> StandardSqlTableType: + """The return type of a Table Valued Function (TVF) routine. + + .. 
versionadded:: 2.22.0 + """ + resource = self._properties.get( + self._PROPERTY_TO_API_FIELD["return_table_type"] + ) + if not resource: + return resource + + output = google.cloud.bigquery_v2.types.StandardSqlTableType() + raw_protobuf = json_format.ParseDict( + resource, output._pb, ignore_unknown_fields=True + ) + return type(output).wrap(raw_protobuf) + + @return_table_type.setter + def return_table_type(self, value): + if not value: + resource = None + else: + resource = { + "columns": [json_format.MessageToDict(col._pb) for col in value.columns] + } + + self._properties[self._PROPERTY_TO_API_FIELD["return_table_type"]] = resource + @property def imported_libraries(self): """List[str]: The path of the imported JavaScript libraries. @@ -251,8 +299,19 @@ def description(self): def description(self, value): self._properties[self._PROPERTY_TO_API_FIELD["description"]] = value + @property + def determinism_level(self): + """Optional[str]: (experimental) The determinism level of the JavaScript UDF + if defined. + """ + return self._properties.get(self._PROPERTY_TO_API_FIELD["determinism_level"]) + + @determinism_level.setter + def determinism_level(self, value): + self._properties[self._PROPERTY_TO_API_FIELD["determinism_level"]] = value + @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "Routine": """Factory: construct a routine given its API representation. Args: @@ -267,7 +326,7 @@ def from_api_repr(cls, resource): ref._properties = resource return ref - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this routine. 
Returns: @@ -357,20 +416,23 @@ def data_type(self): resource = self._properties.get(self._PROPERTY_TO_API_FIELD["data_type"]) if not resource: return resource + output = google.cloud.bigquery_v2.types.StandardSqlDataType() - output = json_format.ParseDict(resource, output, ignore_unknown_fields=True) - return output + raw_protobuf = json_format.ParseDict( + resource, output._pb, ignore_unknown_fields=True + ) + return type(output).wrap(raw_protobuf) @data_type.setter def data_type(self, value): if value: - resource = json_format.MessageToDict(value) + resource = json_format.MessageToDict(value._pb) else: resource = None self._properties[self._PROPERTY_TO_API_FIELD["data_type"]] = resource @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "RoutineArgument": """Factory: construct a routine argument given its API representation. Args: @@ -384,7 +446,7 @@ def from_api_repr(cls, resource): ref._properties = resource return ref - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this routine argument. Returns: @@ -421,17 +483,17 @@ def __init__(self): @property def project(self): """str: ID of the project containing the routine.""" - return self._properties["projectId"] + return self._properties["projectId"] # pytype: disable=key-error @property def dataset_id(self): """str: ID of dataset containing the routine.""" - return self._properties["datasetId"] + return self._properties["datasetId"] # pytype: disable=key-error @property def routine_id(self): """str: The routine ID.""" - return self._properties["routineId"] + return self._properties["routineId"] # pytype: disable=key-error @property def path(self): @@ -443,7 +505,7 @@ def path(self): ) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "RoutineReference": """Factory: construct a routine reference given its API representation. 
Args: @@ -459,7 +521,9 @@ def from_api_repr(cls, resource): return ref @classmethod - def from_string(cls, routine_id, default_project=None): + def from_string( + cls, routine_id: str, default_project: str = None + ) -> "RoutineReference": """Factory: construct a routine reference from routine ID string. Args: @@ -487,7 +551,7 @@ def from_string(cls, routine_id, default_project=None): {"projectId": proj, "datasetId": dset, "routineId": routine} ) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this routine reference. Returns: diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index c1b2588be..157db7ce6 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -14,33 +14,36 @@ """Schemas for BigQuery tables / queries.""" -from six.moves import collections_abc +import collections +from typing import Optional from google.cloud.bigquery_v2 import types +_DEFAULT_VALUE = object() _STRUCT_TYPES = ("RECORD", "STRUCT") # SQL types reference: # https://cloud.google.com/bigquery/data-types#legacy_sql_data_types # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types LEGACY_TO_STANDARD_TYPES = { - "STRING": types.StandardSqlDataType.STRING, - "BYTES": types.StandardSqlDataType.BYTES, - "INTEGER": types.StandardSqlDataType.INT64, - "INT64": types.StandardSqlDataType.INT64, - "FLOAT": types.StandardSqlDataType.FLOAT64, - "FLOAT64": types.StandardSqlDataType.FLOAT64, - "NUMERIC": types.StandardSqlDataType.NUMERIC, - "BOOLEAN": types.StandardSqlDataType.BOOL, - "BOOL": types.StandardSqlDataType.BOOL, - "GEOGRAPHY": types.StandardSqlDataType.GEOGRAPHY, - "RECORD": types.StandardSqlDataType.STRUCT, - "STRUCT": types.StandardSqlDataType.STRUCT, - "TIMESTAMP": types.StandardSqlDataType.TIMESTAMP, - "DATE": types.StandardSqlDataType.DATE, - "TIME": types.StandardSqlDataType.TIME, - "DATETIME": types.StandardSqlDataType.DATETIME, + "STRING": 
types.StandardSqlDataType.TypeKind.STRING, + "BYTES": types.StandardSqlDataType.TypeKind.BYTES, + "INTEGER": types.StandardSqlDataType.TypeKind.INT64, + "INT64": types.StandardSqlDataType.TypeKind.INT64, + "FLOAT": types.StandardSqlDataType.TypeKind.FLOAT64, + "FLOAT64": types.StandardSqlDataType.TypeKind.FLOAT64, + "NUMERIC": types.StandardSqlDataType.TypeKind.NUMERIC, + "BIGNUMERIC": types.StandardSqlDataType.TypeKind.BIGNUMERIC, + "BOOLEAN": types.StandardSqlDataType.TypeKind.BOOL, + "BOOL": types.StandardSqlDataType.TypeKind.BOOL, + "GEOGRAPHY": types.StandardSqlDataType.TypeKind.GEOGRAPHY, + "RECORD": types.StandardSqlDataType.TypeKind.STRUCT, + "STRUCT": types.StandardSqlDataType.TypeKind.STRUCT, + "TIMESTAMP": types.StandardSqlDataType.TypeKind.TIMESTAMP, + "DATE": types.StandardSqlDataType.TypeKind.DATE, + "TIME": types.StandardSqlDataType.TypeKind.TIME, + "DATETIME": types.StandardSqlDataType.TypeKind.DATETIME, # no direct conversion from ARRAY, the latter is represented by mode="REPEATED" } """String names of the legacy SQL types to integer codes of Standard SQL types.""" @@ -65,6 +68,15 @@ class SchemaField(object): policy_tags (Optional[PolicyTagList]): The policy tag list for the field. + precision (Optional[int]): + Precison (number of digits) of fields with NUMERIC or BIGNUMERIC type. + + scale (Optional[int]): + Scale (digits after decimal) of fields with NUMERIC or BIGNUMERIC type. + + max_length (Optional[int]): + Maximim length of fields with STRING or BYTES type. 
+ """ def __init__( @@ -72,19 +84,58 @@ def __init__( name, field_type, mode="NULLABLE", - description=None, + description=_DEFAULT_VALUE, fields=(), policy_tags=None, + precision=_DEFAULT_VALUE, + scale=_DEFAULT_VALUE, + max_length=_DEFAULT_VALUE, ): - self._name = name - self._field_type = field_type - self._mode = mode - self._description = description + self._properties = { + "name": name, + "type": field_type, + } + if mode is not None: + self._properties["mode"] = mode.upper() + if description is not _DEFAULT_VALUE: + self._properties["description"] = description + if precision is not _DEFAULT_VALUE: + self._properties["precision"] = precision + if scale is not _DEFAULT_VALUE: + self._properties["scale"] = scale + if max_length is not _DEFAULT_VALUE: + self._properties["maxLength"] = max_length self._fields = tuple(fields) - self._policy_tags = policy_tags + + self._policy_tags = self._determine_policy_tags(field_type, policy_tags) + + @staticmethod + def _determine_policy_tags( + field_type: str, given_policy_tags: Optional["PolicyTagList"] + ) -> Optional["PolicyTagList"]: + """Return the given policy tags, or their suitable representation if `None`. + + Args: + field_type: The type of the schema field. + given_policy_tags: The policy tags to maybe ajdust. + """ + if given_policy_tags is not None: + return given_policy_tags + + if field_type is not None and field_type.upper() in _STRUCT_TYPES: + return None + + return PolicyTagList() + + @staticmethod + def __get_int(api_repr, name): + v = api_repr.get(name, _DEFAULT_VALUE) + if v is not _DEFAULT_VALUE: + v = int(v) + return v @classmethod - def from_api_repr(cls, api_repr): + def from_api_repr(cls, api_repr: dict) -> "SchemaField": """Return a ``SchemaField`` object deserialized from a dictionary. Args: @@ -95,24 +146,33 @@ def from_api_repr(cls, api_repr): Returns: google.cloud.biquery.schema.SchemaField: The ``SchemaField`` object. 
""" + field_type = api_repr["type"].upper() + # Handle optional properties with default values mode = api_repr.get("mode", "NULLABLE") - description = api_repr.get("description") + description = api_repr.get("description", _DEFAULT_VALUE) fields = api_repr.get("fields", ()) + policy_tags = cls._determine_policy_tags( + field_type, PolicyTagList.from_api_repr(api_repr.get("policyTags")) + ) + return cls( - field_type=api_repr["type"].upper(), + field_type=field_type, fields=[cls.from_api_repr(f) for f in fields], mode=mode.upper(), description=description, name=api_repr["name"], - policy_tags=PolicyTagList.from_api_repr(api_repr.get("policyTags")), + policy_tags=policy_tags, + precision=cls.__get_int(api_repr, "precision"), + scale=cls.__get_int(api_repr, "scale"), + max_length=cls.__get_int(api_repr, "maxLength"), ) @property def name(self): """str: The name of the field.""" - return self._name + return self._properties["name"] @property def field_type(self): @@ -121,7 +181,7 @@ def field_type(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type """ - return self._field_type + return self._properties["type"] @property def mode(self): @@ -130,17 +190,32 @@ def mode(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.mode """ - return self._mode + return self._properties.get("mode") @property def is_nullable(self): """bool: whether 'mode' is 'nullable'.""" - return self._mode == "NULLABLE" + return self.mode == "NULLABLE" @property def description(self): """Optional[str]: description for the field.""" - return self._description + return self._properties.get("description") + + @property + def precision(self): + """Optional[int]: Precision (number of digits) for the NUMERIC field.""" + return self._properties.get("precision") + + @property + def scale(self): + """Optional[int]: Scale (digits after decimal) for the NUMERIC field.""" + return self._properties.get("scale") 
+ + @property + def max_length(self): + """Optional[int]: Maximum length for the STRING or BYTES field.""" + return self._properties.get("maxLength") @property def fields(self): @@ -157,27 +232,21 @@ def policy_tags(self): """ return self._policy_tags - def to_api_repr(self): + def to_api_repr(self) -> dict: """Return a dictionary representing this schema field. Returns: Dict: A dictionary representing the SchemaField in a serialized form. """ - # Put together the basic representation. See http://bit.ly/2hOAT5u. - answer = { - "mode": self.mode.upper(), - "name": self.name, - "type": self.field_type.upper(), - "description": self.description, - } + answer = self._properties.copy() # If this is a RECORD type, then sub-fields are also included, # add this to the serialized representation. if self.field_type.upper() in _STRUCT_TYPES: answer["fields"] = [f.to_api_repr() for f in self.fields] - - # If this contains a policy tag definition, include that as well: - if self.policy_tags is not None: + else: + # Explicitly include policy tag definition (we must not do it for RECORD + # fields, because those are not leaf fields). answer["policyTags"] = self.policy_tags.to_api_repr() # Done; return the serialized dictionary. @@ -191,16 +260,32 @@ def _key(self): Returns: Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`. 
""" + field_type = self.field_type.upper() + if field_type == "STRING" or field_type == "BYTES": + if self.max_length is not None: + field_type = f"{field_type}({self.max_length})" + elif field_type.endswith("NUMERIC"): + if self.precision is not None: + if self.scale is not None: + field_type = f"{field_type}({self.precision}, {self.scale})" + else: + field_type = f"{field_type}({self.precision})" + + policy_tags = ( + () if self._policy_tags is None else tuple(sorted(self._policy_tags.names)) + ) + return ( - self._name, - self._field_type.upper(), - self._mode.upper(), - self._description, + self.name, + field_type, + # Mode is always str, if not given it defaults to a str value + self.mode.upper(), # pytype: disable=attribute-error + self.description, self._fields, - self._policy_tags, + policy_tags, ) - def to_standard_sql(self): + def to_standard_sql(self) -> types.StandardSqlField: """Return the field as the standard SQL field representation object. Returns: @@ -209,26 +294,34 @@ def to_standard_sql(self): sql_type = types.StandardSqlDataType() if self.mode == "REPEATED": - sql_type.type_kind = types.StandardSqlDataType.ARRAY + sql_type.type_kind = types.StandardSqlDataType.TypeKind.ARRAY else: sql_type.type_kind = LEGACY_TO_STANDARD_TYPES.get( - self.field_type, types.StandardSqlDataType.TYPE_KIND_UNSPECIFIED + self.field_type, + types.StandardSqlDataType.TypeKind.TYPE_KIND_UNSPECIFIED, ) - if sql_type.type_kind == types.StandardSqlDataType.ARRAY: # noqa: E721 + if sql_type.type_kind == types.StandardSqlDataType.TypeKind.ARRAY: # noqa: E721 array_element_type = LEGACY_TO_STANDARD_TYPES.get( - self.field_type, types.StandardSqlDataType.TYPE_KIND_UNSPECIFIED + self.field_type, + types.StandardSqlDataType.TypeKind.TYPE_KIND_UNSPECIFIED, ) sql_type.array_element_type.type_kind = array_element_type # ARRAY cannot directly contain other arrays, only scalar types and STRUCTs # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#array-type - if 
array_element_type == types.StandardSqlDataType.STRUCT: # noqa: E721 + if ( + array_element_type + == types.StandardSqlDataType.TypeKind.STRUCT # noqa: E721 + ): sql_type.array_element_type.struct_type.fields.extend( field.to_standard_sql() for field in self.fields ) - elif sql_type.type_kind == types.StandardSqlDataType.STRUCT: # noqa: E721 + elif ( + sql_type.type_kind + == types.StandardSqlDataType.TypeKind.STRUCT # noqa: E721 + ): sql_type.struct_type.fields.extend( field.to_standard_sql() for field in self.fields ) @@ -260,21 +353,7 @@ def _parse_schema_resource(info): Optional[Sequence[google.cloud.bigquery.schema.SchemaField`]: A list of parsed fields, or ``None`` if no "fields" key found. """ - if "fields" not in info: - return () - - schema = [] - for r_field in info["fields"]: - name = r_field["name"] - field_type = r_field["type"] - mode = r_field.get("mode", "NULLABLE") - description = r_field.get("description") - sub_fields = _parse_schema_resource(r_field) - policy_tags = PolicyTagList.from_api_repr(r_field.get("policyTags")) - schema.append( - SchemaField(name, field_type, mode, description, sub_fields, policy_tags) - ) - return schema + return [SchemaField.from_api_repr(f) for f in info.get("fields", ())] def _build_schema_resource(fields): @@ -310,7 +389,7 @@ def _to_schema_fields(schema): instance or a compatible mapping representation of the field. """ for field in schema: - if not isinstance(field, (SchemaField, collections_abc.Mapping)): + if not isinstance(field, (SchemaField, collections.abc.Mapping)): raise ValueError( "Schema items must either be fields or compatible " "mapping representations." @@ -367,7 +446,7 @@ def __repr__(self): return "PolicyTagList{}".format(self._key()) @classmethod - def from_api_repr(cls, api_repr): + def from_api_repr(cls, api_repr: dict) -> "PolicyTagList": """Return a :class:`PolicyTagList` object deserialized from a dict. 
This method creates a new ``PolicyTagList`` instance that points to @@ -390,7 +469,7 @@ def from_api_repr(cls, api_repr): names = api_repr.get("names", ()) return cls(names=names) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Return a dictionary representing this object. This method returns the properties dict of the ``PolicyTagList`` diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index d9e5f7773..609c0b57e 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -19,33 +19,34 @@ import copy import datetime import functools -import logging import operator -import pytz +import typing +from typing import Any, Dict, Iterable, Iterator, Optional, Tuple import warnings -import six - -try: - # Needed for the to_bqstorage() method. - from google.cloud import bigquery_storage_v1beta1 -except ImportError: # pragma: NO COVER - bigquery_storage_v1beta1 = None - try: import pandas except ImportError: # pragma: NO COVER pandas = None try: - import pyarrow -except ImportError: # pragma: NO COVER - pyarrow = None + import geopandas +except ImportError: + geopandas = None +else: + _COORDINATE_REFERENCE_SYSTEM = "EPSG:4326" + +try: + import shapely.geos +except ImportError: + shapely = None +else: + _read_wkt = shapely.geos.WKTReader(shapely.geos.lgeos).read try: - import tqdm + import pyarrow except ImportError: # pragma: NO COVER - tqdm = None + pyarrow = None import google.api_core.exceptions from google.api_core.page_iterator import HTTPIterator @@ -53,32 +54,40 @@ import google.cloud._helpers from google.cloud.bigquery import _helpers from google.cloud.bigquery import _pandas_helpers +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.schema import _build_schema_resource from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields -from google.cloud.bigquery.exceptions import PyarrowMissingWarning 
+from google.cloud.bigquery._tqdm_helpers import get_progress_bar from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration +if typing.TYPE_CHECKING: # pragma: NO COVER + # Unconditionally import optional dependencies again to tell pytype that + # they are not None, avoiding false "no attribute" errors. + import pandas + import geopandas + import pyarrow + from google.cloud import bigquery_storage -_LOGGER = logging.getLogger(__name__) -_NO_BQSTORAGE_ERROR = ( - "The google-cloud-bigquery-storage library is not installed, " - "please install google-cloud-bigquery-storage to use bqstorage features." -) _NO_PANDAS_ERROR = ( "The pandas library is not installed, please install " "pandas to use the to_dataframe() function." ) +_NO_GEOPANDAS_ERROR = ( + "The geopandas library is not installed, please install " + "geopandas to use the to_geodataframe() function." +) +_NO_SHAPELY_ERROR = ( + "The shapely library is not installed, please install " + "shapely to use the geography_as_object option." +) _NO_PYARROW_ERROR = ( "The pyarrow library is not installed, please install " "pyarrow to use the to_arrow() function." ) -_NO_TQDM_ERROR = ( - "A progress bar was requested, but there was an error loading the tqdm " - "library. Please install tqdm to use the progress bar functionality." -) + _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"' @@ -159,7 +168,9 @@ def path(self): ) @classmethod - def from_string(cls, table_id, default_project=None): + def from_string( + cls, table_id: str, default_project: str = None + ) -> "TableReference": """Construct a table reference from table ID string. 
Args: @@ -198,7 +209,7 @@ def from_string(cls, table_id, default_project=None): ) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "TableReference": """Factory: construct a table reference given its API representation Args: @@ -216,7 +227,7 @@ def from_api_repr(cls, resource): table_id = resource["tableId"] return cls(DatasetReference(project, dataset_id), table_id) - def to_api_repr(self): + def to_api_repr(self) -> dict: """Construct the API resource representation of this table reference. Returns: @@ -228,7 +239,7 @@ def to_api_repr(self): "tableId": self._table_id, } - def to_bqstorage(self, v1beta1=False): + def to_bqstorage(self) -> str: """Construct a BigQuery Storage API representation of this table. Install the ``google-cloud-bigquery-storage`` package to use this @@ -237,41 +248,21 @@ def to_bqstorage(self, v1beta1=False): If the ``table_id`` contains a partition identifier (e.g. ``my_table$201812``) or a snapshot identifier (e.g. ``mytable@1234567890``), it is ignored. Use - :class:`google.cloud.bigquery_storage_v1.types.ReadSession.TableReadOptions` + :class:`google.cloud.bigquery_storage.types.ReadSession.TableReadOptions` to filter rows by partition. Use - :class:`google.cloud.bigquery_storage_v1.types.ReadSession.TableModifiers` + :class:`google.cloud.bigquery_storage.types.ReadSession.TableModifiers` to select a specific snapshot to read from. - Args: - v1beta1 (Optiona[bool]): - If :data:`True`, return representation compatible with BigQuery - Storage ``v1beta1`` version. Defaults to :data:`False`. - Returns: - Union[str, google.cloud.bigquery_storage_v1beta1.types.TableReference:]: - A reference to this table in the BigQuery Storage API. - - Raises: - ValueError: - If ``v1beta1`` compatibility is requested, but the - :mod:`google.cloud.bigquery_storage_v1beta1` module cannot be imported. + str: A reference to this table in the BigQuery Storage API. 
""" - if v1beta1 and bigquery_storage_v1beta1 is None: - raise ValueError(_NO_BQSTORAGE_ERROR) table_id, _, _ = self._table_id.partition("@") table_id, _, _ = table_id.partition("$") - if v1beta1: - table_ref = bigquery_storage_v1beta1.types.TableReference( - project_id=self._project, - dataset_id=self._dataset_id, - table_id=table_id, - ) - else: - table_ref = "projects/{}/datasets/{}/tables/{}".format( - self._project, self._dataset_id, table_id, - ) + table_ref = "projects/{}/datasets/{}/tables/{}".format( + self._project, self._dataset_id, table_id, + ) return table_ref @@ -286,9 +277,16 @@ def _key(self): return (self._project, self._dataset_id, self._table_id) def __eq__(self, other): - if not isinstance(other, TableReference): + if isinstance(other, (Table, TableListItem)): + return ( + self.project == other.project + and self.dataset_id == other.dataset_id + and self.table_id == other.table_id + ) + elif isinstance(other, TableReference): + return self._key() == other._key() + else: return NotImplemented - return self._key() == other._key() def __ne__(self, other): return not self == other @@ -296,6 +294,9 @@ def __ne__(self, other): def __hash__(self): return hash(self._key()) + def __str__(self): + return f"{self.project}.{self.dataset_id}.{self.table_id}" + def __repr__(self): from google.cloud.bigquery.dataset import DatasetReference @@ -324,15 +325,39 @@ class Table(object): """ _PROPERTY_TO_API_FIELD = { - "friendly_name": "friendlyName", + "clustering_fields": "clustering", + "created": "creationTime", + "dataset_id": ["tableReference", "datasetId"], + "description": "description", + "encryption_configuration": "encryptionConfiguration", + "etag": "etag", "expires": "expirationTime", - "time_partitioning": "timePartitioning", - "partitioning_type": "timePartitioning", + "external_data_configuration": "externalDataConfiguration", + "friendly_name": "friendlyName", + "full_table_id": "id", + "labels": "labels", + "location": "location", + "modified": 
"lastModifiedTime", + "mview_enable_refresh": "materializedView", + "mview_last_refresh_time": ["materializedView", "lastRefreshTime"], + "mview_query": "materializedView", + "mview_refresh_interval": "materializedView", + "num_bytes": "numBytes", + "num_rows": "numRows", "partition_expiration": "timePartitioning", + "partitioning_type": "timePartitioning", + "project": ["tableReference", "projectId"], + "range_partitioning": "rangePartitioning", + "time_partitioning": "timePartitioning", + "schema": "schema", + "snapshot_definition": "snapshotDefinition", + "streaming_buffer": "streamingBuffer", + "self_link": "selfLink", + "table_id": ["tableReference", "tableId"], + "time_partitioning": "timePartitioning", + "type": "type", "view_use_legacy_sql": "view", "view_query": "view", - "external_data_configuration": "externalDataConfiguration", - "encryption_configuration": "encryptionConfiguration", "require_partition_filter": "requirePartitionFilter", } @@ -346,17 +371,23 @@ def __init__(self, table_ref, schema=None): @property def project(self): """str: Project bound to the table.""" - return self._properties["tableReference"]["projectId"] + return _helpers._get_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["project"] + ) @property def dataset_id(self): """str: ID of dataset containing the table.""" - return self._properties["tableReference"]["datasetId"] + return _helpers._get_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["dataset_id"] + ) @property def table_id(self): """str: ID of the table.""" - return self._properties["tableReference"]["tableId"] + return _helpers._get_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["table_id"] + ) reference = property(_reference_getter) @@ -375,11 +406,15 @@ def require_partition_filter(self): partition filter that can be used for partition elimination to be specified. 
""" - return self._properties.get("requirePartitionFilter") + return self._properties.get( + self._PROPERTY_TO_API_FIELD["require_partition_filter"] + ) @require_partition_filter.setter def require_partition_filter(self, value): - self._properties["requirePartitionFilter"] = value + self._properties[ + self._PROPERTY_TO_API_FIELD["require_partition_filter"] + ] = value @property def schema(self): @@ -395,7 +430,7 @@ def schema(self): is not a :class:`~google.cloud.bigquery.schema.SchemaField` instance or a compatible mapping representation of the field. """ - prop = self._properties.get("schema") + prop = self._properties.get(self._PROPERTY_TO_API_FIELD["schema"]) if not prop: return [] else: @@ -403,11 +438,13 @@ def schema(self): @schema.setter def schema(self, value): + api_field = self._PROPERTY_TO_API_FIELD["schema"] + if value is None: - self._properties["schema"] = None + self._properties[api_field] = None else: value = _to_schema_fields(value) - self._properties["schema"] = {"fields": _build_schema_resource(value)} + self._properties[api_field] = {"fields": _build_schema_resource(value)} @property def labels(self): @@ -420,13 +457,13 @@ def labels(self): Raises: ValueError: If ``value`` type is invalid. """ - return self._properties.setdefault("labels", {}) + return self._properties.setdefault(self._PROPERTY_TO_API_FIELD["labels"], {}) @labels.setter def labels(self, value): if not isinstance(value, dict): raise ValueError("Pass a dict") - self._properties["labels"] = value + self._properties[self._PROPERTY_TO_API_FIELD["labels"]] = value @property def encryption_configuration(self): @@ -440,7 +477,9 @@ def encryption_configuration(self): `_ in the BigQuery documentation. 
""" - prop = self._properties.get("encryptionConfiguration") + prop = self._properties.get( + self._PROPERTY_TO_API_FIELD["encryption_configuration"] + ) if prop is not None: prop = EncryptionConfiguration.from_api_repr(prop) return prop @@ -450,14 +489,16 @@ def encryption_configuration(self, value): api_repr = value if value is not None: api_repr = value.to_api_repr() - self._properties["encryptionConfiguration"] = api_repr + self._properties[ + self._PROPERTY_TO_API_FIELD["encryption_configuration"] + ] = api_repr @property def created(self): """Union[datetime.datetime, None]: Datetime at which the table was created (:data:`None` until set from the server). """ - creation_time = self._properties.get("creationTime") + creation_time = self._properties.get(self._PROPERTY_TO_API_FIELD["created"]) if creation_time is not None: # creation_time will be in milliseconds. return google.cloud._helpers._datetime_from_microseconds( @@ -469,14 +510,14 @@ def etag(self): """Union[str, None]: ETag for the table resource (:data:`None` until set from the server). """ - return self._properties.get("etag") + return self._properties.get(self._PROPERTY_TO_API_FIELD["etag"]) @property def modified(self): """Union[datetime.datetime, None]: Datetime at which the table was last modified (:data:`None` until set from the server). """ - modified_time = self._properties.get("lastModifiedTime") + modified_time = self._properties.get(self._PROPERTY_TO_API_FIELD["modified"]) if modified_time is not None: # modified_time will be in milliseconds. return google.cloud._helpers._datetime_from_microseconds( @@ -488,39 +529,44 @@ def num_bytes(self): """Union[int, None]: The size of the table in bytes (:data:`None` until set from the server). 
""" - return _helpers._int_or_none(self._properties.get("numBytes")) + return _helpers._int_or_none( + self._properties.get(self._PROPERTY_TO_API_FIELD["num_bytes"]) + ) @property def num_rows(self): """Union[int, None]: The number of rows in the table (:data:`None` until set from the server). """ - return _helpers._int_or_none(self._properties.get("numRows")) + return _helpers._int_or_none( + self._properties.get(self._PROPERTY_TO_API_FIELD["num_rows"]) + ) @property def self_link(self): """Union[str, None]: URL for the table resource (:data:`None` until set from the server). """ - return self._properties.get("selfLink") + return self._properties.get(self._PROPERTY_TO_API_FIELD["self_link"]) @property def full_table_id(self): """Union[str, None]: ID for the table (:data:`None` until set from the server). - In the format ``project_id:dataset_id.table_id``. + In the format ``project-id:dataset_id.table_id``. """ - return self._properties.get("id") + return self._properties.get(self._PROPERTY_TO_API_FIELD["full_table_id"]) @property def table_type(self): """Union[str, None]: The type of the table (:data:`None` until set from the server). - Possible values are ``'TABLE'``, ``'VIEW'``, or ``'EXTERNAL'``. + Possible values are ``'TABLE'``, ``'VIEW'``, ``'MATERIALIZED_VIEW'`` or + ``'EXTERNAL'``. """ - return self._properties.get("type") + return self._properties.get(self._PROPERTY_TO_API_FIELD["type"]) @property def range_partitioning(self): @@ -541,7 +587,9 @@ def range_partitioning(self): :class:`~google.cloud.bigquery.table.RangePartitioning` or :data:`None`. 
""" - resource = self._properties.get("rangePartitioning") + resource = self._properties.get( + self._PROPERTY_TO_API_FIELD["range_partitioning"] + ) if resource is not None: return RangePartitioning(_properties=resource) @@ -554,7 +602,7 @@ def range_partitioning(self, value): raise ValueError( "Expected value to be RangePartitioning or None, got {}.".format(value) ) - self._properties["rangePartitioning"] = resource + self._properties[self._PROPERTY_TO_API_FIELD["range_partitioning"]] = resource @property def time_partitioning(self): @@ -571,7 +619,7 @@ def time_partitioning(self): :class:`~google.cloud.bigquery.table.TimePartitioning` or :data:`None`. """ - prop = self._properties.get("timePartitioning") + prop = self._properties.get(self._PROPERTY_TO_API_FIELD["time_partitioning"]) if prop is not None: return TimePartitioning.from_api_repr(prop) @@ -584,7 +632,7 @@ def time_partitioning(self, value): raise ValueError( "value must be google.cloud.bigquery.table.TimePartitioning " "or None" ) - self._properties["timePartitioning"] = api_repr + self._properties[self._PROPERTY_TO_API_FIELD["time_partitioning"]] = api_repr @property def partitioning_type(self): @@ -609,9 +657,10 @@ def partitioning_type(self, value): PendingDeprecationWarning, stacklevel=2, ) + api_field = self._PROPERTY_TO_API_FIELD["partitioning_type"] if self.time_partitioning is None: - self._properties["timePartitioning"] = {} - self._properties["timePartitioning"]["type"] = value + self._properties[api_field] = {} + self._properties[api_field]["type"] = value @property def partition_expiration(self): @@ -638,9 +687,11 @@ def partition_expiration(self, value): PendingDeprecationWarning, stacklevel=2, ) + api_field = self._PROPERTY_TO_API_FIELD["partition_expiration"] + if self.time_partitioning is None: - self._properties["timePartitioning"] = {"type": TimePartitioningType.DAY} - self._properties["timePartitioning"]["expirationMs"] = str(value) + self._properties[api_field] = {"type": 
TimePartitioningType.DAY} + self._properties[api_field]["expirationMs"] = str(value) @property def clustering_fields(self): @@ -652,10 +703,10 @@ def clustering_fields(self): .. note:: - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. + BigQuery supports clustering for both partitioned and + non-partitioned tables. """ - prop = self._properties.get("clustering") + prop = self._properties.get(self._PROPERTY_TO_API_FIELD["clustering_fields"]) if prop is not None: return list(prop.get("fields", ())) @@ -665,12 +716,15 @@ def clustering_fields(self, value): (Defaults to :data:`None`). """ + api_field = self._PROPERTY_TO_API_FIELD["clustering_fields"] + if value is not None: - prop = self._properties.setdefault("clustering", {}) + prop = self._properties.setdefault(api_field, {}) prop["fields"] = value else: - if "clustering" in self._properties: - del self._properties["clustering"] + # In order to allow unsetting clustering fields completely, we explicitly + # set this property to None (as oposed to merely removing the key). + self._properties[api_field] = None @property def description(self): @@ -680,13 +734,13 @@ def description(self): Raises: ValueError: For invalid value types. """ - return self._properties.get("description") + return self._properties.get(self._PROPERTY_TO_API_FIELD["description"]) @description.setter def description(self, value): - if not isinstance(value, six.string_types) and value is not None: + if not isinstance(value, str) and value is not None: raise ValueError("Pass a string, or None") - self._properties["description"] = value + self._properties[self._PROPERTY_TO_API_FIELD["description"]] = value @property def expires(self): @@ -696,7 +750,7 @@ def expires(self): Raises: ValueError: For invalid value types. 
""" - expiration_time = self._properties.get("expirationTime") + expiration_time = self._properties.get(self._PROPERTY_TO_API_FIELD["expires"]) if expiration_time is not None: # expiration_time will be in milliseconds. return google.cloud._helpers._datetime_from_microseconds( @@ -708,7 +762,9 @@ def expires(self, value): if not isinstance(value, datetime.datetime) and value is not None: raise ValueError("Pass a datetime, or None") value_ms = google.cloud._helpers._millis_from_datetime(value) - self._properties["expirationTime"] = _helpers._str_or_none(value_ms) + self._properties[ + self._PROPERTY_TO_API_FIELD["expires"] + ] = _helpers._str_or_none(value_ms) @property def friendly_name(self): @@ -717,13 +773,13 @@ def friendly_name(self): Raises: ValueError: For invalid value types. """ - return self._properties.get("friendlyName") + return self._properties.get(self._PROPERTY_TO_API_FIELD["friendly_name"]) @friendly_name.setter def friendly_name(self, value): - if not isinstance(value, six.string_types) and value is not None: + if not isinstance(value, str) and value is not None: raise ValueError("Pass a string, or None") - self._properties["friendlyName"] = value + self._properties[self._PROPERTY_TO_API_FIELD["friendly_name"]] = value @property def location(self): @@ -731,7 +787,7 @@ def location(self): Defaults to :data:`None`. """ - return self._properties.get("location") + return self._properties.get(self._PROPERTY_TO_API_FIELD["location"]) @property def view_query(self): @@ -744,18 +800,17 @@ def view_query(self): Raises: ValueError: For invalid value types. 
""" - view = self._properties.get("view") - if view is not None: - return view.get("query") + api_field = self._PROPERTY_TO_API_FIELD["view_query"] + return _helpers._get_sub_prop(self._properties, [api_field, "query"]) @view_query.setter def view_query(self, value): - if not isinstance(value, six.string_types): + if not isinstance(value, str): raise ValueError("Pass a string") - view = self._properties.get("view") - if view is None: - view = self._properties["view"] = {} - view["query"] = value + + api_field = self._PROPERTY_TO_API_FIELD["view_query"] + _helpers._set_sub_prop(self._properties, [api_field, "query"], value) + view = self._properties[api_field] # The service defaults useLegacySql to True, but this # client uses Standard SQL by default. if view.get("useLegacySql") is None: @@ -764,7 +819,7 @@ def view_query(self, value): @view_query.deleter def view_query(self): """Delete SQL query defining the table as a view.""" - self._properties.pop("view", None) + self._properties.pop(self._PROPERTY_TO_API_FIELD["view_query"], None) view_use_legacy_sql = property(_view_use_legacy_sql_getter) @@ -772,16 +827,90 @@ def view_query(self): def view_use_legacy_sql(self, value): if not isinstance(value, bool): raise ValueError("Pass a boolean") - if self._properties.get("view") is None: - self._properties["view"] = {} - self._properties["view"]["useLegacySql"] = value + + api_field = self._PROPERTY_TO_API_FIELD["view_query"] + if self._properties.get(api_field) is None: + self._properties[api_field] = {} + self._properties[api_field]["useLegacySql"] = value + + @property + def mview_query(self): + """Optional[str]: SQL query defining the table as a materialized + view (defaults to :data:`None`). 
+ """ + api_field = self._PROPERTY_TO_API_FIELD["mview_query"] + return _helpers._get_sub_prop(self._properties, [api_field, "query"]) + + @mview_query.setter + def mview_query(self, value): + api_field = self._PROPERTY_TO_API_FIELD["mview_query"] + _helpers._set_sub_prop(self._properties, [api_field, "query"], str(value)) + + @mview_query.deleter + def mview_query(self): + """Delete SQL query defining the table as a materialized view.""" + self._properties.pop(self._PROPERTY_TO_API_FIELD["mview_query"], None) + + @property + def mview_last_refresh_time(self): + """Optional[datetime.datetime]: Datetime at which the materialized view was last + refreshed (:data:`None` until set from the server). + """ + refresh_time = _helpers._get_sub_prop( + self._properties, self._PROPERTY_TO_API_FIELD["mview_last_refresh_time"] + ) + if refresh_time is not None: + # refresh_time will be in milliseconds. + return google.cloud._helpers._datetime_from_microseconds( + 1000 * int(refresh_time) + ) + + @property + def mview_enable_refresh(self): + """Optional[bool]: Enable automatic refresh of the materialized view + when the base table is updated. The default value is :data:`True`. + """ + api_field = self._PROPERTY_TO_API_FIELD["mview_enable_refresh"] + return _helpers._get_sub_prop(self._properties, [api_field, "enableRefresh"]) + + @mview_enable_refresh.setter + def mview_enable_refresh(self, value): + api_field = self._PROPERTY_TO_API_FIELD["mview_enable_refresh"] + return _helpers._set_sub_prop( + self._properties, [api_field, "enableRefresh"], value + ) + + @property + def mview_refresh_interval(self): + """Optional[datetime.timedelta]: The maximum frequency at which this + materialized view will be refreshed. The default value is 1800000 + milliseconds (30 minutes). 
+ """ + api_field = self._PROPERTY_TO_API_FIELD["mview_refresh_interval"] + refresh_interval = _helpers._get_sub_prop( + self._properties, [api_field, "refreshIntervalMs"] + ) + if refresh_interval is not None: + return datetime.timedelta(milliseconds=int(refresh_interval)) + + @mview_refresh_interval.setter + def mview_refresh_interval(self, value): + if value is None: + refresh_interval_ms = None + else: + refresh_interval_ms = str(value // datetime.timedelta(milliseconds=1)) + + api_field = self._PROPERTY_TO_API_FIELD["mview_refresh_interval"] + _helpers._set_sub_prop( + self._properties, [api_field, "refreshIntervalMs"], refresh_interval_ms, + ) @property def streaming_buffer(self): """google.cloud.bigquery.StreamingBuffer: Information about a table's streaming buffer. """ - sb = self._properties.get("streamingBuffer") + sb = self._properties.get(self._PROPERTY_TO_API_FIELD["streaming_buffer"]) if sb is not None: return StreamingBuffer(sb) @@ -793,7 +922,9 @@ def external_data_configuration(self): Raises: ValueError: For invalid value types. """ - prop = self._properties.get("externalDataConfiguration") + prop = self._properties.get( + self._PROPERTY_TO_API_FIELD["external_data_configuration"] + ) if prop is not None: prop = ExternalConfig.from_api_repr(prop) return prop @@ -805,10 +936,25 @@ def external_data_configuration(self, value): api_repr = value if value is not None: api_repr = value.to_api_repr() - self._properties["externalDataConfiguration"] = api_repr + self._properties[ + self._PROPERTY_TO_API_FIELD["external_data_configuration"] + ] = api_repr + + @property + def snapshot_definition(self) -> Optional["SnapshotDefinition"]: + """Information about the snapshot. This value is set via snapshot creation. 
+ + See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.snapshot_definition + """ + snapshot_info = self._properties.get( + self._PROPERTY_TO_API_FIELD["snapshot_definition"] + ) + if snapshot_info is not None: + snapshot_info = SnapshotDefinition(snapshot_info) + return snapshot_info @classmethod - def from_string(cls, full_table_id): + def from_string(cls, full_table_id: str) -> "Table": """Construct a table from fully-qualified table ID. Args: @@ -832,7 +978,7 @@ def from_string(cls, full_table_id): return cls(TableReference.from_string(full_table_id)) @classmethod - def from_api_repr(cls, resource): + def from_api_repr(cls, resource: dict) -> "Table": """Factory: construct a table given its API representation Args: @@ -858,9 +1004,15 @@ def from_api_repr(cls, resource): "Resource lacks required identity information:" '["tableReference"]["tableId"]' ) - project_id = resource["tableReference"]["projectId"] - table_id = resource["tableReference"]["tableId"] - dataset_id = resource["tableReference"]["datasetId"] + project_id = _helpers._get_sub_prop( + resource, cls._PROPERTY_TO_API_FIELD["project"] + ) + table_id = _helpers._get_sub_prop( + resource, cls._PROPERTY_TO_API_FIELD["table_id"] + ) + dataset_id = _helpers._get_sub_prop( + resource, cls._PROPERTY_TO_API_FIELD["dataset_id"] + ) dataset_ref = dataset.DatasetReference(project_id, dataset_id) table = cls(dataset_ref.table(table_id)) @@ -868,7 +1020,7 @@ def from_api_repr(cls, resource): return table - def to_api_repr(self): + def to_api_repr(self) -> dict: """Constructs the API resource of this table Returns: @@ -876,24 +1028,36 @@ def to_api_repr(self): """ return copy.deepcopy(self._properties) - def to_bqstorage(self, v1beta1=False): + def to_bqstorage(self) -> str: """Construct a BigQuery Storage API representation of this table. - Args: - v1beta1 (Optiona[bool]): - If :data:`True`, return representation compatible with BigQuery - Storage ``v1beta1`` version. 
Defaults to :data:`False`. - Returns: - Union[str, google.cloud.bigquery_storage_v1beta1.types.TableReference:]: - A reference to this table in the BigQuery Storage API. + str: A reference to this table in the BigQuery Storage API. """ - return self.reference.to_bqstorage(v1beta1=v1beta1) + return self.reference.to_bqstorage() def _build_resource(self, filter_fields): """Generate a resource for ``update``.""" return _helpers._build_resource_from_properties(self, filter_fields) + def __eq__(self, other): + if isinstance(other, Table): + return ( + self._properties["tableReference"] + == other._properties["tableReference"] + ) + elif isinstance(other, (TableReference, TableListItem)): + return ( + self.project == other.project + and self.dataset_id == other.dataset_id + and self.table_id == other.table_id + ) + else: + return NotImplemented + + def __hash__(self): + return hash((self.project, self.dataset_id, self.table_id)) + def __repr__(self): return "Table({})".format(repr(self.reference)) @@ -1063,15 +1227,15 @@ def clustering_fields(self): .. note:: - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. + BigQuery supports clustering for both partitioned and + non-partitioned tables. """ prop = self._properties.get("clustering") if prop is not None: return list(prop.get("fields", ())) @classmethod - def from_string(cls, full_table_id): + def from_string(cls, full_table_id: str) -> "TableListItem": """Construct a table from fully-qualified table ID. Args: @@ -1096,19 +1260,34 @@ def from_string(cls, full_table_id): {"tableReference": TableReference.from_string(full_table_id).to_api_repr()} ) - def to_bqstorage(self, v1beta1=False): + def to_bqstorage(self) -> str: """Construct a BigQuery Storage API representation of this table. - Args: - v1beta1 (Optiona[bool]): - If :data:`True`, return representation compatible with BigQuery - Storage ``v1beta1`` version. Defaults to :data:`False`. 
+ Returns: + str: A reference to this table in the BigQuery Storage API. + """ + return self.reference.to_bqstorage() + + def to_api_repr(self) -> dict: + """Constructs the API resource of this table Returns: - Union[str, google.cloud.bigquery_storage_v1beta1.types.TableReference:]: - A reference to this table in the BigQuery Storage API. + Dict[str, object]: Table represented as an API resource """ - return self.reference.to_bqstorage(v1beta1=v1beta1) + return copy.deepcopy(self._properties) + + def __eq__(self, other): + if isinstance(other, (Table, TableReference, TableListItem)): + return ( + self.project == other.project + and self.dataset_id == other.dataset_id + and self.table_id == other.table_id + ) + else: + return NotImplemented + + def __hash__(self): + return hash((self.project, self.dataset_id, self.table_id)) def _row_from_mapping(mapping, schema): @@ -1169,6 +1348,29 @@ def __init__(self, resource): ) +class SnapshotDefinition: + """Information about base table and snapshot time of the snapshot. + + See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#snapshotdefinition + + Args: + resource: Snapshot definition representation returned from the API. + """ + + def __init__(self, resource: Dict[str, Any]): + self.base_table_reference = None + if "baseTableReference" in resource: + self.base_table_reference = TableReference.from_api_repr( + resource["baseTableReference"] + ) + + self.snapshot_time = None + if "snapshotTime" in resource: + self.snapshot_time = google.cloud._helpers._rfc3339_to_datetime( + resource["snapshotTime"] + ) + + class Row(object): """A BigQuery row. @@ -1196,7 +1398,7 @@ def values(self): """ return copy.deepcopy(self._xxx_values) - def keys(self): + def keys(self) -> Iterable[str]: """Return the keys for using a row as a dict. 
Returns: @@ -1207,9 +1409,9 @@ def keys(self): >>> list(Row(('a', 'b'), {'x': 0, 'y': 1}).keys()) ['x', 'y'] """ - return six.iterkeys(self._xxx_field_to_index) + return self._xxx_field_to_index.keys() - def items(self): + def items(self) -> Iterable[Tuple[str, Any]]: """Return items as ``(key, value)`` pairs. Returns: @@ -1221,10 +1423,10 @@ def items(self): >>> list(Row(('a', 'b'), {'x': 0, 'y': 1}).items()) [('x', 'a'), ('y', 'b')] """ - for key, index in six.iteritems(self._xxx_field_to_index): + for key, index in self._xxx_field_to_index.items(): yield (key, copy.deepcopy(self._xxx_values[index])) - def get(self, key, default=None): + def get(self, key: str, default: Any = None) -> Any: """Return a value for key, with a default value if it does not exist. Args: @@ -1271,7 +1473,7 @@ def __len__(self): return len(self._xxx_values) def __getitem__(self, key): - if isinstance(key, six.string_types): + if isinstance(key, str): value = self._xxx_field_to_index.get(key) if value is None: raise KeyError("no row field {!r}".format(key)) @@ -1310,7 +1512,9 @@ class RowIterator(HTTPIterator): """A class for iterating through HTTP/JSON API row list responses. Args: - client (google.cloud.bigquery.Client): The API client. + client (Optional[google.cloud.bigquery.Client]): + The API client instance. This should always be non-`None`, except for + subclasses that do not use it, namely the ``_EmptyRowIterator``. api_request (Callable[google.cloud._http.JSONConnection.api_request]): The function to use to make API requests. path (str): The method path to query for the list of items. @@ -1337,7 +1541,11 @@ class RowIterator(HTTPIterator): call the BigQuery Storage API to fetch rows. selected_fields (Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]): A subset of columns to select from this table. - + total_rows (Optional[int]): + Total number of rows in the table. + first_page_response (Optional[dict]): + API response for the first page of results. 
These are returned when + the first page is requested. """ def __init__( @@ -1352,6 +1560,8 @@ def __init__( extra_params=None, table=None, selected_fields=None, + total_rows=None, + first_page_response=None, ): super(RowIterator, self).__init__( client, @@ -1369,11 +1579,53 @@ def __init__( self._field_to_index = _helpers._field_to_index_mapping(schema) self._page_size = page_size self._preserve_order = False - self._project = client.project + self._project = client.project if client is not None else None self._schema = schema self._selected_fields = selected_fields self._table = table - self._total_rows = getattr(table, "num_rows", None) + self._total_rows = total_rows + self._first_page_response = first_page_response + + def _is_completely_cached(self): + """Check if all results are completely cached. + + This is useful to know, because we can avoid alternative download + mechanisms. + """ + if self._first_page_response is None or self.next_page_token: + return False + + return self._first_page_response.get(self._next_token) is None + + def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): + """Returns if the BigQuery Storage API can be used. + + Returns: + bool + True if the BigQuery Storage client can be used or created. + """ + using_bqstorage_api = bqstorage_client or create_bqstorage_client + if not using_bqstorage_api: + return False + + if self._is_completely_cached(): + return False + + if self.max_results is not None: + return False + + try: + from google.cloud import bigquery_storage # noqa: F401 + except ImportError: + return False + + try: + _helpers.BQ_STORAGE_VERSIONS.verify_version() + except LegacyBigQueryStorageError as exc: + warnings.warn(str(exc)) + return False + + return True def _get_next_page_response(self): """Requests the next page from the path provided. @@ -1382,6 +1634,11 @@ def _get_next_page_response(self): Dict[str, object]: The parsed JSON response of the next page's contents. 
""" + if self._first_page_response: + response = self._first_page_response + self._first_page_response = None + return response + params = self._get_query_params() if self._page_size is not None: if self.page_number and "startIndex" in params: @@ -1402,42 +1659,37 @@ def total_rows(self): """int: The total number of rows in the table.""" return self._total_rows - def _get_progress_bar(self, progress_bar_type): - """Construct a tqdm progress bar object, if tqdm is installed.""" - if tqdm is None: - if progress_bar_type is not None: - warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3) - return None + def _maybe_warn_max_results( + self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"], + ): + """Issue a warning if BQ Storage client is not ``None`` with ``max_results`` set. - description = "Downloading" - unit = "rows" + This helper method should be used directly in the relevant top-level public + methods, so that the warning is issued for the correct line in user code. - try: - if progress_bar_type == "tqdm": - return tqdm.tqdm(desc=description, total=self.total_rows, unit=unit) - elif progress_bar_type == "tqdm_notebook": - return tqdm.tqdm_notebook( - desc=description, total=self.total_rows, unit=unit - ) - elif progress_bar_type == "tqdm_gui": - return tqdm.tqdm_gui(desc=description, total=self.total_rows, unit=unit) - except (KeyError, TypeError): - # Protect ourselves from any tqdm errors. In case of - # unexpected tqdm behavior, just fall back to showing - # no progress bar. - warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3) - return None + Args: + bqstorage_client: + The BigQuery Storage client intended to use for downloading result rows. 
+ """ + if bqstorage_client is not None and self.max_results is not None: + warnings.warn( + "Cannot use bqstorage_client if max_results is set, " + "reverting to fetching data with the REST endpoint.", + stacklevel=3, + ) def _to_page_iterable( self, bqstorage_download, tabledata_list_download, bqstorage_client=None ): - if bqstorage_client is not None: - for item in bqstorage_download(): - yield item - return + if not self._validate_bqstorage(bqstorage_client, False): + bqstorage_client = None - for item in tabledata_list_download(): - yield item + result_pages = ( + bqstorage_download() + if bqstorage_client is not None + else tabledata_list_download() + ) + yield from result_pages def _to_arrow_iterable(self, bqstorage_client=None): """Create an iterable of arrow RecordBatches, to process the table as a stream.""" @@ -1450,7 +1702,7 @@ def _to_arrow_iterable(self, bqstorage_client=None): selected_fields=self._selected_fields, ) tabledata_list_download = functools.partial( - _pandas_helpers.download_arrow_tabledata_list, iter(self.pages), self.schema + _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema ) return self._to_page_iterable( bqstorage_download, @@ -1462,10 +1714,10 @@ def _to_arrow_iterable(self, bqstorage_client=None): # changes to job.QueryJob.to_arrow() def to_arrow( self, - progress_bar_type=None, - bqstorage_client=None, - create_bqstorage_client=True, - ): + progress_bar_type: str = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + create_bqstorage_client: bool = True, + ) -> "pyarrow.Table": """[Beta] Create a class:`pyarrow.Table` by loading all pages of a table or query. @@ -1506,7 +1758,7 @@ def to_arrow( This argument does nothing if ``bqstorage_client`` is supplied. - ..versionadded:: 1.24.0 + .. versionadded:: 1.24.0 Returns: pyarrow.Table @@ -1517,29 +1769,26 @@ def to_arrow( Raises: ValueError: If the :mod:`pyarrow` library cannot be imported. - ..versionadded:: 1.17.0 + .. 
versionadded:: 1.17.0 """ if pyarrow is None: raise ValueError(_NO_PYARROW_ERROR) - if ( - bqstorage_client or create_bqstorage_client - ) and self.max_results is not None: - warnings.warn( - "Cannot use bqstorage_client if max_results is set, " - "reverting to fetching data with the tabledata.list endpoint.", - stacklevel=2, - ) + self._maybe_warn_max_results(bqstorage_client) + + if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): create_bqstorage_client = False bqstorage_client = None owns_bqstorage_client = False if not bqstorage_client and create_bqstorage_client: - bqstorage_client = self.client._create_bqstorage_client() + bqstorage_client = self.client._ensure_bqstorage_client() owns_bqstorage_client = bqstorage_client is not None try: - progress_bar = self._get_progress_bar(progress_bar_type) + progress_bar = get_progress_bar( + progress_bar_type, "Downloading", self.total_rows, "rows" + ) record_batches = [] for record_batch in self._to_arrow_iterable( @@ -1559,7 +1808,7 @@ def to_arrow( progress_bar.close() finally: if owns_bqstorage_client: - bqstorage_client.transport.channel.close() + bqstorage_client._transport.grpc_channel.close() if record_batches: return pyarrow.Table.from_batches(record_batches) @@ -1568,7 +1817,12 @@ def to_arrow( arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema) return pyarrow.Table.from_batches(record_batches, schema=arrow_schema) - def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None): + def to_dataframe_iterable( + self, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, + ) -> "pandas.DataFrame": """Create an iterable of pandas DataFrames, to process the table as a stream. Args: @@ -1588,6 +1842,17 @@ def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None): ``dtype`` is used when constructing the series for the column specified. 
Otherwise, the default pandas behavior is used. + max_queue_size (Optional[int]): + The maximum number of result pages to hold in the internal queue when + streaming query results over the BigQuery Storage API. Ignored if + Storage API is not used. + + By default, the max queue size is set to the number of BQ Storage streams + created by the server. If ``max_queue_size`` is :data:`None`, the queue + size is infinite. + + .. versionadded:: 2.14.0 + Returns: pandas.DataFrame: A generator of :class:`~pandas.DataFrame`. @@ -1601,6 +1866,8 @@ def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None): if dtypes is None: dtypes = {} + self._maybe_warn_max_results(bqstorage_client) + column_names = [field.name for field in self._schema] bqstorage_download = functools.partial( _pandas_helpers.download_dataframe_bqstorage, @@ -1611,9 +1878,10 @@ def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None): dtypes, preserve_order=self._preserve_order, selected_fields=self._selected_fields, + max_queue_size=max_queue_size, ) tabledata_list_download = functools.partial( - _pandas_helpers.download_dataframe_tabledata_list, + _pandas_helpers.download_dataframe_row_iterator, iter(self.pages), self.schema, dtypes, @@ -1628,12 +1896,13 @@ def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None): # changes to job.QueryJob.to_dataframe() def to_dataframe( self, - bqstorage_client=None, - dtypes=None, - progress_bar_type=None, - create_bqstorage_client=True, - date_as_object=True, - ): + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + progress_bar_type: str = None, + create_bqstorage_client: bool = True, + date_as_object: bool = True, + geography_as_object: bool = False, + ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. Args: @@ -1671,7 +1940,7 @@ def to_dataframe( Use the :func:`tqdm.tqdm_gui` function to display a progress bar as a graphical dialog box. 
- ..versionadded:: 1.11.0 + .. versionadded:: 1.11.0 create_bqstorage_client (Optional[bool]): If ``True`` (default), create a BigQuery Storage API client using the default API settings. The BigQuery Storage API @@ -1680,13 +1949,20 @@ def to_dataframe( This argument does nothing if ``bqstorage_client`` is supplied. - ..versionadded:: 1.24.0 + .. versionadded:: 1.24.0 date_as_object (Optional[bool]): If ``True`` (default), cast dates to objects. If ``False``, convert to datetime64[ns] dtype. - ..versionadded:: 1.26.0 + .. versionadded:: 1.26.0 + + geography_as_object (Optional[bool]): + If ``True``, convert GEOGRAPHY data to :mod:`shapely` + geometry objects. If ``False`` (default), don't cast + geography data to :mod:`shapely` geometry objects. + + .. versionadded:: 2.24.0 Returns: pandas.DataFrame: @@ -1696,105 +1972,190 @@ def to_dataframe( Raises: ValueError: - If the :mod:`pandas` library cannot be imported, or the - :mod:`google.cloud.bigquery_storage_v1` module is - required but cannot be imported. + If the :mod:`pandas` library cannot be imported, or + the :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. Also if + `geography_as_object` is `True`, but the + :mod:`shapely` library cannot be imported. 
""" if pandas is None: raise ValueError(_NO_PANDAS_ERROR) + if geography_as_object and shapely is None: + raise ValueError(_NO_SHAPELY_ERROR) + if dtypes is None: dtypes = {} - if ( - bqstorage_client or create_bqstorage_client - ) and self.max_results is not None: - warnings.warn( - "Cannot use bqstorage_client if max_results is set, " - "reverting to fetching data with the tabledata.list endpoint.", - stacklevel=2, - ) + self._maybe_warn_max_results(bqstorage_client) + + if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): create_bqstorage_client = False bqstorage_client = None - if pyarrow is not None: - # If pyarrow is available, calling to_arrow, then converting to a - # pandas dataframe is about 2x faster. This is because pandas.concat is - # rarely no-copy, whereas pyarrow.Table.from_batches + to_pandas is - # usually no-copy. - record_batch = self.to_arrow( - progress_bar_type=progress_bar_type, - bqstorage_client=bqstorage_client, - create_bqstorage_client=create_bqstorage_client, - ) + record_batch = self.to_arrow( + progress_bar_type=progress_bar_type, + bqstorage_client=bqstorage_client, + create_bqstorage_client=create_bqstorage_client, + ) + + # When converting timestamp values to nanosecond precision, the result + # can be out of pyarrow bounds. To avoid the error when converting to + # Pandas, we set the timestamp_as_object parameter to True, if necessary. + types_to_check = { + pyarrow.timestamp("us"), + pyarrow.timestamp("us", tz=datetime.timezone.utc), + } - # When converting timestamp values to nanosecond precision, the result - # can be out of pyarrow bounds. To avoid the error when converting to - # Pandas, we set the timestamp_as_object parameter to True, if necessary. - # - # NOTE: Python 3+ only, as timestamp_as_object parameter is only supported - # in pyarrow>=1.0, but the latter is not compatible with Python 2. 
- if six.PY2: - extra_kwargs = {} - else: - types_to_check = { - pyarrow.timestamp("us"), - pyarrow.timestamp("us", tz=pytz.UTC), - } - - for column in record_batch: - if column.type in types_to_check: - try: - column.cast("timestamp[ns]") - except pyarrow.lib.ArrowInvalid: - timestamp_as_object = True - break - else: - timestamp_as_object = False - - extra_kwargs = {"timestamp_as_object": timestamp_as_object} - - df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs) - - for column in dtypes: - df[column] = pandas.Series(df[column], dtype=dtypes[column]) - return df + for column in record_batch: + if column.type in types_to_check: + try: + column.cast("timestamp[ns]") + except pyarrow.lib.ArrowInvalid: + timestamp_as_object = True + break else: - warnings.warn( - "Converting to a dataframe without pyarrow installed is " - "often slower and will become unsupported in the future. " - "Please install the pyarrow package.", - PyarrowMissingWarning, - stacklevel=2, - ) + timestamp_as_object = False - # The bqstorage_client is only used if pyarrow is available, so the - # rest of this method only needs to account for tabledata.list. - progress_bar = self._get_progress_bar(progress_bar_type) + extra_kwargs = {"timestamp_as_object": timestamp_as_object} - frames = [] - for frame in self.to_dataframe_iterable(dtypes=dtypes): - frames.append(frame) + df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs) - if progress_bar is not None: - # In some cases, the number of total rows is not populated - # until the first page of rows is fetched. Update the - # progress bar's total to keep an accurate count. 
- progress_bar.total = progress_bar.total or self.total_rows - progress_bar.update(len(frame)) + for column in dtypes: + df[column] = pandas.Series(df[column], dtype=dtypes[column]) + + if geography_as_object: + for field in self.schema: + if field.field_type.upper() == "GEOGRAPHY": + df[field.name] = df[field.name].dropna().apply(_read_wkt) + + return df + + # If changing the signature of this method, make sure to apply the same + # changes to job.QueryJob.to_geodataframe() + def to_geodataframe( + self, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + progress_bar_type: str = None, + create_bqstorage_client: bool = True, + date_as_object: bool = True, + geography_column: Optional[str] = None, + ) -> "geopandas.GeoDataFrame": + """Create a GeoPandas GeoDataFrame by loading all pages of a query. + + Args: + bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): + A BigQuery Storage API client. If supplied, use the faster + BigQuery Storage API to fetch rows from BigQuery. + + This method requires the ``pyarrow`` and + ``google-cloud-bigquery-storage`` libraries. + + This method only exposes a subset of the capabilities of the + BigQuery Storage API. For full access to all features + (projections, filters, snapshots) use the Storage API directly. + + dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): + A dictionary of column names pandas ``dtype``s. The provided + ``dtype`` is used when constructing the series for the column + specified. Otherwise, the default pandas behavior is used. + progress_bar_type (Optional[str]): + If set, use the `tqdm `_ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + Possible values of ``progress_bar_type`` include: + + ``None`` + No progress bar. + ``'tqdm'`` + Use the :func:`tqdm.tqdm` function to print a progress bar + to :data:`sys.stderr`. 
+ ``'tqdm_notebook'`` + Use the :func:`tqdm.tqdm_notebook` function to display a + progress bar as a Jupyter notebook widget. + ``'tqdm_gui'`` + Use the :func:`tqdm.tqdm_gui` function to display a + progress bar as a graphical dialog box. + + create_bqstorage_client (Optional[bool]): + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. The BigQuery Storage API + is a faster way to fetch rows from BigQuery. See the + ``bqstorage_client`` parameter for more information. - if progress_bar is not None: - # Indicate that the download has finished. - progress_bar.close() + This argument does nothing if ``bqstorage_client`` is supplied. - # Avoid concatting an empty list. - if not frames: - column_names = [field.name for field in self._schema] - return pandas.DataFrame(columns=column_names) - return pandas.concat(frames, ignore_index=True) + date_as_object (Optional[bool]): + If ``True`` (default), cast dates to objects. If ``False``, convert + to datetime64[ns] dtype. + + geography_column (Optional[str]): + If there are more than one GEOGRAPHY column, + identifies which one to use to construct a geopandas + GeoDataFrame. This option can be ommitted if there's + only one GEOGRAPHY column. + Returns: + geopandas.GeoDataFrame: + A :class:`geopandas.GeoDataFrame` populated with row + data and column headers from the query results. The + column headers are derived from the destination + table's schema. -class _EmptyRowIterator(object): + Raises: + ValueError: + If the :mod:`geopandas` library cannot be imported, or the + :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. + + .. 
versionadded:: 2.24.0 + """ + if geopandas is None: + raise ValueError(_NO_GEOPANDAS_ERROR) + + geography_columns = set( + field.name + for field in self.schema + if field.field_type.upper() == "GEOGRAPHY" + ) + if not geography_columns: + raise TypeError( + "There must be at least one GEOGRAPHY column" + " to create a GeoDataFrame" + ) + + if geography_column: + if geography_column not in geography_columns: + raise ValueError( + f"The given geography column, {geography_column}, doesn't name" + f" a GEOGRAPHY column in the result." + ) + elif len(geography_columns) == 1: + [geography_column] = geography_columns + else: + raise ValueError( + "There is more than one GEOGRAPHY column in the result. " + "The geography_column argument must be used to specify which " + "one to use to create a GeoDataFrame" + ) + + df = self.to_dataframe( + bqstorage_client, + dtypes, + progress_bar_type, + create_bqstorage_client, + date_as_object, + geography_as_object=True, + ) + + return geopandas.GeoDataFrame( + df, crs=_COORDINATE_REFERENCE_SYSTEM, geometry=geography_column + ) + + +class _EmptyRowIterator(RowIterator): """An empty row iterator. This class prevents API requests when there are no rows to fetch or rows @@ -1806,12 +2167,24 @@ class _EmptyRowIterator(object): pages = () total_rows = 0 + def __init__( + self, client=None, api_request=None, path=None, schema=(), *args, **kwargs + ): + super().__init__( + client=client, + api_request=api_request, + path=path, + schema=schema, + *args, + **kwargs, + ) + def to_arrow( self, progress_bar_type=None, bqstorage_client=None, create_bqstorage_client=True, - ): + ) -> "pyarrow.Table": """[Beta] Create an empty class:`pyarrow.Table`. Args: @@ -1833,7 +2206,8 @@ def to_dataframe( progress_bar_type=None, create_bqstorage_client=True, date_as_object=True, - ): + geography_as_object=False, + ) -> "pandas.DataFrame": """Create an empty dataframe. 
Args: @@ -1850,6 +2224,62 @@ def to_dataframe( raise ValueError(_NO_PANDAS_ERROR) return pandas.DataFrame() + def to_geodataframe( + self, + bqstorage_client=None, + dtypes=None, + progress_bar_type=None, + create_bqstorage_client=True, + date_as_object=True, + geography_column: Optional[str] = None, + ) -> "pandas.DataFrame": + """Create an empty dataframe. + + Args: + bqstorage_client (Any): Ignored. Added for compatibility with RowIterator. + dtypes (Any): Ignored. Added for compatibility with RowIterator. + progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. + create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. + date_as_object (bool): Ignored. Added for compatibility with RowIterator. + + Returns: + pandas.DataFrame: An empty :class:`~pandas.DataFrame`. + """ + if geopandas is None: + raise ValueError(_NO_GEOPANDAS_ERROR) + return geopandas.GeoDataFrame(crs=_COORDINATE_REFERENCE_SYSTEM) + + def to_dataframe_iterable( + self, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + dtypes: Optional[Dict[str, Any]] = None, + max_queue_size: Optional[int] = None, + ) -> Iterator["pandas.DataFrame"]: + """Create an iterable of pandas DataFrames, to process the table as a stream. + + .. versionadded:: 2.21.0 + + Args: + bqstorage_client: + Ignored. Added for compatibility with RowIterator. + + dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): + Ignored. Added for compatibility with RowIterator. + + max_queue_size: + Ignored. Added for compatibility with RowIterator. + + Returns: + An iterator yielding a single empty :class:`~pandas.DataFrame`. + + Raises: + ValueError: + If the :mod:`pandas` library cannot be imported. 
+ """ + if pandas is None: + raise ValueError(_NO_PANDAS_ERROR) + return iter((pandas.DataFrame(),)) + def __iter__(self): return iter(()) @@ -2024,6 +2454,12 @@ class TimePartitioningType(object): HOUR = "HOUR" """str: Generates one partition per hour.""" + MONTH = "MONTH" + """str: Generates one partition per month.""" + + YEAR = "YEAR" + """str: Generates one partition per year.""" + class TimePartitioning(object): """Configures time-based partitioning for a table. @@ -2031,13 +2467,24 @@ class TimePartitioning(object): Args: type_ (Optional[google.cloud.bigquery.table.TimePartitioningType]): Specifies the type of time partitioning to perform. Defaults to - :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`, - which is the only currently supported type. + :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`. + + Supported values are: + + * :attr:`~google.cloud.bigquery.table.TimePartitioningType.HOUR` + * :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY` + * :attr:`~google.cloud.bigquery.table.TimePartitioningType.MONTH` + * :attr:`~google.cloud.bigquery.table.TimePartitioningType.YEAR` + field (Optional[str]): If set, the table is partitioned by this field. If not set, the table is partitioned by pseudo column ``_PARTITIONTIME``. The field - must be a top-level ``TIMESTAMP`` or ``DATE`` field. Its mode must - be ``NULLABLE`` or ``REQUIRED``. + must be a top-level ``TIMESTAMP``, ``DATETIME``, or ``DATE`` + field. Its mode must be ``NULLABLE`` or ``REQUIRED``. + + See the `time-unit column-partitioned tables guide + `_ + in the BigQuery documentation. expiration_ms(Optional[int]): Number of milliseconds for which to keep the storage for a partition. 
@@ -2127,7 +2574,7 @@ def require_partition_filter(self, value): self._properties["requirePartitionFilter"] = value @classmethod - def from_api_repr(cls, api_repr): + def from_api_repr(cls, api_repr: dict) -> "TimePartitioning": """Return a :class:`TimePartitioning` object deserialized from a dict. This method creates a new ``TimePartitioning`` instance that points to @@ -2155,7 +2602,7 @@ def from_api_repr(cls, api_repr): instance._properties = api_repr return instance - def to_api_repr(self): + def to_api_repr(self) -> dict: """Return a dictionary representing this object. This method returns the properties dict of the ``TimePartitioning`` @@ -2171,7 +2618,20 @@ def to_api_repr(self): return self._properties def _key(self): - return tuple(sorted(self._properties.items())) + # because we are only "renaming" top level keys shallow copy is sufficient here. + properties = self._properties.copy() + # calling repr for non built-in type objects. + properties["type_"] = repr(properties.pop("type")) + if "field" in properties: + # calling repr for non built-in type objects. + properties["field"] = repr(properties["field"]) + if "requirePartitionFilter" in properties: + properties["require_partition_filter"] = properties.pop( + "requirePartitionFilter" + ) + if "expirationMs" in properties: + properties["expiration_ms"] = properties.pop("expirationMs") + return tuple(sorted(properties.items())) def __eq__(self, other): if not isinstance(other, TimePartitioning): @@ -2211,7 +2671,7 @@ def _item_to_row(iterator, resource): ) -def _tabledata_list_page_columns(schema, response): +def _row_iterator_page_columns(schema, response): """Make a generator of all the columns in a page from tabledata.list. This enables creating a :class:`pandas.DataFrame` and other @@ -2241,7 +2701,7 @@ def _rows_page_start(iterator, page, response): """ # Make a (lazy) copy of the page in column-oriented format for use in data # science packages. 
- page._columns = _tabledata_list_page_columns(iterator._schema, response) + page._columns = _row_iterator_page_columns(iterator._schema, response) total_rows = response.get("totalRows") if total_rows is not None: @@ -2257,7 +2717,7 @@ def _table_arg_to_table_ref(value, default_project=None): This function keeps TableReference and other kinds of objects unchanged. """ - if isinstance(value, six.string_types): + if isinstance(value, str): value = TableReference.from_string(value, default_project=default_project) if isinstance(value, (Table, TableListItem)): value = value.reference @@ -2269,7 +2729,7 @@ def _table_arg_to_table(value, default_project=None): This function keeps Table and other kinds of objects unchanged. """ - if isinstance(value, six.string_types): + if isinstance(value, str): value = TableReference.from_string(value, default_project=default_project) if isinstance(value, TableReference): value = Table(value) diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py new file mode 100644 index 000000000..21cbec9fe --- /dev/null +++ b/google/cloud/bigquery/version.py @@ -0,0 +1,15 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__version__ = "2.25.1" diff --git a/google/cloud/bigquery_v2/__init__.py b/google/cloud/bigquery_v2/__init__.py index e58221432..f9957efa9 100644 --- a/google/cloud/bigquery_v2/__init__.py +++ b/google/cloud/bigquery_v2/__init__.py @@ -1,33 +1,46 @@ # -*- coding: utf-8 -*- -# -# Copyright 2018 Google LLC +# Copyright 2020 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# https://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -from __future__ import absolute_import - -import pkg_resources - -__version__ = pkg_resources.get_distribution("google-cloud-bigquery").version # noqa - -from google.cloud.bigquery_v2 import types -from google.cloud.bigquery_v2.gapic import enums +from .types.encryption_config import EncryptionConfiguration +from .types.model import DeleteModelRequest +from .types.model import GetModelRequest +from .types.model import ListModelsRequest +from .types.model import ListModelsResponse +from .types.model import Model +from .types.model import PatchModelRequest +from .types.model_reference import ModelReference +from .types.standard_sql import StandardSqlDataType +from .types.standard_sql import StandardSqlField +from .types.standard_sql import StandardSqlStructType +from .types.standard_sql import StandardSqlTableType +from .types.table_reference import TableReference __all__ = ( - # google.cloud.bigquery_v2 - "__version__", - "types", - # google.cloud.bigquery_v2 - "enums", + "DeleteModelRequest", + "EncryptionConfiguration", + "GetModelRequest", + 
"ListModelsRequest", + "ListModelsResponse", + "Model", + "ModelReference", + "PatchModelRequest", + "StandardSqlDataType", + "StandardSqlField", + "StandardSqlStructType", + "StandardSqlTableType", + "TableReference", ) diff --git a/google/cloud/bigquery_v2/gapic/enums.py b/google/cloud/bigquery_v2/gapic/enums.py deleted file mode 100644 index 10d7c2517..000000000 --- a/google/cloud/bigquery_v2/gapic/enums.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Wrappers for protocol buffer enum types.""" - -import enum - - -class Model(object): - class DataSplitMethod(enum.IntEnum): - """ - Indicates the method to split input data into multiple tables. - - Attributes: - DATA_SPLIT_METHOD_UNSPECIFIED (int) - RANDOM (int): Splits data randomly. - CUSTOM (int): Splits data with the user provided tags. - SEQUENTIAL (int): Splits data sequentially. - NO_SPLIT (int): Data split will be skipped. - AUTO_SPLIT (int): Splits data automatically: Uses NO_SPLIT if the data size is small. - Otherwise uses RANDOM. - """ - - DATA_SPLIT_METHOD_UNSPECIFIED = 0 - RANDOM = 1 - CUSTOM = 2 - SEQUENTIAL = 3 - NO_SPLIT = 4 - AUTO_SPLIT = 5 - - class DistanceType(enum.IntEnum): - """ - Distance metric used to compute the distance between two points. - - Attributes: - DISTANCE_TYPE_UNSPECIFIED (int) - EUCLIDEAN (int): Eculidean distance. - COSINE (int): Cosine distance. 
- """ - - DISTANCE_TYPE_UNSPECIFIED = 0 - EUCLIDEAN = 1 - COSINE = 2 - - class LearnRateStrategy(enum.IntEnum): - """ - Indicates the learning rate optimization strategy to use. - - Attributes: - LEARN_RATE_STRATEGY_UNSPECIFIED (int) - LINE_SEARCH (int): Use line search to determine learning rate. - CONSTANT (int): Use a constant learning rate. - """ - - LEARN_RATE_STRATEGY_UNSPECIFIED = 0 - LINE_SEARCH = 1 - CONSTANT = 2 - - class LossType(enum.IntEnum): - """ - Loss metric to evaluate model training performance. - - Attributes: - LOSS_TYPE_UNSPECIFIED (int) - MEAN_SQUARED_LOSS (int): Mean squared loss, used for linear regression. - MEAN_LOG_LOSS (int): Mean log loss, used for logistic regression. - """ - - LOSS_TYPE_UNSPECIFIED = 0 - MEAN_SQUARED_LOSS = 1 - MEAN_LOG_LOSS = 2 - - class ModelType(enum.IntEnum): - """ - Indicates the type of the Model. - - Attributes: - MODEL_TYPE_UNSPECIFIED (int) - LINEAR_REGRESSION (int): Linear regression model. - LOGISTIC_REGRESSION (int): Logistic regression based classification model. - KMEANS (int): K-means clustering model. - TENSORFLOW (int): [Beta] An imported TensorFlow model. - """ - - MODEL_TYPE_UNSPECIFIED = 0 - LINEAR_REGRESSION = 1 - LOGISTIC_REGRESSION = 2 - KMEANS = 3 - TENSORFLOW = 6 - - class OptimizationStrategy(enum.IntEnum): - """ - Indicates the optimization strategy used for training. - - Attributes: - OPTIMIZATION_STRATEGY_UNSPECIFIED (int) - BATCH_GRADIENT_DESCENT (int): Uses an iterative batch gradient descent algorithm. - NORMAL_EQUATION (int): Uses a normal equation to solve linear regression problem. - """ - - OPTIMIZATION_STRATEGY_UNSPECIFIED = 0 - BATCH_GRADIENT_DESCENT = 1 - NORMAL_EQUATION = 2 - - class KmeansEnums(object): - class KmeansInitializationMethod(enum.IntEnum): - """ - Indicates the method used to initialize the centroids for KMeans - clustering algorithm. - - Attributes: - KMEANS_INITIALIZATION_METHOD_UNSPECIFIED (int) - RANDOM (int): Initializes the centroids randomly. 
- CUSTOM (int): Initializes the centroids using data specified in - kmeans_initialization_column. - """ - - KMEANS_INITIALIZATION_METHOD_UNSPECIFIED = 0 - RANDOM = 1 - CUSTOM = 2 - - -class StandardSqlDataType(object): - class TypeKind(enum.IntEnum): - """ - Attributes: - TYPE_KIND_UNSPECIFIED (int): Invalid type. - INT64 (int): Encoded as a string in decimal format. - BOOL (int): Encoded as a boolean "false" or "true". - FLOAT64 (int): Encoded as a number, or string "NaN", "Infinity" or "-Infinity". - STRING (int): Encoded as a string value. - BYTES (int): Encoded as a base64 string per RFC 4648, section 4. - TIMESTAMP (int): Encoded as an RFC 3339 timestamp with mandatory "Z" time zone string: - 1985-04-12T23:20:50.52Z - DATE (int): Encoded as RFC 3339 full-date format string: 1985-04-12 - TIME (int): Encoded as RFC 3339 partial-time format string: 23:20:50.52 - DATETIME (int): Encoded as RFC 3339 full-date "T" partial-time: 1985-04-12T23:20:50.52 - GEOGRAPHY (int): Encoded as WKT - NUMERIC (int): Encoded as a decimal string. - ARRAY (int): Encoded as a list with types matching Type.array_type. - STRUCT (int): Encoded as a list with fields of type Type.struct_type[i]. List is - used because a JSON object cannot have duplicate field names. 
- """ - - TYPE_KIND_UNSPECIFIED = 0 - INT64 = 2 - BOOL = 5 - FLOAT64 = 7 - STRING = 8 - BYTES = 9 - TIMESTAMP = 19 - DATE = 10 - TIME = 20 - DATETIME = 21 - GEOGRAPHY = 22 - NUMERIC = 23 - ARRAY = 16 - STRUCT = 17 diff --git a/google/cloud/bigquery_v2/gapic_metadata.json b/google/cloud/bigquery_v2/gapic_metadata.json new file mode 100644 index 000000000..3251a2630 --- /dev/null +++ b/google/cloud/bigquery_v2/gapic_metadata.json @@ -0,0 +1,63 @@ + { + "comment": "This file maps proto services/RPCs to the corresponding library clients/methods", + "language": "python", + "libraryPackage": "google.cloud.bigquery_v2", + "protoPackage": "google.cloud.bigquery.v2", + "schema": "1.0", + "services": { + "ModelService": { + "clients": { + "grpc": { + "libraryClient": "ModelServiceClient", + "rpcs": { + "DeleteModel": { + "methods": [ + "delete_model" + ] + }, + "GetModel": { + "methods": [ + "get_model" + ] + }, + "ListModels": { + "methods": [ + "list_models" + ] + }, + "PatchModel": { + "methods": [ + "patch_model" + ] + } + } + }, + "grpc-async": { + "libraryClient": "ModelServiceAsyncClient", + "rpcs": { + "DeleteModel": { + "methods": [ + "delete_model" + ] + }, + "GetModel": { + "methods": [ + "get_model" + ] + }, + "ListModels": { + "methods": [ + "list_models" + ] + }, + "PatchModel": { + "methods": [ + "patch_model" + ] + } + } + } + } + } + } +} diff --git a/google/cloud/bigquery_v2/proto/encryption_config.proto b/google/cloud/bigquery_v2/proto/encryption_config.proto deleted file mode 100644 index 54445f0fa..000000000 --- a/google/cloud/bigquery_v2/proto/encryption_config.proto +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 Google LLC. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; - -package google.cloud.bigquery.v2; - -import "google/api/field_behavior.proto"; -import "google/protobuf/wrappers.proto"; -import "google/api/annotations.proto"; - -option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery"; -option java_outer_classname = "EncryptionConfigProto"; -option java_package = "com.google.cloud.bigquery.v2"; - -message EncryptionConfiguration { - // Optional. Describes the Cloud KMS encryption key that will be used to - // protect destination BigQuery table. The BigQuery Service Account associated - // with your project requires access to this encryption key. - google.protobuf.StringValue kms_key_name = 1 [(google.api.field_behavior) = OPTIONAL]; -} diff --git a/google/cloud/bigquery_v2/proto/encryption_config_pb2.py b/google/cloud/bigquery_v2/proto/encryption_config_pb2.py deleted file mode 100644 index 5147743b6..000000000 --- a/google/cloud/bigquery_v2/proto/encryption_config_pb2.py +++ /dev/null @@ -1,104 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: google/cloud/bigquery_v2/proto/encryption_config.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.api import field_behavior_pb2 as google_dot_api_dot_field__behavior__pb2 -from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2 -from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name="google/cloud/bigquery_v2/proto/encryption_config.proto", - package="google.cloud.bigquery.v2", - syntax="proto3", - serialized_options=b"\n\034com.google.cloud.bigquery.v2B\025EncryptionConfigProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery", - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n6google/cloud/bigquery_v2/proto/encryption_config.proto\x12\x18google.cloud.bigquery.v2\x1a\x1fgoogle/api/field_behavior.proto\x1a\x1egoogle/protobuf/wrappers.proto\x1a\x1cgoogle/api/annotations.proto"R\n\x17\x45ncryptionConfiguration\x12\x37\n\x0ckms_key_name\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.StringValueB\x03\xe0\x41\x01\x42w\n\x1c\x63om.google.cloud.bigquery.v2B\x15\x45ncryptionConfigProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigqueryb\x06proto3', - dependencies=[ - google_dot_api_dot_field__behavior__pb2.DESCRIPTOR, - google_dot_protobuf_dot_wrappers__pb2.DESCRIPTOR, - google_dot_api_dot_annotations__pb2.DESCRIPTOR, - ], -) - - -_ENCRYPTIONCONFIGURATION = _descriptor.Descriptor( - name="EncryptionConfiguration", - full_name="google.cloud.bigquery.v2.EncryptionConfiguration", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="kms_key_name", 
- full_name="google.cloud.bigquery.v2.EncryptionConfiguration.kms_key_name", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=179, - serialized_end=261, -) - -_ENCRYPTIONCONFIGURATION.fields_by_name[ - "kms_key_name" -].message_type = google_dot_protobuf_dot_wrappers__pb2._STRINGVALUE -DESCRIPTOR.message_types_by_name["EncryptionConfiguration"] = _ENCRYPTIONCONFIGURATION -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -EncryptionConfiguration = _reflection.GeneratedProtocolMessageType( - "EncryptionConfiguration", - (_message.Message,), - { - "DESCRIPTOR": _ENCRYPTIONCONFIGURATION, - "__module__": "google.cloud.bigquery_v2.proto.encryption_config_pb2", - "__doc__": """Encryption configuration. - - Attributes: - kms_key_name: - Optional. Describes the Cloud KMS encryption key that will be - used to protect destination BigQuery table. The BigQuery - Service Account associated with your project requires access - to this encryption key. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.EncryptionConfiguration) - }, -) -_sym_db.RegisterMessage(EncryptionConfiguration) - - -DESCRIPTOR._options = None -_ENCRYPTIONCONFIGURATION.fields_by_name["kms_key_name"]._options = None -# @@protoc_insertion_point(module_scope) diff --git a/google/cloud/bigquery_v2/proto/encryption_config_pb2_grpc.py b/google/cloud/bigquery_v2/proto/encryption_config_pb2_grpc.py deleted file mode 100644 index 8a9393943..000000000 --- a/google/cloud/bigquery_v2/proto/encryption_config_pb2_grpc.py +++ /dev/null @@ -1,3 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc diff --git a/google/cloud/bigquery_v2/proto/location_metadata.proto b/google/cloud/bigquery_v2/proto/location_metadata.proto deleted file mode 100644 index 95a3133c5..000000000 --- a/google/cloud/bigquery_v2/proto/location_metadata.proto +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2019 Google LLC. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; - -package google.cloud.bigquery.v2; - -import "google/api/annotations.proto"; - -option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery"; -option java_outer_classname = "LocationMetadataProto"; -option java_package = "com.google.cloud.bigquery.v2"; - - -// BigQuery-specific metadata about a location. 
This will be set on -// google.cloud.location.Location.metadata in Cloud Location API -// responses. -message LocationMetadata { - // The legacy BigQuery location ID, e.g. “EU” for the “europe” location. - // This is for any API consumers that need the legacy “US” and “EU” locations. - string legacy_location_id = 1; -} diff --git a/google/cloud/bigquery_v2/proto/location_metadata_pb2.py b/google/cloud/bigquery_v2/proto/location_metadata_pb2.py deleted file mode 100644 index 6dd9da52e..000000000 --- a/google/cloud/bigquery_v2/proto/location_metadata_pb2.py +++ /dev/null @@ -1,98 +0,0 @@ -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: google/cloud/bigquery_v2/proto/location_metadata.proto - -import sys - -_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name="google/cloud/bigquery_v2/proto/location_metadata.proto", - package="google.cloud.bigquery.v2", - syntax="proto3", - serialized_options=_b( - "\n\034com.google.cloud.bigquery.v2B\025LocationMetadataProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery" - ), - serialized_pb=_b( - '\n6google/cloud/bigquery_v2/proto/location_metadata.proto\x12\x18google.cloud.bigquery.v2\x1a\x1cgoogle/api/annotations.proto".\n\x10LocationMetadata\x12\x1a\n\x12legacy_location_id\x18\x01 \x01(\tBw\n\x1c\x63om.google.cloud.bigquery.v2B\x15LocationMetadataProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigqueryb\x06proto3' - ), - dependencies=[google_dot_api_dot_annotations__pb2.DESCRIPTOR], -) - - -_LOCATIONMETADATA = 
_descriptor.Descriptor( - name="LocationMetadata", - full_name="google.cloud.bigquery.v2.LocationMetadata", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="legacy_location_id", - full_name="google.cloud.bigquery.v2.LocationMetadata.legacy_location_id", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ) - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=114, - serialized_end=160, -) - -DESCRIPTOR.message_types_by_name["LocationMetadata"] = _LOCATIONMETADATA -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -LocationMetadata = _reflection.GeneratedProtocolMessageType( - "LocationMetadata", - (_message.Message,), - dict( - DESCRIPTOR=_LOCATIONMETADATA, - __module__="google.cloud.bigquery_v2.proto.location_metadata_pb2", - __doc__="""BigQuery-specific metadata about a location. This will be set on - google.cloud.location.Location.metadata in Cloud Location API responses. - - - Attributes: - legacy_location_id: - The legacy BigQuery location ID, e.g. ``EU`` for the ``europe`` - location. This is for any API consumers that need the legacy - ``US`` and ``EU`` locations. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.LocationMetadata) - ), -) -_sym_db.RegisterMessage(LocationMetadata) - - -DESCRIPTOR._options = None -# @@protoc_insertion_point(module_scope) diff --git a/google/cloud/bigquery_v2/proto/location_metadata_pb2_grpc.py b/google/cloud/bigquery_v2/proto/location_metadata_pb2_grpc.py deleted file mode 100644 index 07cb78fe0..000000000 --- a/google/cloud/bigquery_v2/proto/location_metadata_pb2_grpc.py +++ /dev/null @@ -1,2 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -import grpc diff --git a/google/cloud/bigquery_v2/proto/model.proto b/google/cloud/bigquery_v2/proto/model.proto deleted file mode 100644 index 13d980774..000000000 --- a/google/cloud/bigquery_v2/proto/model.proto +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright 2019 Google LLC. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -syntax = "proto3"; - -package google.cloud.bigquery.v2; - -import "google/api/client.proto"; -import "google/api/field_behavior.proto"; -import "google/cloud/bigquery/v2/encryption_config.proto"; -import "google/cloud/bigquery/v2/model_reference.proto"; -import "google/cloud/bigquery/v2/standard_sql.proto"; -import "google/protobuf/empty.proto"; -import "google/protobuf/timestamp.proto"; -import "google/protobuf/wrappers.proto"; -import "google/api/annotations.proto"; - -option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery"; -option java_outer_classname = "ModelProto"; -option java_package = "com.google.cloud.bigquery.v2"; - -service ModelService { - option (google.api.default_host) = "bigquery.googleapis.com"; - option (google.api.oauth_scopes) = - "https://www.googleapis.com/auth/bigquery," - "https://www.googleapis.com/auth/bigquery.readonly," - "https://www.googleapis.com/auth/cloud-platform," - "https://www.googleapis.com/auth/cloud-platform.read-only"; - - // Gets the specified model resource by model ID. - rpc GetModel(GetModelRequest) returns (Model) { - option (google.api.method_signature) = "project_id,dataset_id,model_id"; - } - - // Lists all models in the specified dataset. Requires the READER dataset - // role. - rpc ListModels(ListModelsRequest) returns (ListModelsResponse) { - option (google.api.method_signature) = "project_id,dataset_id,max_results"; - } - - // Patch specific fields in the specified model. - rpc PatchModel(PatchModelRequest) returns (Model) { - option (google.api.method_signature) = "project_id,dataset_id,model_id,model"; - } - - // Deletes the model specified by modelId from the dataset. - rpc DeleteModel(DeleteModelRequest) returns (google.protobuf.Empty) { - option (google.api.method_signature) = "project_id,dataset_id,model_id"; - } -} - -message Model { - message KmeansEnums { - // Indicates the method used to initialize the centroids for KMeans - // clustering algorithm. 
- enum KmeansInitializationMethod { - KMEANS_INITIALIZATION_METHOD_UNSPECIFIED = 0; - - // Initializes the centroids randomly. - RANDOM = 1; - - // Initializes the centroids using data specified in - // kmeans_initialization_column. - CUSTOM = 2; - } - - - } - - // Evaluation metrics for regression and explicit feedback type matrix - // factorization models. - message RegressionMetrics { - // Mean absolute error. - google.protobuf.DoubleValue mean_absolute_error = 1; - - // Mean squared error. - google.protobuf.DoubleValue mean_squared_error = 2; - - // Mean squared log error. - google.protobuf.DoubleValue mean_squared_log_error = 3; - - // Median absolute error. - google.protobuf.DoubleValue median_absolute_error = 4; - - // R^2 score. - google.protobuf.DoubleValue r_squared = 5; - } - - // Aggregate metrics for classification/classifier models. For multi-class - // models, the metrics are either macro-averaged or micro-averaged. When - // macro-averaged, the metrics are calculated for each label and then an - // unweighted average is taken of those values. When micro-averaged, the - // metric is calculated globally by counting the total number of correctly - // predicted rows. - message AggregateClassificationMetrics { - // Precision is the fraction of actual positive predictions that had - // positive actual labels. For multiclass this is a macro-averaged - // metric treating each class as a binary classifier. - google.protobuf.DoubleValue precision = 1; - - // Recall is the fraction of actual positive labels that were given a - // positive prediction. For multiclass this is a macro-averaged metric. - google.protobuf.DoubleValue recall = 2; - - // Accuracy is the fraction of predictions given the correct label. For - // multiclass this is a micro-averaged metric. - google.protobuf.DoubleValue accuracy = 3; - - // Threshold at which the metrics are computed. For binary - // classification models this is the positive class threshold. 
- // For multi-class classfication models this is the confidence - // threshold. - google.protobuf.DoubleValue threshold = 4; - - // The F1 score is an average of recall and precision. For multiclass - // this is a macro-averaged metric. - google.protobuf.DoubleValue f1_score = 5; - - // Logarithmic Loss. For multiclass this is a macro-averaged metric. - google.protobuf.DoubleValue log_loss = 6; - - // Area Under a ROC Curve. For multiclass this is a macro-averaged - // metric. - google.protobuf.DoubleValue roc_auc = 7; - } - - // Evaluation metrics for binary classification/classifier models. - message BinaryClassificationMetrics { - // Confusion matrix for binary classification models. - message BinaryConfusionMatrix { - // Threshold value used when computing each of the following metric. - google.protobuf.DoubleValue positive_class_threshold = 1; - - // Number of true samples predicted as true. - google.protobuf.Int64Value true_positives = 2; - - // Number of false samples predicted as true. - google.protobuf.Int64Value false_positives = 3; - - // Number of true samples predicted as false. - google.protobuf.Int64Value true_negatives = 4; - - // Number of false samples predicted as false. - google.protobuf.Int64Value false_negatives = 5; - - // The fraction of actual positive predictions that had positive actual - // labels. - google.protobuf.DoubleValue precision = 6; - - // The fraction of actual positive labels that were given a positive - // prediction. - google.protobuf.DoubleValue recall = 7; - - // The equally weighted average of recall and precision. - google.protobuf.DoubleValue f1_score = 8; - - // The fraction of predictions given the correct label. - google.protobuf.DoubleValue accuracy = 9; - } - - // Aggregate classification metrics. - AggregateClassificationMetrics aggregate_classification_metrics = 1; - - // Binary confusion matrix at multiple thresholds. 
- repeated BinaryConfusionMatrix binary_confusion_matrix_list = 2; - - // Label representing the positive class. - string positive_label = 3; - - // Label representing the negative class. - string negative_label = 4; - } - - // Evaluation metrics for multi-class classification/classifier models. - message MultiClassClassificationMetrics { - // Confusion matrix for multi-class classification models. - message ConfusionMatrix { - // A single entry in the confusion matrix. - message Entry { - // The predicted label. For confidence_threshold > 0, we will - // also add an entry indicating the number of items under the - // confidence threshold. - string predicted_label = 1; - - // Number of items being predicted as this label. - google.protobuf.Int64Value item_count = 2; - } - - // A single row in the confusion matrix. - message Row { - // The original label of this row. - string actual_label = 1; - - // Info describing predicted label distribution. - repeated Entry entries = 2; - } - - // Confidence threshold used when computing the entries of the - // confusion matrix. - google.protobuf.DoubleValue confidence_threshold = 1; - - // One row per actual label. - repeated Row rows = 2; - } - - // Aggregate classification metrics. - AggregateClassificationMetrics aggregate_classification_metrics = 1; - - // Confusion matrix at different thresholds. - repeated ConfusionMatrix confusion_matrix_list = 2; - } - - // Evaluation metrics for clustering models. - message ClusteringMetrics { - // Message containing the information about one cluster. - message Cluster { - // Representative value of a single feature within the cluster. - message FeatureValue { - // Representative value of a categorical feature. - message CategoricalValue { - // Represents the count of a single category within the cluster. - message CategoryCount { - // The name of category. - string category = 1; - - // The count of training samples matching the category within the - // cluster. 
- google.protobuf.Int64Value count = 2; - } - - // Counts of all categories for the categorical feature. If there are - // more than ten categories, we return top ten (by count) and return - // one more CategoryCount with category "_OTHER_" and count as - // aggregate counts of remaining categories. - repeated CategoryCount category_counts = 1; - } - - // The feature column name. - string feature_column = 1; - - oneof value { - // The numerical feature value. This is the centroid value for this - // feature. - google.protobuf.DoubleValue numerical_value = 2; - - // The categorical feature value. - CategoricalValue categorical_value = 3; - } - } - - // Centroid id. - int64 centroid_id = 1; - - // Values of highly variant features for this cluster. - repeated FeatureValue feature_values = 2; - - // Count of training data rows that were assigned to this cluster. - google.protobuf.Int64Value count = 3; - } - - // Davies-Bouldin index. - google.protobuf.DoubleValue davies_bouldin_index = 1; - - // Mean of squared distances between each sample to its cluster centroid. - google.protobuf.DoubleValue mean_squared_distance = 2; - - // [Beta] Information for all clusters. - repeated Cluster clusters = 3; - } - - // Evaluation metrics of a model. These are either computed on all training - // data or just the eval data based on whether eval data was used during - // training. These are not present for imported models. - message EvaluationMetrics { - oneof metrics { - // Populated for regression models and explicit feedback type matrix - // factorization models. - RegressionMetrics regression_metrics = 1; - - // Populated for binary classification/classifier models. - BinaryClassificationMetrics binary_classification_metrics = 2; - - // Populated for multi-class classification/classifier models. - MultiClassClassificationMetrics multi_class_classification_metrics = 3; - - // Populated for clustering models. 
- ClusteringMetrics clustering_metrics = 4; - } - } - - // Information about a single training query run for the model. - message TrainingRun { - message TrainingOptions { - // The maximum number of iterations in training. Used only for iterative - // training algorithms. - int64 max_iterations = 1; - - // Type of loss function used during training run. - LossType loss_type = 2; - - // Learning rate in training. Used only for iterative training algorithms. - double learn_rate = 3; - - // L1 regularization coefficient. - google.protobuf.DoubleValue l1_regularization = 4; - - // L2 regularization coefficient. - google.protobuf.DoubleValue l2_regularization = 5; - - // When early_stop is true, stops training when accuracy improvement is - // less than 'min_relative_progress'. Used only for iterative training - // algorithms. - google.protobuf.DoubleValue min_relative_progress = 6; - - // Whether to train a model from the last checkpoint. - google.protobuf.BoolValue warm_start = 7; - - // Whether to stop early when the loss doesn't improve significantly - // any more (compared to min_relative_progress). Used only for iterative - // training algorithms. - google.protobuf.BoolValue early_stop = 8; - - // Name of input label columns in training data. - repeated string input_label_columns = 9; - - // The data split type for training and evaluation, e.g. RANDOM. - DataSplitMethod data_split_method = 10; - - // The fraction of evaluation data over the whole input data. The rest - // of data will be used as training data. The format should be double. - // Accurate to two decimal places. - // Default value is 0.2. - double data_split_eval_fraction = 11; - - // The column to split data with. This column won't be used as a - // feature. - // 1. When data_split_method is CUSTOM, the corresponding column should - // be boolean. The rows with true value tag are eval data, and the false - // are training data. - // 2. 
When data_split_method is SEQ, the first DATA_SPLIT_EVAL_FRACTION - // rows (from smallest to largest) in the corresponding column are used - // as training data, and the rest are eval data. It respects the order - // in Orderable data types: - // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data-type-properties - string data_split_column = 12; - - // The strategy to determine learn rate for the current iteration. - LearnRateStrategy learn_rate_strategy = 13; - - // Specifies the initial learning rate for the line search learn rate - // strategy. - double initial_learn_rate = 16; - - // Weights associated with each label class, for rebalancing the - // training data. Only applicable for classification models. - map label_class_weights = 17; - - // Distance type for clustering models. - DistanceType distance_type = 20; - - // Number of clusters for clustering models. - int64 num_clusters = 21; - - // [Beta] Google Cloud Storage URI from which the model was imported. Only - // applicable for imported models. - string model_uri = 22; - - // Optimization strategy for training linear regression models. - OptimizationStrategy optimization_strategy = 23; - - // The method used to initialize the centroids for kmeans algorithm. - KmeansEnums.KmeansInitializationMethod kmeans_initialization_method = 33; - - // The column used to provide the initial centroids for kmeans algorithm - // when kmeans_initialization_method is CUSTOM. - string kmeans_initialization_column = 34; - } - - // Information about a single iteration of the training run. - message IterationResult { - // Information about a single cluster for clustering model. - message ClusterInfo { - // Centroid id. - int64 centroid_id = 1; - - // Cluster radius, the average distance from centroid - // to each point assigned to the cluster. - google.protobuf.DoubleValue cluster_radius = 2; - - // Cluster size, the total number of points assigned to the cluster. 
- google.protobuf.Int64Value cluster_size = 3; - } - - // Index of the iteration, 0 based. - google.protobuf.Int32Value index = 1; - - // Time taken to run the iteration in milliseconds. - google.protobuf.Int64Value duration_ms = 4; - - // Loss computed on the training data at the end of iteration. - google.protobuf.DoubleValue training_loss = 5; - - // Loss computed on the eval data at the end of iteration. - google.protobuf.DoubleValue eval_loss = 6; - - // Learn rate used for this iteration. - double learn_rate = 7; - - // Information about top clusters for clustering models. - repeated ClusterInfo cluster_infos = 8; - } - - // Options that were used for this training run, includes - // user specified and default options that were used. - TrainingOptions training_options = 1; - - // The start time of this training run. - google.protobuf.Timestamp start_time = 8; - - // Output of each iteration run, results.size() <= max_iterations. - repeated IterationResult results = 6; - - // The evaluation metrics over training/eval data that were computed at the - // end of training. - EvaluationMetrics evaluation_metrics = 7; - } - - // Indicates the type of the Model. - enum ModelType { - MODEL_TYPE_UNSPECIFIED = 0; - - // Linear regression model. - LINEAR_REGRESSION = 1; - - // Logistic regression based classification model. - LOGISTIC_REGRESSION = 2; - - // K-means clustering model. - KMEANS = 3; - - // [Beta] An imported TensorFlow model. - TENSORFLOW = 6; - } - - // Loss metric to evaluate model training performance. - enum LossType { - LOSS_TYPE_UNSPECIFIED = 0; - - // Mean squared loss, used for linear regression. - MEAN_SQUARED_LOSS = 1; - - // Mean log loss, used for logistic regression. - MEAN_LOG_LOSS = 2; - } - - // Distance metric used to compute the distance between two points. - enum DistanceType { - DISTANCE_TYPE_UNSPECIFIED = 0; - - // Eculidean distance. - EUCLIDEAN = 1; - - // Cosine distance. 
- COSINE = 2; - } - - // Indicates the method to split input data into multiple tables. - enum DataSplitMethod { - DATA_SPLIT_METHOD_UNSPECIFIED = 0; - - // Splits data randomly. - RANDOM = 1; - - // Splits data with the user provided tags. - CUSTOM = 2; - - // Splits data sequentially. - SEQUENTIAL = 3; - - // Data split will be skipped. - NO_SPLIT = 4; - - // Splits data automatically: Uses NO_SPLIT if the data size is small. - // Otherwise uses RANDOM. - AUTO_SPLIT = 5; - } - - // Indicates the learning rate optimization strategy to use. - enum LearnRateStrategy { - LEARN_RATE_STRATEGY_UNSPECIFIED = 0; - - // Use line search to determine learning rate. - LINE_SEARCH = 1; - - // Use a constant learning rate. - CONSTANT = 2; - } - - // Indicates the optimization strategy used for training. - enum OptimizationStrategy { - OPTIMIZATION_STRATEGY_UNSPECIFIED = 0; - - // Uses an iterative batch gradient descent algorithm. - BATCH_GRADIENT_DESCENT = 1; - - // Uses a normal equation to solve linear regression problem. - NORMAL_EQUATION = 2; - } - - // Output only. A hash of this resource. - string etag = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Required. Unique identifier for this model. - ModelReference model_reference = 2 [(google.api.field_behavior) = REQUIRED]; - - // Output only. The time when this model was created, in millisecs since the epoch. - int64 creation_time = 5 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Output only. The time when this model was last modified, in millisecs since the epoch. - int64 last_modified_time = 6 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Optional. A user-friendly description of this model. - string description = 12 [(google.api.field_behavior) = OPTIONAL]; - - // Optional. A descriptive name for this model. - string friendly_name = 14 [(google.api.field_behavior) = OPTIONAL]; - - // The labels associated with this model. You can use these to organize - // and group your models. 
Label keys and values can be no longer - // than 63 characters, can only contain lowercase letters, numeric - // characters, underscores and dashes. International characters are allowed. - // Label values are optional. Label keys must start with a letter and each - // label in the list must have a different key. - map labels = 15; - - // Optional. The time when this model expires, in milliseconds since the epoch. - // If not present, the model will persist indefinitely. Expired models - // will be deleted and their storage reclaimed. The defaultTableExpirationMs - // property of the encapsulating dataset can be used to set a default - // expirationTime on newly created models. - int64 expiration_time = 16 [(google.api.field_behavior) = OPTIONAL]; - - // Output only. The geographic location where the model resides. This value - // is inherited from the dataset. - string location = 13 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Custom encryption configuration (e.g., Cloud KMS keys). This shows the - // encryption configuration of the model data while stored in BigQuery - // storage. - google.cloud.bigquery.v2.EncryptionConfiguration encryption_configuration = 17; - - // Output only. Type of the model resource. - ModelType model_type = 7 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Output only. Information for all training runs in increasing order of start_time. - repeated TrainingRun training_runs = 9 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Output only. Input feature columns that were used to train this model. - repeated StandardSqlField feature_columns = 10 [(google.api.field_behavior) = OUTPUT_ONLY]; - - // Output only. Label columns that were used to train this model. - // The output of the model will have a "predicted_" prefix to these columns. - repeated StandardSqlField label_columns = 11 [(google.api.field_behavior) = OUTPUT_ONLY]; -} - -message GetModelRequest { - // Required. Project ID of the requested model. 
- string project_id = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. Dataset ID of the requested model. - string dataset_id = 2 [(google.api.field_behavior) = REQUIRED]; - - // Required. Model ID of the requested model. - string model_id = 3 [(google.api.field_behavior) = REQUIRED]; -} - -message PatchModelRequest { - // Required. Project ID of the model to patch. - string project_id = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. Dataset ID of the model to patch. - string dataset_id = 2 [(google.api.field_behavior) = REQUIRED]; - - // Required. Model ID of the model to patch. - string model_id = 3 [(google.api.field_behavior) = REQUIRED]; - - // Required. Patched model. - // Follows RFC5789 patch semantics. Missing fields are not updated. - // To clear a field, explicitly set to default value. - Model model = 4 [(google.api.field_behavior) = REQUIRED]; -} - -message DeleteModelRequest { - // Required. Project ID of the model to delete. - string project_id = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. Dataset ID of the model to delete. - string dataset_id = 2 [(google.api.field_behavior) = REQUIRED]; - - // Required. Model ID of the model to delete. - string model_id = 3 [(google.api.field_behavior) = REQUIRED]; -} - -message ListModelsRequest { - // Required. Project ID of the models to list. - string project_id = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. Dataset ID of the models to list. - string dataset_id = 2 [(google.api.field_behavior) = REQUIRED]; - - // The maximum number of results to return in a single response page. - // Leverage the page tokens to iterate through the entire collection. - google.protobuf.UInt32Value max_results = 3; - - // Page token, returned by a previous call to request the next page of - // results - string page_token = 4; -} - -message ListModelsResponse { - // Models in the requested dataset. 
Only the following fields are populated: - // model_reference, model_type, creation_time, last_modified_time and - // labels. - repeated Model models = 1; - - // A token to request the next page of results. - string next_page_token = 2; -} diff --git a/google/cloud/bigquery_v2/proto/model_pb2.py b/google/cloud/bigquery_v2/proto/model_pb2.py deleted file mode 100644 index f485c4568..000000000 --- a/google/cloud/bigquery_v2/proto/model_pb2.py +++ /dev/null @@ -1,4298 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: google/cloud/bigquery_v2/proto/model.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.api import client_pb2 as google_dot_api_dot_client__pb2 -from google.api import field_behavior_pb2 as google_dot_api_dot_field__behavior__pb2 -from google.cloud.bigquery_v2.proto import ( - encryption_config_pb2 as google_dot_cloud_dot_bigquery__v2_dot_proto_dot_encryption__config__pb2, -) -from google.cloud.bigquery_v2.proto import ( - model_reference_pb2 as google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__reference__pb2, -) -from google.cloud.bigquery_v2.proto import ( - standard_sql_pb2 as google_dot_cloud_dot_bigquery__v2_dot_proto_dot_standard__sql__pb2, -) -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2 -from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name="google/cloud/bigquery_v2/proto/model.proto", - package="google.cloud.bigquery.v2", - syntax="proto3", - 
serialized_options=b"\n\034com.google.cloud.bigquery.v2B\nModelProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery", - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n*google/cloud/bigquery_v2/proto/model.proto\x12\x18google.cloud.bigquery.v2\x1a\x17google/api/client.proto\x1a\x1fgoogle/api/field_behavior.proto\x1a\x36google/cloud/bigquery_v2/proto/encryption_config.proto\x1a\x34google/cloud/bigquery_v2/proto/model_reference.proto\x1a\x31google/cloud/bigquery_v2/proto/standard_sql.proto\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/wrappers.proto\x1a\x1cgoogle/api/annotations.proto"\x9b\x35\n\x05Model\x12\x11\n\x04\x65tag\x18\x01 \x01(\tB\x03\xe0\x41\x03\x12\x46\n\x0fmodel_reference\x18\x02 \x01(\x0b\x32(.google.cloud.bigquery.v2.ModelReferenceB\x03\xe0\x41\x02\x12\x1a\n\rcreation_time\x18\x05 \x01(\x03\x42\x03\xe0\x41\x03\x12\x1f\n\x12last_modified_time\x18\x06 \x01(\x03\x42\x03\xe0\x41\x03\x12\x18\n\x0b\x64\x65scription\x18\x0c \x01(\tB\x03\xe0\x41\x01\x12\x1a\n\rfriendly_name\x18\x0e \x01(\tB\x03\xe0\x41\x01\x12;\n\x06labels\x18\x0f \x03(\x0b\x32+.google.cloud.bigquery.v2.Model.LabelsEntry\x12\x1c\n\x0f\x65xpiration_time\x18\x10 \x01(\x03\x42\x03\xe0\x41\x01\x12\x15\n\x08location\x18\r \x01(\tB\x03\xe0\x41\x03\x12S\n\x18\x65ncryption_configuration\x18\x11 \x01(\x0b\x32\x31.google.cloud.bigquery.v2.EncryptionConfiguration\x12\x42\n\nmodel_type\x18\x07 \x01(\x0e\x32).google.cloud.bigquery.v2.Model.ModelTypeB\x03\xe0\x41\x03\x12G\n\rtraining_runs\x18\t \x03(\x0b\x32+.google.cloud.bigquery.v2.Model.TrainingRunB\x03\xe0\x41\x03\x12H\n\x0f\x66\x65\x61ture_columns\x18\n \x03(\x0b\x32*.google.cloud.bigquery.v2.StandardSqlFieldB\x03\xe0\x41\x03\x12\x46\n\rlabel_columns\x18\x0b 
\x03(\x0b\x32*.google.cloud.bigquery.v2.StandardSqlFieldB\x03\xe0\x41\x03\x1aq\n\x0bKmeansEnums"b\n\x1aKmeansInitializationMethod\x12,\n(KMEANS_INITIALIZATION_METHOD_UNSPECIFIED\x10\x00\x12\n\n\x06RANDOM\x10\x01\x12\n\n\x06\x43USTOM\x10\x02\x1a\xb4\x02\n\x11RegressionMetrics\x12\x39\n\x13mean_absolute_error\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12\x38\n\x12mean_squared_error\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12<\n\x16mean_squared_log_error\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12;\n\x15median_absolute_error\x18\x04 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12/\n\tr_squared\x18\x05 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x1a\xef\x02\n\x1e\x41ggregateClassificationMetrics\x12/\n\tprecision\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12,\n\x06recall\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12.\n\x08\x61\x63\x63uracy\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12/\n\tthreshold\x18\x04 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12.\n\x08\x66\x31_score\x18\x05 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12.\n\x08log_loss\x18\x06 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12-\n\x07roc_auc\x18\x07 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x1a\x9f\x06\n\x1b\x42inaryClassificationMetrics\x12h\n aggregate_classification_metrics\x18\x01 \x01(\x0b\x32>.google.cloud.bigquery.v2.Model.AggregateClassificationMetrics\x12w\n\x1c\x62inary_confusion_matrix_list\x18\x02 \x03(\x0b\x32Q.google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix\x12\x16\n\x0epositive_label\x18\x03 \x01(\t\x12\x16\n\x0enegative_label\x18\x04 \x01(\t\x1a\xec\x03\n\x15\x42inaryConfusionMatrix\x12>\n\x18positive_class_threshold\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12\x33\n\x0etrue_positives\x18\x02 \x01(\x0b\x32\x1b.google.protobuf.Int64Value\x12\x34\n\x0f\x66\x61lse_positives\x18\x03 
\x01(\x0b\x32\x1b.google.protobuf.Int64Value\x12\x33\n\x0etrue_negatives\x18\x04 \x01(\x0b\x32\x1b.google.protobuf.Int64Value\x12\x34\n\x0f\x66\x61lse_negatives\x18\x05 \x01(\x0b\x32\x1b.google.protobuf.Int64Value\x12/\n\tprecision\x18\x06 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12,\n\x06recall\x18\x07 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12.\n\x08\x66\x31_score\x18\x08 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12.\n\x08\x61\x63\x63uracy\x18\t \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x1a\x87\x05\n\x1fMultiClassClassificationMetrics\x12h\n aggregate_classification_metrics\x18\x01 \x01(\x0b\x32>.google.cloud.bigquery.v2.Model.AggregateClassificationMetrics\x12n\n\x15\x63onfusion_matrix_list\x18\x02 \x03(\x0b\x32O.google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix\x1a\x89\x03\n\x0f\x43onfusionMatrix\x12:\n\x14\x63onfidence_threshold\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12\x61\n\x04rows\x18\x02 \x03(\x0b\x32S.google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Row\x1aQ\n\x05\x45ntry\x12\x17\n\x0fpredicted_label\x18\x01 \x01(\t\x12/\n\nitem_count\x18\x02 \x01(\x0b\x32\x1b.google.protobuf.Int64Value\x1a\x83\x01\n\x03Row\x12\x14\n\x0c\x61\x63tual_label\x18\x01 \x01(\t\x12\x66\n\x07\x65ntries\x18\x02 \x03(\x0b\x32U.google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry\x1a\xcb\x06\n\x11\x43lusteringMetrics\x12:\n\x14\x64\x61vies_bouldin_index\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12;\n\x15mean_squared_distance\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12K\n\x08\x63lusters\x18\x03 \x03(\x0b\x32\x39.google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster\x1a\xef\x04\n\x07\x43luster\x12\x13\n\x0b\x63\x65ntroid_id\x18\x01 \x01(\x03\x12^\n\x0e\x66\x65\x61ture_values\x18\x02 \x03(\x0b\x32\x46.google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue\x12*\n\x05\x63ount\x18\x03 
\x01(\x0b\x32\x1b.google.protobuf.Int64Value\x1a\xc2\x03\n\x0c\x46\x65\x61tureValue\x12\x16\n\x0e\x66\x65\x61ture_column\x18\x01 \x01(\t\x12\x37\n\x0fnumerical_value\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.DoubleValueH\x00\x12t\n\x11\x63\x61tegorical_value\x18\x03 \x01(\x0b\x32W.google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValueH\x00\x1a\xe1\x01\n\x10\x43\x61tegoricalValue\x12~\n\x0f\x63\x61tegory_counts\x18\x01 \x03(\x0b\x32\x65.google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount\x1aM\n\rCategoryCount\x12\x10\n\x08\x63\x61tegory\x18\x01 \x01(\t\x12*\n\x05\x63ount\x18\x02 \x01(\x0b\x32\x1b.google.protobuf.Int64ValueB\x07\n\x05value\x1a\x95\x03\n\x11\x45valuationMetrics\x12O\n\x12regression_metrics\x18\x01 \x01(\x0b\x32\x31.google.cloud.bigquery.v2.Model.RegressionMetricsH\x00\x12\x64\n\x1d\x62inary_classification_metrics\x18\x02 \x01(\x0b\x32;.google.cloud.bigquery.v2.Model.BinaryClassificationMetricsH\x00\x12m\n"multi_class_classification_metrics\x18\x03 \x01(\x0b\x32?.google.cloud.bigquery.v2.Model.MultiClassClassificationMetricsH\x00\x12O\n\x12\x63lustering_metrics\x18\x04 \x01(\x0b\x32\x31.google.cloud.bigquery.v2.Model.ClusteringMetricsH\x00\x42\t\n\x07metrics\x1a\xab\x0f\n\x0bTrainingRun\x12U\n\x10training_options\x18\x01 \x01(\x0b\x32;.google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions\x12.\n\nstart_time\x18\x08 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12L\n\x07results\x18\x06 \x03(\x0b\x32;.google.cloud.bigquery.v2.Model.TrainingRun.IterationResult\x12M\n\x12\x65valuation_metrics\x18\x07 \x01(\x0b\x32\x31.google.cloud.bigquery.v2.Model.EvaluationMetrics\x1a\x9d\t\n\x0fTrainingOptions\x12\x16\n\x0emax_iterations\x18\x01 \x01(\x03\x12;\n\tloss_type\x18\x02 \x01(\x0e\x32(.google.cloud.bigquery.v2.Model.LossType\x12\x12\n\nlearn_rate\x18\x03 \x01(\x01\x12\x37\n\x11l1_regularization\x18\x04 
\x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12\x37\n\x11l2_regularization\x18\x05 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12;\n\x15min_relative_progress\x18\x06 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12.\n\nwarm_start\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.BoolValue\x12.\n\nearly_stop\x18\x08 \x01(\x0b\x32\x1a.google.protobuf.BoolValue\x12\x1b\n\x13input_label_columns\x18\t \x03(\t\x12J\n\x11\x64\x61ta_split_method\x18\n \x01(\x0e\x32/.google.cloud.bigquery.v2.Model.DataSplitMethod\x12 \n\x18\x64\x61ta_split_eval_fraction\x18\x0b \x01(\x01\x12\x19\n\x11\x64\x61ta_split_column\x18\x0c \x01(\t\x12N\n\x13learn_rate_strategy\x18\r \x01(\x0e\x32\x31.google.cloud.bigquery.v2.Model.LearnRateStrategy\x12\x1a\n\x12initial_learn_rate\x18\x10 \x01(\x01\x12o\n\x13label_class_weights\x18\x11 \x03(\x0b\x32R.google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry\x12\x43\n\rdistance_type\x18\x14 \x01(\x0e\x32,.google.cloud.bigquery.v2.Model.DistanceType\x12\x14\n\x0cnum_clusters\x18\x15 \x01(\x03\x12\x11\n\tmodel_uri\x18\x16 \x01(\t\x12S\n\x15optimization_strategy\x18\x17 \x01(\x0e\x32\x34.google.cloud.bigquery.v2.Model.OptimizationStrategy\x12l\n\x1ckmeans_initialization_method\x18! 
\x01(\x0e\x32\x46.google.cloud.bigquery.v2.Model.KmeansEnums.KmeansInitializationMethod\x12$\n\x1ckmeans_initialization_column\x18" \x01(\t\x1a\x38\n\x16LabelClassWeightsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x01:\x02\x38\x01\x1a\xd7\x03\n\x0fIterationResult\x12*\n\x05index\x18\x01 \x01(\x0b\x32\x1b.google.protobuf.Int32Value\x12\x30\n\x0b\x64uration_ms\x18\x04 \x01(\x0b\x32\x1b.google.protobuf.Int64Value\x12\x33\n\rtraining_loss\x18\x05 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12/\n\teval_loss\x18\x06 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12\x12\n\nlearn_rate\x18\x07 \x01(\x01\x12^\n\rcluster_infos\x18\x08 \x03(\x0b\x32G.google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.ClusterInfo\x1a\x8b\x01\n\x0b\x43lusterInfo\x12\x13\n\x0b\x63\x65ntroid_id\x18\x01 \x01(\x03\x12\x34\n\x0e\x63luster_radius\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12\x31\n\x0c\x63luster_size\x18\x03 \x01(\x0b\x32\x1b.google.protobuf.Int64Value\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01"s\n\tModelType\x12\x1a\n\x16MODEL_TYPE_UNSPECIFIED\x10\x00\x12\x15\n\x11LINEAR_REGRESSION\x10\x01\x12\x17\n\x13LOGISTIC_REGRESSION\x10\x02\x12\n\n\x06KMEANS\x10\x03\x12\x0e\n\nTENSORFLOW\x10\x06"O\n\x08LossType\x12\x19\n\x15LOSS_TYPE_UNSPECIFIED\x10\x00\x12\x15\n\x11MEAN_SQUARED_LOSS\x10\x01\x12\x11\n\rMEAN_LOG_LOSS\x10\x02"H\n\x0c\x44istanceType\x12\x1d\n\x19\x44ISTANCE_TYPE_UNSPECIFIED\x10\x00\x12\r\n\tEUCLIDEAN\x10\x01\x12\n\n\x06\x43OSINE\x10\x02"z\n\x0f\x44\x61taSplitMethod\x12!\n\x1d\x44\x41TA_SPLIT_METHOD_UNSPECIFIED\x10\x00\x12\n\n\x06RANDOM\x10\x01\x12\n\n\x06\x43USTOM\x10\x02\x12\x0e\n\nSEQUENTIAL\x10\x03\x12\x0c\n\x08NO_SPLIT\x10\x04\x12\x0e\n\nAUTO_SPLIT\x10\x05"W\n\x11LearnRateStrategy\x12#\n\x1fLEARN_RATE_STRATEGY_UNSPECIFIED\x10\x00\x12\x0f\n\x0bLINE_SEARCH\x10\x01\x12\x0c\n\x08\x43ONSTANT\x10\x02"n\n\x14OptimizationStrategy\x12%\n!OPTIMIZATION_STRATEGY_UNSPECIFIED\x10\x00\x12\x1a\n\x16\x42\x41TCH_GRADIENT_DESCENT\x10\x01\x12\x13\n\x0fNORMAL_EQUATION\x10\x02"Z\n\x0fGetModelRequest\x12\x17\n\nproject_id\x18\x01 \x01(\tB\x03\xe0\x41\x02\x12\x17\n\ndataset_id\x18\x02 \x01(\tB\x03\xe0\x41\x02\x12\x15\n\x08model_id\x18\x03 \x01(\tB\x03\xe0\x41\x02"\x91\x01\n\x11PatchModelRequest\x12\x17\n\nproject_id\x18\x01 \x01(\tB\x03\xe0\x41\x02\x12\x17\n\ndataset_id\x18\x02 \x01(\tB\x03\xe0\x41\x02\x12\x15\n\x08model_id\x18\x03 \x01(\tB\x03\xe0\x41\x02\x12\x33\n\x05model\x18\x04 \x01(\x0b\x32\x1f.google.cloud.bigquery.v2.ModelB\x03\xe0\x41\x02"]\n\x12\x44\x65leteModelRequest\x12\x17\n\nproject_id\x18\x01 \x01(\tB\x03\xe0\x41\x02\x12\x17\n\ndataset_id\x18\x02 \x01(\tB\x03\xe0\x41\x02\x12\x15\n\x08model_id\x18\x03 \x01(\tB\x03\xe0\x41\x02"\x8c\x01\n\x11ListModelsRequest\x12\x17\n\nproject_id\x18\x01 \x01(\tB\x03\xe0\x41\x02\x12\x17\n\ndataset_id\x18\x02 \x01(\tB\x03\xe0\x41\x02\x12\x31\n\x0bmax_results\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12\x12\n\npage_token\x18\x04 \x01(\t"^\n\x12ListModelsResponse\x12/\n\x06models\x18\x01 
\x03(\x0b\x32\x1f.google.cloud.bigquery.v2.Model\x12\x17\n\x0fnext_page_token\x18\x02 \x01(\t2\xfa\x05\n\x0cModelService\x12y\n\x08GetModel\x12).google.cloud.bigquery.v2.GetModelRequest\x1a\x1f.google.cloud.bigquery.v2.Model"!\xda\x41\x1eproject_id,dataset_id,model_id\x12\x8d\x01\n\nListModels\x12+.google.cloud.bigquery.v2.ListModelsRequest\x1a,.google.cloud.bigquery.v2.ListModelsResponse"$\xda\x41!project_id,dataset_id,max_results\x12\x83\x01\n\nPatchModel\x12+.google.cloud.bigquery.v2.PatchModelRequest\x1a\x1f.google.cloud.bigquery.v2.Model"\'\xda\x41$project_id,dataset_id,model_id,model\x12v\n\x0b\x44\x65leteModel\x12,.google.cloud.bigquery.v2.DeleteModelRequest\x1a\x16.google.protobuf.Empty"!\xda\x41\x1eproject_id,dataset_id,model_id\x1a\xe0\x01\xca\x41\x17\x62igquery.googleapis.com\xd2\x41\xc2\x01https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/bigquery.readonly,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/cloud-platform.read-onlyBl\n\x1c\x63om.google.cloud.bigquery.v2B\nModelProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigqueryb\x06proto3', - dependencies=[ - google_dot_api_dot_client__pb2.DESCRIPTOR, - google_dot_api_dot_field__behavior__pb2.DESCRIPTOR, - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_encryption__config__pb2.DESCRIPTOR, - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__reference__pb2.DESCRIPTOR, - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_standard__sql__pb2.DESCRIPTOR, - google_dot_protobuf_dot_empty__pb2.DESCRIPTOR, - google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR, - google_dot_protobuf_dot_wrappers__pb2.DESCRIPTOR, - google_dot_api_dot_annotations__pb2.DESCRIPTOR, - ], -) - - -_MODEL_KMEANSENUMS_KMEANSINITIALIZATIONMETHOD = _descriptor.EnumDescriptor( - name="KmeansInitializationMethod", - full_name="google.cloud.bigquery.v2.Model.KmeansEnums.KmeansInitializationMethod", - filename=None, - file=DESCRIPTOR, - 
create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="KMEANS_INITIALIZATION_METHOD_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="RANDOM", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="CUSTOM", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=1132, - serialized_end=1230, -) -_sym_db.RegisterEnumDescriptor(_MODEL_KMEANSENUMS_KMEANSINITIALIZATIONMETHOD) - -_MODEL_MODELTYPE = _descriptor.EnumDescriptor( - name="ModelType", - full_name="google.cloud.bigquery.v2.Model.ModelType", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="MODEL_TYPE_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="LINEAR_REGRESSION", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="LOGISTIC_REGRESSION", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="KMEANS", - index=3, - number=3, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="TENSORFLOW", - index=4, - number=6, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=6632, - serialized_end=6747, -) 
-_sym_db.RegisterEnumDescriptor(_MODEL_MODELTYPE) - -_MODEL_LOSSTYPE = _descriptor.EnumDescriptor( - name="LossType", - full_name="google.cloud.bigquery.v2.Model.LossType", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="LOSS_TYPE_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="MEAN_SQUARED_LOSS", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="MEAN_LOG_LOSS", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=6749, - serialized_end=6828, -) -_sym_db.RegisterEnumDescriptor(_MODEL_LOSSTYPE) - -_MODEL_DISTANCETYPE = _descriptor.EnumDescriptor( - name="DistanceType", - full_name="google.cloud.bigquery.v2.Model.DistanceType", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="DISTANCE_TYPE_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="EUCLIDEAN", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="COSINE", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=6830, - serialized_end=6902, -) -_sym_db.RegisterEnumDescriptor(_MODEL_DISTANCETYPE) - -_MODEL_DATASPLITMETHOD = _descriptor.EnumDescriptor( - name="DataSplitMethod", - 
full_name="google.cloud.bigquery.v2.Model.DataSplitMethod", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="DATA_SPLIT_METHOD_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="RANDOM", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="CUSTOM", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="SEQUENTIAL", - index=3, - number=3, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="NO_SPLIT", - index=4, - number=4, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="AUTO_SPLIT", - index=5, - number=5, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=6904, - serialized_end=7026, -) -_sym_db.RegisterEnumDescriptor(_MODEL_DATASPLITMETHOD) - -_MODEL_LEARNRATESTRATEGY = _descriptor.EnumDescriptor( - name="LearnRateStrategy", - full_name="google.cloud.bigquery.v2.Model.LearnRateStrategy", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="LEARN_RATE_STRATEGY_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="LINE_SEARCH", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - 
name="CONSTANT", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=7028, - serialized_end=7115, -) -_sym_db.RegisterEnumDescriptor(_MODEL_LEARNRATESTRATEGY) - -_MODEL_OPTIMIZATIONSTRATEGY = _descriptor.EnumDescriptor( - name="OptimizationStrategy", - full_name="google.cloud.bigquery.v2.Model.OptimizationStrategy", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="OPTIMIZATION_STRATEGY_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="BATCH_GRADIENT_DESCENT", - index=1, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="NORMAL_EQUATION", - index=2, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=7117, - serialized_end=7227, -) -_sym_db.RegisterEnumDescriptor(_MODEL_OPTIMIZATIONSTRATEGY) - - -_MODEL_KMEANSENUMS = _descriptor.Descriptor( - name="KmeansEnums", - full_name="google.cloud.bigquery.v2.Model.KmeansEnums", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[], - extensions=[], - nested_types=[], - enum_types=[_MODEL_KMEANSENUMS_KMEANSINITIALIZATIONMETHOD,], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=1117, - serialized_end=1230, -) - -_MODEL_REGRESSIONMETRICS = _descriptor.Descriptor( - name="RegressionMetrics", - full_name="google.cloud.bigquery.v2.Model.RegressionMetrics", - filename=None, - file=DESCRIPTOR, - containing_type=None, - 
create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="mean_absolute_error", - full_name="google.cloud.bigquery.v2.Model.RegressionMetrics.mean_absolute_error", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="mean_squared_error", - full_name="google.cloud.bigquery.v2.Model.RegressionMetrics.mean_squared_error", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="mean_squared_log_error", - full_name="google.cloud.bigquery.v2.Model.RegressionMetrics.mean_squared_log_error", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="median_absolute_error", - full_name="google.cloud.bigquery.v2.Model.RegressionMetrics.median_absolute_error", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="r_squared", - 
full_name="google.cloud.bigquery.v2.Model.RegressionMetrics.r_squared", - index=4, - number=5, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=1233, - serialized_end=1541, -) - -_MODEL_AGGREGATECLASSIFICATIONMETRICS = _descriptor.Descriptor( - name="AggregateClassificationMetrics", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="precision", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.precision", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="recall", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.recall", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="accuracy", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.accuracy", - index=2, - number=3, - type=11, - cpp_type=10, - 
label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="threshold", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.threshold", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="f1_score", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.f1_score", - index=4, - number=5, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="log_loss", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.log_loss", - index=5, - number=6, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="roc_auc", - full_name="google.cloud.bigquery.v2.Model.AggregateClassificationMetrics.roc_auc", - index=6, - number=7, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - 
serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=1544, - serialized_end=1911, -) - -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX = _descriptor.Descriptor( - name="BinaryConfusionMatrix", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="positive_class_threshold", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.positive_class_threshold", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="true_positives", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.true_positives", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="false_positives", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.false_positives", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - 
extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="true_negatives", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.true_negatives", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="false_negatives", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.false_negatives", - index=4, - number=5, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="precision", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.precision", - index=5, - number=6, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="recall", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.recall", - index=6, - number=7, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - 
create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="f1_score", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.f1_score", - index=7, - number=8, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="accuracy", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix.accuracy", - index=8, - number=9, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=2221, - serialized_end=2713, -) - -_MODEL_BINARYCLASSIFICATIONMETRICS = _descriptor.Descriptor( - name="BinaryClassificationMetrics", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="aggregate_classification_metrics", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.aggregate_classification_metrics", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - 
_descriptor.FieldDescriptor( - name="binary_confusion_matrix_list", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.binary_confusion_matrix_list", - index=1, - number=2, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="positive_label", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.positive_label", - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="negative_label", - full_name="google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.negative_label", - index=3, - number=4, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=1914, - serialized_end=2713, -) - -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ENTRY = _descriptor.Descriptor( - name="Entry", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - 
create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="predicted_label", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry.predicted_label", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="item_count", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry.item_count", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=3148, - serialized_end=3229, -) - -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ROW = _descriptor.Descriptor( - name="Row", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Row", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="actual_label", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Row.actual_label", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - 
file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="entries", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Row.entries", - index=1, - number=2, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=3232, - serialized_end=3363, -) - -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX = _descriptor.Descriptor( - name="ConfusionMatrix", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="confidence_threshold", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.confidence_threshold", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="rows", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.rows", - index=1, - number=2, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - 
create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[ - _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ENTRY, - _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ROW, - ], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=2970, - serialized_end=3363, -) - -_MODEL_MULTICLASSCLASSIFICATIONMETRICS = _descriptor.Descriptor( - name="MultiClassClassificationMetrics", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="aggregate_classification_metrics", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.aggregate_classification_metrics", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="confusion_matrix_list", - full_name="google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.confusion_matrix_list", - index=1, - number=2, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=2716, - serialized_end=3363, -) - 
-_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE_CATEGORYCOUNT = _descriptor.Descriptor( - name="CategoryCount", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="category", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount.category", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="count", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount.count", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=4123, - serialized_end=4200, -) - -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE = _descriptor.Descriptor( - name="CategoricalValue", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="category_counts", - 
full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.category_counts", - index=0, - number=1, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[ - _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE_CATEGORYCOUNT, - ], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=3975, - serialized_end=4200, -) - -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE = _descriptor.Descriptor( - name="FeatureValue", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="feature_column", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.feature_column", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="numerical_value", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.numerical_value", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - 
_descriptor.FieldDescriptor( - name="categorical_value", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.categorical_value", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name="value", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.value", - index=0, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[], - ), - ], - serialized_start=3759, - serialized_end=4209, -) - -_MODEL_CLUSTERINGMETRICS_CLUSTER = _descriptor.Descriptor( - name="Cluster", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="centroid_id", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.centroid_id", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="feature_values", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.feature_values", - index=1, - number=2, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - 
enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="count", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.count", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=3586, - serialized_end=4209, -) - -_MODEL_CLUSTERINGMETRICS = _descriptor.Descriptor( - name="ClusteringMetrics", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="davies_bouldin_index", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.davies_bouldin_index", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="mean_squared_distance", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.mean_squared_distance", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - 
serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="clusters", - full_name="google.cloud.bigquery.v2.Model.ClusteringMetrics.clusters", - index=2, - number=3, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_CLUSTERINGMETRICS_CLUSTER,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=3366, - serialized_end=4209, -) - -_MODEL_EVALUATIONMETRICS = _descriptor.Descriptor( - name="EvaluationMetrics", - full_name="google.cloud.bigquery.v2.Model.EvaluationMetrics", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="regression_metrics", - full_name="google.cloud.bigquery.v2.Model.EvaluationMetrics.regression_metrics", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="binary_classification_metrics", - full_name="google.cloud.bigquery.v2.Model.EvaluationMetrics.binary_classification_metrics", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - 
_descriptor.FieldDescriptor( - name="multi_class_classification_metrics", - full_name="google.cloud.bigquery.v2.Model.EvaluationMetrics.multi_class_classification_metrics", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="clustering_metrics", - full_name="google.cloud.bigquery.v2.Model.EvaluationMetrics.clustering_metrics", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name="metrics", - full_name="google.cloud.bigquery.v2.Model.EvaluationMetrics.metrics", - index=0, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[], - ), - ], - serialized_start=4212, - serialized_end=4617, -) - -_MODEL_TRAININGRUN_TRAININGOPTIONS_LABELCLASSWEIGHTSENTRY = _descriptor.Descriptor( - name="LabelClassWeightsEntry", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="key", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry.key", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - 
message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="value", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry.value", - index=1, - number=2, - type=1, - cpp_type=5, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=b"8\001", - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=6053, - serialized_end=6109, -) - -_MODEL_TRAININGRUN_TRAININGOPTIONS = _descriptor.Descriptor( - name="TrainingOptions", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="max_iterations", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.max_iterations", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="loss_type", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.loss_type", - index=1, - number=2, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - 
serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="learn_rate", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.learn_rate", - index=2, - number=3, - type=1, - cpp_type=5, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="l1_regularization", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.l1_regularization", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="l2_regularization", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.l2_regularization", - index=4, - number=5, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="min_relative_progress", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.min_relative_progress", - index=5, - number=6, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - 
name="warm_start", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.warm_start", - index=6, - number=7, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="early_stop", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.early_stop", - index=7, - number=8, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="input_label_columns", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.input_label_columns", - index=8, - number=9, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="data_split_method", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.data_split_method", - index=9, - number=10, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="data_split_eval_fraction", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.data_split_eval_fraction", - index=10, - number=11, - 
type=1, - cpp_type=5, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="data_split_column", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.data_split_column", - index=11, - number=12, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="learn_rate_strategy", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.learn_rate_strategy", - index=12, - number=13, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="initial_learn_rate", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.initial_learn_rate", - index=13, - number=16, - type=1, - cpp_type=5, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="label_class_weights", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.label_class_weights", - index=14, - number=17, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - 
message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="distance_type", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.distance_type", - index=15, - number=20, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="num_clusters", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.num_clusters", - index=16, - number=21, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_uri", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.model_uri", - index=17, - number=22, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="optimization_strategy", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.optimization_strategy", - index=18, - number=23, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - 
file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="kmeans_initialization_method", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.kmeans_initialization_method", - index=19, - number=33, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="kmeans_initialization_column", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.kmeans_initialization_column", - index=20, - number=34, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_TRAININGRUN_TRAININGOPTIONS_LABELCLASSWEIGHTSENTRY,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=4928, - serialized_end=6109, -) - -_MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO = _descriptor.Descriptor( - name="ClusterInfo", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.ClusterInfo", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="centroid_id", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.ClusterInfo.centroid_id", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - 
serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="cluster_radius", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.ClusterInfo.cluster_radius", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="cluster_size", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.ClusterInfo.cluster_size", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=6444, - serialized_end=6583, -) - -_MODEL_TRAININGRUN_ITERATIONRESULT = _descriptor.Descriptor( - name="IterationResult", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="index", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.index", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - 
_descriptor.FieldDescriptor( - name="duration_ms", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.duration_ms", - index=1, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="training_loss", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.training_loss", - index=2, - number=5, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="eval_loss", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.eval_loss", - index=3, - number=6, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="learn_rate", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.learn_rate", - index=4, - number=7, - type=1, - cpp_type=5, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="cluster_infos", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.cluster_infos", - index=5, - number=8, - type=11, - 
cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[_MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO,], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=6112, - serialized_end=6583, -) - -_MODEL_TRAININGRUN = _descriptor.Descriptor( - name="TrainingRun", - full_name="google.cloud.bigquery.v2.Model.TrainingRun", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="training_options", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.training_options", - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="start_time", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.start_time", - index=1, - number=8, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="results", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.results", - index=2, - number=6, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - 
extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="evaluation_metrics", - full_name="google.cloud.bigquery.v2.Model.TrainingRun.evaluation_metrics", - index=3, - number=7, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[ - _MODEL_TRAININGRUN_TRAININGOPTIONS, - _MODEL_TRAININGRUN_ITERATIONRESULT, - ], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=4620, - serialized_end=6583, -) - -_MODEL_LABELSENTRY = _descriptor.Descriptor( - name="LabelsEntry", - full_name="google.cloud.bigquery.v2.Model.LabelsEntry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="key", - full_name="google.cloud.bigquery.v2.Model.LabelsEntry.key", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="value", - full_name="google.cloud.bigquery.v2.Model.LabelsEntry.value", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - 
extensions=[], - nested_types=[], - enum_types=[], - serialized_options=b"8\001", - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=6585, - serialized_end=6630, -) - -_MODEL = _descriptor.Descriptor( - name="Model", - full_name="google.cloud.bigquery.v2.Model", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="etag", - full_name="google.cloud.bigquery.v2.Model.etag", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_reference", - full_name="google.cloud.bigquery.v2.Model.model_reference", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="creation_time", - full_name="google.cloud.bigquery.v2.Model.creation_time", - index=2, - number=5, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="last_modified_time", - full_name="google.cloud.bigquery.v2.Model.last_modified_time", - index=3, - number=6, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - 
enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="description", - full_name="google.cloud.bigquery.v2.Model.description", - index=4, - number=12, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="friendly_name", - full_name="google.cloud.bigquery.v2.Model.friendly_name", - index=5, - number=14, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="labels", - full_name="google.cloud.bigquery.v2.Model.labels", - index=6, - number=15, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="expiration_time", - full_name="google.cloud.bigquery.v2.Model.expiration_time", - index=7, - number=16, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="location", 
- full_name="google.cloud.bigquery.v2.Model.location", - index=8, - number=13, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="encryption_configuration", - full_name="google.cloud.bigquery.v2.Model.encryption_configuration", - index=9, - number=17, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_type", - full_name="google.cloud.bigquery.v2.Model.model_type", - index=10, - number=7, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="training_runs", - full_name="google.cloud.bigquery.v2.Model.training_runs", - index=11, - number=9, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="feature_columns", - full_name="google.cloud.bigquery.v2.Model.feature_columns", - index=12, - number=10, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - 
containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="label_columns", - full_name="google.cloud.bigquery.v2.Model.label_columns", - index=13, - number=11, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\003", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[ - _MODEL_KMEANSENUMS, - _MODEL_REGRESSIONMETRICS, - _MODEL_AGGREGATECLASSIFICATIONMETRICS, - _MODEL_BINARYCLASSIFICATIONMETRICS, - _MODEL_MULTICLASSCLASSIFICATIONMETRICS, - _MODEL_CLUSTERINGMETRICS, - _MODEL_EVALUATIONMETRICS, - _MODEL_TRAININGRUN, - _MODEL_LABELSENTRY, - ], - enum_types=[ - _MODEL_MODELTYPE, - _MODEL_LOSSTYPE, - _MODEL_DISTANCETYPE, - _MODEL_DATASPLITMETHOD, - _MODEL_LEARNRATESTRATEGY, - _MODEL_OPTIMIZATIONSTRATEGY, - ], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=416, - serialized_end=7227, -) - - -_GETMODELREQUEST = _descriptor.Descriptor( - name="GetModelRequest", - full_name="google.cloud.bigquery.v2.GetModelRequest", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="project_id", - full_name="google.cloud.bigquery.v2.GetModelRequest.project_id", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - 
name="dataset_id", - full_name="google.cloud.bigquery.v2.GetModelRequest.dataset_id", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_id", - full_name="google.cloud.bigquery.v2.GetModelRequest.model_id", - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=7229, - serialized_end=7319, -) - - -_PATCHMODELREQUEST = _descriptor.Descriptor( - name="PatchModelRequest", - full_name="google.cloud.bigquery.v2.PatchModelRequest", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="project_id", - full_name="google.cloud.bigquery.v2.PatchModelRequest.project_id", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="dataset_id", - full_name="google.cloud.bigquery.v2.PatchModelRequest.dataset_id", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - 
has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_id", - full_name="google.cloud.bigquery.v2.PatchModelRequest.model_id", - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model", - full_name="google.cloud.bigquery.v2.PatchModelRequest.model", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=7322, - serialized_end=7467, -) - - -_DELETEMODELREQUEST = _descriptor.Descriptor( - name="DeleteModelRequest", - full_name="google.cloud.bigquery.v2.DeleteModelRequest", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="project_id", - full_name="google.cloud.bigquery.v2.DeleteModelRequest.project_id", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - 
serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="dataset_id", - full_name="google.cloud.bigquery.v2.DeleteModelRequest.dataset_id", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_id", - full_name="google.cloud.bigquery.v2.DeleteModelRequest.model_id", - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=7469, - serialized_end=7562, -) - - -_LISTMODELSREQUEST = _descriptor.Descriptor( - name="ListModelsRequest", - full_name="google.cloud.bigquery.v2.ListModelsRequest", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="project_id", - full_name="google.cloud.bigquery.v2.ListModelsRequest.project_id", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="dataset_id", - 
full_name="google.cloud.bigquery.v2.ListModelsRequest.dataset_id", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="max_results", - full_name="google.cloud.bigquery.v2.ListModelsRequest.max_results", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="page_token", - full_name="google.cloud.bigquery.v2.ListModelsRequest.page_token", - index=3, - number=4, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=7565, - serialized_end=7705, -) - - -_LISTMODELSRESPONSE = _descriptor.Descriptor( - name="ListModelsResponse", - full_name="google.cloud.bigquery.v2.ListModelsResponse", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="models", - full_name="google.cloud.bigquery.v2.ListModelsResponse.models", - index=0, - number=1, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - 
message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="next_page_token", - full_name="google.cloud.bigquery.v2.ListModelsResponse.next_page_token", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=7707, - serialized_end=7801, -) - -_MODEL_KMEANSENUMS.containing_type = _MODEL -_MODEL_KMEANSENUMS_KMEANSINITIALIZATIONMETHOD.containing_type = _MODEL_KMEANSENUMS -_MODEL_REGRESSIONMETRICS.fields_by_name[ - "mean_absolute_error" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_REGRESSIONMETRICS.fields_by_name[ - "mean_squared_error" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_REGRESSIONMETRICS.fields_by_name[ - "mean_squared_log_error" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_REGRESSIONMETRICS.fields_by_name[ - "median_absolute_error" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_REGRESSIONMETRICS.fields_by_name[ - "r_squared" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_REGRESSIONMETRICS.containing_type = _MODEL -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "precision" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "recall" -].message_type = 
google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "accuracy" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "threshold" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "f1_score" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "log_loss" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.fields_by_name[ - "roc_auc" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_AGGREGATECLASSIFICATIONMETRICS.containing_type = _MODEL -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "positive_class_threshold" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "true_positives" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "false_positives" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "true_negatives" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "false_negatives" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "precision" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "recall" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE 
-_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "f1_score" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.fields_by_name[ - "accuracy" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX.containing_type = ( - _MODEL_BINARYCLASSIFICATIONMETRICS -) -_MODEL_BINARYCLASSIFICATIONMETRICS.fields_by_name[ - "aggregate_classification_metrics" -].message_type = _MODEL_AGGREGATECLASSIFICATIONMETRICS -_MODEL_BINARYCLASSIFICATIONMETRICS.fields_by_name[ - "binary_confusion_matrix_list" -].message_type = _MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX -_MODEL_BINARYCLASSIFICATIONMETRICS.containing_type = _MODEL -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ENTRY.fields_by_name[ - "item_count" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ENTRY.containing_type = ( - _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX -) -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ROW.fields_by_name[ - "entries" -].message_type = _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ENTRY -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ROW.containing_type = ( - _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX -) -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX.fields_by_name[ - "confidence_threshold" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX.fields_by_name[ - "rows" -].message_type = _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ROW -_MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX.containing_type = ( - _MODEL_MULTICLASSCLASSIFICATIONMETRICS -) -_MODEL_MULTICLASSCLASSIFICATIONMETRICS.fields_by_name[ - "aggregate_classification_metrics" -].message_type = _MODEL_AGGREGATECLASSIFICATIONMETRICS 
-_MODEL_MULTICLASSCLASSIFICATIONMETRICS.fields_by_name[ - "confusion_matrix_list" -].message_type = _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX -_MODEL_MULTICLASSCLASSIFICATIONMETRICS.containing_type = _MODEL -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE_CATEGORYCOUNT.fields_by_name[ - "count" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE_CATEGORYCOUNT.containing_type = ( - _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE -) -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE.fields_by_name[ - "category_counts" -].message_type = ( - _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE_CATEGORYCOUNT -) -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE.containing_type = ( - _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE -) -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.fields_by_name[ - "numerical_value" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.fields_by_name[ - "categorical_value" -].message_type = _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.containing_type = ( - _MODEL_CLUSTERINGMETRICS_CLUSTER -) -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.oneofs_by_name["value"].fields.append( - _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.fields_by_name["numerical_value"] -) -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.fields_by_name[ - "numerical_value" -].containing_oneof = _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.oneofs_by_name[ - "value" -] -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.oneofs_by_name["value"].fields.append( - _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.fields_by_name["categorical_value"] -) -_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.fields_by_name[ - "categorical_value" -].containing_oneof = 
_MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE.oneofs_by_name[ - "value" -] -_MODEL_CLUSTERINGMETRICS_CLUSTER.fields_by_name[ - "feature_values" -].message_type = _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE -_MODEL_CLUSTERINGMETRICS_CLUSTER.fields_by_name[ - "count" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_CLUSTERINGMETRICS_CLUSTER.containing_type = _MODEL_CLUSTERINGMETRICS -_MODEL_CLUSTERINGMETRICS.fields_by_name[ - "davies_bouldin_index" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_CLUSTERINGMETRICS.fields_by_name[ - "mean_squared_distance" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_CLUSTERINGMETRICS.fields_by_name[ - "clusters" -].message_type = _MODEL_CLUSTERINGMETRICS_CLUSTER -_MODEL_CLUSTERINGMETRICS.containing_type = _MODEL -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "regression_metrics" -].message_type = _MODEL_REGRESSIONMETRICS -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "binary_classification_metrics" -].message_type = _MODEL_BINARYCLASSIFICATIONMETRICS -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "multi_class_classification_metrics" -].message_type = _MODEL_MULTICLASSCLASSIFICATIONMETRICS -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "clustering_metrics" -].message_type = _MODEL_CLUSTERINGMETRICS -_MODEL_EVALUATIONMETRICS.containing_type = _MODEL -_MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"].fields.append( - _MODEL_EVALUATIONMETRICS.fields_by_name["regression_metrics"] -) -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "regression_metrics" -].containing_oneof = _MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"] -_MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"].fields.append( - _MODEL_EVALUATIONMETRICS.fields_by_name["binary_classification_metrics"] -) -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "binary_classification_metrics" -].containing_oneof = _MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"] 
-_MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"].fields.append( - _MODEL_EVALUATIONMETRICS.fields_by_name["multi_class_classification_metrics"] -) -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "multi_class_classification_metrics" -].containing_oneof = _MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"] -_MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"].fields.append( - _MODEL_EVALUATIONMETRICS.fields_by_name["clustering_metrics"] -) -_MODEL_EVALUATIONMETRICS.fields_by_name[ - "clustering_metrics" -].containing_oneof = _MODEL_EVALUATIONMETRICS.oneofs_by_name["metrics"] -_MODEL_TRAININGRUN_TRAININGOPTIONS_LABELCLASSWEIGHTSENTRY.containing_type = ( - _MODEL_TRAININGRUN_TRAININGOPTIONS -) -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "loss_type" -].enum_type = _MODEL_LOSSTYPE -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "l1_regularization" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "l2_regularization" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "min_relative_progress" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "warm_start" -].message_type = google_dot_protobuf_dot_wrappers__pb2._BOOLVALUE -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "early_stop" -].message_type = google_dot_protobuf_dot_wrappers__pb2._BOOLVALUE -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "data_split_method" -].enum_type = _MODEL_DATASPLITMETHOD -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "learn_rate_strategy" -].enum_type = _MODEL_LEARNRATESTRATEGY -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "label_class_weights" -].message_type = _MODEL_TRAININGRUN_TRAININGOPTIONS_LABELCLASSWEIGHTSENTRY -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "distance_type" -].enum_type = _MODEL_DISTANCETYPE 
-_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "optimization_strategy" -].enum_type = _MODEL_OPTIMIZATIONSTRATEGY -_MODEL_TRAININGRUN_TRAININGOPTIONS.fields_by_name[ - "kmeans_initialization_method" -].enum_type = _MODEL_KMEANSENUMS_KMEANSINITIALIZATIONMETHOD -_MODEL_TRAININGRUN_TRAININGOPTIONS.containing_type = _MODEL_TRAININGRUN -_MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO.fields_by_name[ - "cluster_radius" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO.fields_by_name[ - "cluster_size" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO.containing_type = ( - _MODEL_TRAININGRUN_ITERATIONRESULT -) -_MODEL_TRAININGRUN_ITERATIONRESULT.fields_by_name[ - "index" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT32VALUE -_MODEL_TRAININGRUN_ITERATIONRESULT.fields_by_name[ - "duration_ms" -].message_type = google_dot_protobuf_dot_wrappers__pb2._INT64VALUE -_MODEL_TRAININGRUN_ITERATIONRESULT.fields_by_name[ - "training_loss" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_TRAININGRUN_ITERATIONRESULT.fields_by_name[ - "eval_loss" -].message_type = google_dot_protobuf_dot_wrappers__pb2._DOUBLEVALUE -_MODEL_TRAININGRUN_ITERATIONRESULT.fields_by_name[ - "cluster_infos" -].message_type = _MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO -_MODEL_TRAININGRUN_ITERATIONRESULT.containing_type = _MODEL_TRAININGRUN -_MODEL_TRAININGRUN.fields_by_name[ - "training_options" -].message_type = _MODEL_TRAININGRUN_TRAININGOPTIONS -_MODEL_TRAININGRUN.fields_by_name[ - "start_time" -].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP -_MODEL_TRAININGRUN.fields_by_name[ - "results" -].message_type = _MODEL_TRAININGRUN_ITERATIONRESULT -_MODEL_TRAININGRUN.fields_by_name[ - "evaluation_metrics" -].message_type = _MODEL_EVALUATIONMETRICS -_MODEL_TRAININGRUN.containing_type = _MODEL 
-_MODEL_LABELSENTRY.containing_type = _MODEL -_MODEL.fields_by_name[ - "model_reference" -].message_type = ( - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__reference__pb2._MODELREFERENCE -) -_MODEL.fields_by_name["labels"].message_type = _MODEL_LABELSENTRY -_MODEL.fields_by_name[ - "encryption_configuration" -].message_type = ( - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_encryption__config__pb2._ENCRYPTIONCONFIGURATION -) -_MODEL.fields_by_name["model_type"].enum_type = _MODEL_MODELTYPE -_MODEL.fields_by_name["training_runs"].message_type = _MODEL_TRAININGRUN -_MODEL.fields_by_name[ - "feature_columns" -].message_type = ( - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_standard__sql__pb2._STANDARDSQLFIELD -) -_MODEL.fields_by_name[ - "label_columns" -].message_type = ( - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_standard__sql__pb2._STANDARDSQLFIELD -) -_MODEL_MODELTYPE.containing_type = _MODEL -_MODEL_LOSSTYPE.containing_type = _MODEL -_MODEL_DISTANCETYPE.containing_type = _MODEL -_MODEL_DATASPLITMETHOD.containing_type = _MODEL -_MODEL_LEARNRATESTRATEGY.containing_type = _MODEL -_MODEL_OPTIMIZATIONSTRATEGY.containing_type = _MODEL -_PATCHMODELREQUEST.fields_by_name["model"].message_type = _MODEL -_LISTMODELSREQUEST.fields_by_name[ - "max_results" -].message_type = google_dot_protobuf_dot_wrappers__pb2._UINT32VALUE -_LISTMODELSRESPONSE.fields_by_name["models"].message_type = _MODEL -DESCRIPTOR.message_types_by_name["Model"] = _MODEL -DESCRIPTOR.message_types_by_name["GetModelRequest"] = _GETMODELREQUEST -DESCRIPTOR.message_types_by_name["PatchModelRequest"] = _PATCHMODELREQUEST -DESCRIPTOR.message_types_by_name["DeleteModelRequest"] = _DELETEMODELREQUEST -DESCRIPTOR.message_types_by_name["ListModelsRequest"] = _LISTMODELSREQUEST -DESCRIPTOR.message_types_by_name["ListModelsResponse"] = _LISTMODELSRESPONSE -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -Model = _reflection.GeneratedProtocolMessageType( - "Model", - (_message.Message,), - { - 
"KmeansEnums": _reflection.GeneratedProtocolMessageType( - "KmeansEnums", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_KMEANSENUMS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2" - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.KmeansEnums) - }, - ), - "RegressionMetrics": _reflection.GeneratedProtocolMessageType( - "RegressionMetrics", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_REGRESSIONMETRICS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Evaluation metrics for regression and explicit feedback type matrix - factorization models. - - Attributes: - mean_absolute_error: - Mean absolute error. - mean_squared_error: - Mean squared error. - mean_squared_log_error: - Mean squared log error. - median_absolute_error: - Median absolute error. - r_squared: - R^2 score. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.RegressionMetrics) - }, - ), - "AggregateClassificationMetrics": _reflection.GeneratedProtocolMessageType( - "AggregateClassificationMetrics", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_AGGREGATECLASSIFICATIONMETRICS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Aggregate metrics for classification/classifier models. For multi- - class models, the metrics are either macro-averaged or micro-averaged. - When macro-averaged, the metrics are calculated for each label and - then an unweighted average is taken of those values. When micro- - averaged, the metric is calculated globally by counting the total - number of correctly predicted rows. - - Attributes: - precision: - Precision is the fraction of actual positive predictions that - had positive actual labels. For multiclass this is a macro- - averaged metric treating each class as a binary classifier. - recall: - Recall is the fraction of actual positive labels that were - given a positive prediction. For multiclass this is a macro- - averaged metric. 
- accuracy: - Accuracy is the fraction of predictions given the correct - label. For multiclass this is a micro-averaged metric. - threshold: - Threshold at which the metrics are computed. For binary - classification models this is the positive class threshold. - For multi-class classfication models this is the confidence - threshold. - f1_score: - The F1 score is an average of recall and precision. For - multiclass this is a macro-averaged metric. - log_loss: - Logarithmic Loss. For multiclass this is a macro-averaged - metric. - roc_auc: - Area Under a ROC Curve. For multiclass this is a macro- - averaged metric. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.AggregateClassificationMetrics) - }, - ), - "BinaryClassificationMetrics": _reflection.GeneratedProtocolMessageType( - "BinaryClassificationMetrics", - (_message.Message,), - { - "BinaryConfusionMatrix": _reflection.GeneratedProtocolMessageType( - "BinaryConfusionMatrix", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_BINARYCLASSIFICATIONMETRICS_BINARYCONFUSIONMATRIX, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Confusion matrix for binary classification models. - - Attributes: - positive_class_threshold: - Threshold value used when computing each of the following - metric. - true_positives: - Number of true samples predicted as true. - false_positives: - Number of false samples predicted as true. - true_negatives: - Number of true samples predicted as false. - false_negatives: - Number of false samples predicted as false. - precision: - The fraction of actual positive predictions that had positive - actual labels. - recall: - The fraction of actual positive labels that were given a - positive prediction. - f1_score: - The equally weighted average of recall and precision. - accuracy: - The fraction of predictions given the correct label. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.BinaryClassificationMetrics.BinaryConfusionMatrix) - }, - ), - "DESCRIPTOR": _MODEL_BINARYCLASSIFICATIONMETRICS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Evaluation metrics for binary classification/classifier models. - - Attributes: - aggregate_classification_metrics: - Aggregate classification metrics. - binary_confusion_matrix_list: - Binary confusion matrix at multiple thresholds. - positive_label: - Label representing the positive class. - negative_label: - Label representing the negative class. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.BinaryClassificationMetrics) - }, - ), - "MultiClassClassificationMetrics": _reflection.GeneratedProtocolMessageType( - "MultiClassClassificationMetrics", - (_message.Message,), - { - "ConfusionMatrix": _reflection.GeneratedProtocolMessageType( - "ConfusionMatrix", - (_message.Message,), - { - "Entry": _reflection.GeneratedProtocolMessageType( - "Entry", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ENTRY, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """A single entry in the confusion matrix. - - Attributes: - predicted_label: - The predicted label. For confidence_threshold > 0, we will - also add an entry indicating the number of items under the - confidence threshold. - item_count: - Number of items being predicted as this label. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry) - }, - ), - "Row": _reflection.GeneratedProtocolMessageType( - "Row", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX_ROW, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """A single row in the confusion matrix. 
- - Attributes: - actual_label: - The original label of this row. - entries: - Info describing predicted label distribution. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix.Row) - }, - ), - "DESCRIPTOR": _MODEL_MULTICLASSCLASSIFICATIONMETRICS_CONFUSIONMATRIX, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Confusion matrix for multi-class classification models. - - Attributes: - confidence_threshold: - Confidence threshold used when computing the entries of the - confusion matrix. - rows: - One row per actual label. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics.ConfusionMatrix) - }, - ), - "DESCRIPTOR": _MODEL_MULTICLASSCLASSIFICATIONMETRICS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Evaluation metrics for multi-class classification/classifier models. - - Attributes: - aggregate_classification_metrics: - Aggregate classification metrics. - confusion_matrix_list: - Confusion matrix at different thresholds. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.MultiClassClassificationMetrics) - }, - ), - "ClusteringMetrics": _reflection.GeneratedProtocolMessageType( - "ClusteringMetrics", - (_message.Message,), - { - "Cluster": _reflection.GeneratedProtocolMessageType( - "Cluster", - (_message.Message,), - { - "FeatureValue": _reflection.GeneratedProtocolMessageType( - "FeatureValue", - (_message.Message,), - { - "CategoricalValue": _reflection.GeneratedProtocolMessageType( - "CategoricalValue", - (_message.Message,), - { - "CategoryCount": _reflection.GeneratedProtocolMessageType( - "CategoryCount", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE_CATEGORYCOUNT, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Represents the count of a single category within the cluster. - - Attributes: - category: - The name of category. - count: - The count of training samples matching the category within the - cluster. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount) - }, - ), - "DESCRIPTOR": _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE_CATEGORICALVALUE, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Representative value of a categorical feature. - - Attributes: - category_counts: - Counts of all categories for the categorical feature. If there - are more than ten categories, we return top ten (by count) and - return one more CategoryCount with category ``*OTHER*`` and - count as aggregate counts of remaining categories. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue) - }, - ), - "DESCRIPTOR": _MODEL_CLUSTERINGMETRICS_CLUSTER_FEATUREVALUE, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Representative value of a single feature within the cluster. - - Attributes: - feature_column: - The feature column name. - numerical_value: - The numerical feature value. This is the centroid value for - this feature. - categorical_value: - The categorical feature value. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster.FeatureValue) - }, - ), - "DESCRIPTOR": _MODEL_CLUSTERINGMETRICS_CLUSTER, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Message containing the information about one cluster. - - Attributes: - centroid_id: - Centroid id. - feature_values: - Values of highly variant features for this cluster. - count: - Count of training data rows that were assigned to this - cluster. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.ClusteringMetrics.Cluster) - }, - ), - "DESCRIPTOR": _MODEL_CLUSTERINGMETRICS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Evaluation metrics for clustering models. - - Attributes: - davies_bouldin_index: - Davies-Bouldin index. - mean_squared_distance: - Mean of squared distances between each sample to its cluster - centroid. - clusters: - [Beta] Information for all clusters. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.ClusteringMetrics) - }, - ), - "EvaluationMetrics": _reflection.GeneratedProtocolMessageType( - "EvaluationMetrics", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_EVALUATIONMETRICS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Evaluation metrics of a model. 
These are either computed on all - training data or just the eval data based on whether eval data was - used during training. These are not present for imported models. - - Attributes: - regression_metrics: - Populated for regression models and explicit feedback type - matrix factorization models. - binary_classification_metrics: - Populated for binary classification/classifier models. - multi_class_classification_metrics: - Populated for multi-class classification/classifier models. - clustering_metrics: - Populated for clustering models. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.EvaluationMetrics) - }, - ), - "TrainingRun": _reflection.GeneratedProtocolMessageType( - "TrainingRun", - (_message.Message,), - { - "TrainingOptions": _reflection.GeneratedProtocolMessageType( - "TrainingOptions", - (_message.Message,), - { - "LabelClassWeightsEntry": _reflection.GeneratedProtocolMessageType( - "LabelClassWeightsEntry", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_TRAININGRUN_TRAININGOPTIONS_LABELCLASSWEIGHTSENTRY, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2" - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry) - }, - ), - "DESCRIPTOR": _MODEL_TRAININGRUN_TRAININGOPTIONS, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - max_iterations: - The maximum number of iterations in training. Used only for - iterative training algorithms. - loss_type: - Type of loss function used during training run. - learn_rate: - Learning rate in training. Used only for iterative training - algorithms. - l1_regularization: - L1 regularization coefficient. - l2_regularization: - L2 regularization coefficient. - min_relative_progress: - When early_stop is true, stops training when accuracy - improvement is less than ‘min_relative_progress’. Used only - for iterative training algorithms. 
- warm_start: - Whether to train a model from the last checkpoint. - early_stop: - Whether to stop early when the loss doesn’t improve - significantly any more (compared to min_relative_progress). - Used only for iterative training algorithms. - input_label_columns: - Name of input label columns in training data. - data_split_method: - The data split type for training and evaluation, e.g. RANDOM. - data_split_eval_fraction: - The fraction of evaluation data over the whole input data. The - rest of data will be used as training data. The format should - be double. Accurate to two decimal places. Default value is - 0.2. - data_split_column: - The column to split data with. This column won’t be used as a - feature. 1. When data_split_method is CUSTOM, the - corresponding column should be boolean. The rows with true - value tag are eval data, and the false are training data. 2. - When data_split_method is SEQ, the first - DATA_SPLIT_EVAL_FRACTION rows (from smallest to largest) in - the corresponding column are used as training data, and the - rest are eval data. It respects the order in Orderable data - types: - https://cloud.google.com/bigquery/docs/reference/standard- - sql/data-types#data-type-properties - learn_rate_strategy: - The strategy to determine learn rate for the current - iteration. - initial_learn_rate: - Specifies the initial learning rate for the line search learn - rate strategy. - label_class_weights: - Weights associated with each label class, for rebalancing the - training data. Only applicable for classification models. - distance_type: - Distance type for clustering models. - num_clusters: - Number of clusters for clustering models. - model_uri: - [Beta] Google Cloud Storage URI from which the model was - imported. Only applicable for imported models. - optimization_strategy: - Optimization strategy for training linear regression models. - kmeans_initialization_method: - The method used to initialize the centroids for kmeans - algorithm. 
- kmeans_initialization_column: - The column used to provide the initial centroids for kmeans - algorithm when kmeans_initialization_method is CUSTOM. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.TrainingRun.TrainingOptions) - }, - ), - "IterationResult": _reflection.GeneratedProtocolMessageType( - "IterationResult", - (_message.Message,), - { - "ClusterInfo": _reflection.GeneratedProtocolMessageType( - "ClusterInfo", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_TRAININGRUN_ITERATIONRESULT_CLUSTERINFO, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Information about a single cluster for clustering model. - - Attributes: - centroid_id: - Centroid id. - cluster_radius: - Cluster radius, the average distance from centroid to each - point assigned to the cluster. - cluster_size: - Cluster size, the total number of points assigned to the - cluster. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.TrainingRun.IterationResult.ClusterInfo) - }, - ), - "DESCRIPTOR": _MODEL_TRAININGRUN_ITERATIONRESULT, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Information about a single iteration of the training run. - - Attributes: - index: - Index of the iteration, 0 based. - duration_ms: - Time taken to run the iteration in milliseconds. - training_loss: - Loss computed on the training data at the end of iteration. - eval_loss: - Loss computed on the eval data at the end of iteration. - learn_rate: - Learn rate used for this iteration. - cluster_infos: - Information about top clusters for clustering models. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.TrainingRun.IterationResult) - }, - ), - "DESCRIPTOR": _MODEL_TRAININGRUN, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Information about a single training query run for the model. 
- - Attributes: - training_options: - Options that were used for this training run, includes user - specified and default options that were used. - start_time: - The start time of this training run. - results: - Output of each iteration run, results.size() <= - max_iterations. - evaluation_metrics: - The evaluation metrics over training/eval data that were - computed at the end of training. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.TrainingRun) - }, - ), - "LabelsEntry": _reflection.GeneratedProtocolMessageType( - "LabelsEntry", - (_message.Message,), - { - "DESCRIPTOR": _MODEL_LABELSENTRY, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2" - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model.LabelsEntry) - }, - ), - "DESCRIPTOR": _MODEL, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - etag: - Output only. A hash of this resource. - model_reference: - Required. Unique identifier for this model. - creation_time: - Output only. The time when this model was created, in - millisecs since the epoch. - last_modified_time: - Output only. The time when this model was last modified, in - millisecs since the epoch. - description: - Optional. A user-friendly description of this model. - friendly_name: - Optional. A descriptive name for this model. - labels: - The labels associated with this model. You can use these to - organize and group your models. Label keys and values can be - no longer than 63 characters, can only contain lowercase - letters, numeric characters, underscores and dashes. - International characters are allowed. Label values are - optional. Label keys must start with a letter and each label - in the list must have a different key. - expiration_time: - Optional. The time when this model expires, in milliseconds - since the epoch. If not present, the model will persist - indefinitely. 
Expired models will be deleted and their storage - reclaimed. The defaultTableExpirationMs property of the - encapsulating dataset can be used to set a default - expirationTime on newly created models. - location: - Output only. The geographic location where the model resides. - This value is inherited from the dataset. - encryption_configuration: - Custom encryption configuration (e.g., Cloud KMS keys). This - shows the encryption configuration of the model data while - stored in BigQuery storage. - model_type: - Output only. Type of the model resource. - training_runs: - Output only. Information for all training runs in increasing - order of start_time. - feature_columns: - Output only. Input feature columns that were used to train - this model. - label_columns: - Output only. Label columns that were used to train this model. - The output of the model will have a ``predicted\_`` prefix to - these columns. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.Model) - }, -) -_sym_db.RegisterMessage(Model) -_sym_db.RegisterMessage(Model.KmeansEnums) -_sym_db.RegisterMessage(Model.RegressionMetrics) -_sym_db.RegisterMessage(Model.AggregateClassificationMetrics) -_sym_db.RegisterMessage(Model.BinaryClassificationMetrics) -_sym_db.RegisterMessage(Model.BinaryClassificationMetrics.BinaryConfusionMatrix) -_sym_db.RegisterMessage(Model.MultiClassClassificationMetrics) -_sym_db.RegisterMessage(Model.MultiClassClassificationMetrics.ConfusionMatrix) -_sym_db.RegisterMessage(Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry) -_sym_db.RegisterMessage(Model.MultiClassClassificationMetrics.ConfusionMatrix.Row) -_sym_db.RegisterMessage(Model.ClusteringMetrics) -_sym_db.RegisterMessage(Model.ClusteringMetrics.Cluster) -_sym_db.RegisterMessage(Model.ClusteringMetrics.Cluster.FeatureValue) -_sym_db.RegisterMessage(Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue) -_sym_db.RegisterMessage( - 
Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount -) -_sym_db.RegisterMessage(Model.EvaluationMetrics) -_sym_db.RegisterMessage(Model.TrainingRun) -_sym_db.RegisterMessage(Model.TrainingRun.TrainingOptions) -_sym_db.RegisterMessage(Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry) -_sym_db.RegisterMessage(Model.TrainingRun.IterationResult) -_sym_db.RegisterMessage(Model.TrainingRun.IterationResult.ClusterInfo) -_sym_db.RegisterMessage(Model.LabelsEntry) - -GetModelRequest = _reflection.GeneratedProtocolMessageType( - "GetModelRequest", - (_message.Message,), - { - "DESCRIPTOR": _GETMODELREQUEST, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - project_id: - Required. Project ID of the requested model. - dataset_id: - Required. Dataset ID of the requested model. - model_id: - Required. Model ID of the requested model. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.GetModelRequest) - }, -) -_sym_db.RegisterMessage(GetModelRequest) - -PatchModelRequest = _reflection.GeneratedProtocolMessageType( - "PatchModelRequest", - (_message.Message,), - { - "DESCRIPTOR": _PATCHMODELREQUEST, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - project_id: - Required. Project ID of the model to patch. - dataset_id: - Required. Dataset ID of the model to patch. - model_id: - Required. Model ID of the model to patch. - model: - Required. Patched model. Follows RFC5789 patch semantics. - Missing fields are not updated. To clear a field, explicitly - set to default value. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.PatchModelRequest) - }, -) -_sym_db.RegisterMessage(PatchModelRequest) - -DeleteModelRequest = _reflection.GeneratedProtocolMessageType( - "DeleteModelRequest", - (_message.Message,), - { - "DESCRIPTOR": _DELETEMODELREQUEST, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - project_id: - Required. Project ID of the model to delete. - dataset_id: - Required. Dataset ID of the model to delete. - model_id: - Required. Model ID of the model to delete. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.DeleteModelRequest) - }, -) -_sym_db.RegisterMessage(DeleteModelRequest) - -ListModelsRequest = _reflection.GeneratedProtocolMessageType( - "ListModelsRequest", - (_message.Message,), - { - "DESCRIPTOR": _LISTMODELSREQUEST, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - project_id: - Required. Project ID of the models to list. - dataset_id: - Required. Dataset ID of the models to list. - max_results: - The maximum number of results to return in a single response - page. Leverage the page tokens to iterate through the entire - collection. - page_token: - Page token, returned by a previous call to request the next - page of results - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.ListModelsRequest) - }, -) -_sym_db.RegisterMessage(ListModelsRequest) - -ListModelsResponse = _reflection.GeneratedProtocolMessageType( - "ListModelsResponse", - (_message.Message,), - { - "DESCRIPTOR": _LISTMODELSRESPONSE, - "__module__": "google.cloud.bigquery_v2.proto.model_pb2", - "__doc__": """Protocol buffer. - - Attributes: - models: - Models in the requested dataset. Only the following fields are - populated: model_reference, model_type, creation_time, - last_modified_time and labels. - next_page_token: - A token to request the next page of results. 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.ListModelsResponse) - }, -) -_sym_db.RegisterMessage(ListModelsResponse) - - -DESCRIPTOR._options = None -_MODEL_TRAININGRUN_TRAININGOPTIONS_LABELCLASSWEIGHTSENTRY._options = None -_MODEL_LABELSENTRY._options = None -_MODEL.fields_by_name["etag"]._options = None -_MODEL.fields_by_name["model_reference"]._options = None -_MODEL.fields_by_name["creation_time"]._options = None -_MODEL.fields_by_name["last_modified_time"]._options = None -_MODEL.fields_by_name["description"]._options = None -_MODEL.fields_by_name["friendly_name"]._options = None -_MODEL.fields_by_name["expiration_time"]._options = None -_MODEL.fields_by_name["location"]._options = None -_MODEL.fields_by_name["model_type"]._options = None -_MODEL.fields_by_name["training_runs"]._options = None -_MODEL.fields_by_name["feature_columns"]._options = None -_MODEL.fields_by_name["label_columns"]._options = None -_GETMODELREQUEST.fields_by_name["project_id"]._options = None -_GETMODELREQUEST.fields_by_name["dataset_id"]._options = None -_GETMODELREQUEST.fields_by_name["model_id"]._options = None -_PATCHMODELREQUEST.fields_by_name["project_id"]._options = None -_PATCHMODELREQUEST.fields_by_name["dataset_id"]._options = None -_PATCHMODELREQUEST.fields_by_name["model_id"]._options = None -_PATCHMODELREQUEST.fields_by_name["model"]._options = None -_DELETEMODELREQUEST.fields_by_name["project_id"]._options = None -_DELETEMODELREQUEST.fields_by_name["dataset_id"]._options = None -_DELETEMODELREQUEST.fields_by_name["model_id"]._options = None -_LISTMODELSREQUEST.fields_by_name["project_id"]._options = None -_LISTMODELSREQUEST.fields_by_name["dataset_id"]._options = None - -_MODELSERVICE = _descriptor.ServiceDescriptor( - name="ModelService", - full_name="google.cloud.bigquery.v2.ModelService", - file=DESCRIPTOR, - index=0, - 
serialized_options=b"\312A\027bigquery.googleapis.com\322A\302\001https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/bigquery.readonly,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/cloud-platform.read-only", - create_key=_descriptor._internal_create_key, - serialized_start=7804, - serialized_end=8566, - methods=[ - _descriptor.MethodDescriptor( - name="GetModel", - full_name="google.cloud.bigquery.v2.ModelService.GetModel", - index=0, - containing_service=None, - input_type=_GETMODELREQUEST, - output_type=_MODEL, - serialized_options=b"\332A\036project_id,dataset_id,model_id", - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name="ListModels", - full_name="google.cloud.bigquery.v2.ModelService.ListModels", - index=1, - containing_service=None, - input_type=_LISTMODELSREQUEST, - output_type=_LISTMODELSRESPONSE, - serialized_options=b"\332A!project_id,dataset_id,max_results", - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name="PatchModel", - full_name="google.cloud.bigquery.v2.ModelService.PatchModel", - index=2, - containing_service=None, - input_type=_PATCHMODELREQUEST, - output_type=_MODEL, - serialized_options=b"\332A$project_id,dataset_id,model_id,model", - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name="DeleteModel", - full_name="google.cloud.bigquery.v2.ModelService.DeleteModel", - index=3, - containing_service=None, - input_type=_DELETEMODELREQUEST, - output_type=google_dot_protobuf_dot_empty__pb2._EMPTY, - serialized_options=b"\332A\036project_id,dataset_id,model_id", - create_key=_descriptor._internal_create_key, - ), - ], -) -_sym_db.RegisterServiceDescriptor(_MODELSERVICE) - -DESCRIPTOR.services_by_name["ModelService"] = _MODELSERVICE - -# @@protoc_insertion_point(module_scope) diff --git a/google/cloud/bigquery_v2/proto/model_pb2_grpc.py 
b/google/cloud/bigquery_v2/proto/model_pb2_grpc.py deleted file mode 100644 index 13db95717..000000000 --- a/google/cloud/bigquery_v2/proto/model_pb2_grpc.py +++ /dev/null @@ -1,214 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.cloud.bigquery_v2.proto import ( - model_pb2 as google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2, -) -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 - - -class ModelServiceStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.GetModel = channel.unary_unary( - "/google.cloud.bigquery.v2.ModelService/GetModel", - request_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.GetModelRequest.SerializeToString, - response_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.Model.FromString, - ) - self.ListModels = channel.unary_unary( - "/google.cloud.bigquery.v2.ModelService/ListModels", - request_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.ListModelsRequest.SerializeToString, - response_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.ListModelsResponse.FromString, - ) - self.PatchModel = channel.unary_unary( - "/google.cloud.bigquery.v2.ModelService/PatchModel", - request_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.PatchModelRequest.SerializeToString, - response_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.Model.FromString, - ) - self.DeleteModel = channel.unary_unary( - "/google.cloud.bigquery.v2.ModelService/DeleteModel", - request_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.DeleteModelRequest.SerializeToString, - 
response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) - - -class ModelServiceServicer(object): - """Missing associated documentation comment in .proto file.""" - - def GetModel(self, request, context): - """Gets the specified model resource by model ID. - """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def ListModels(self, request, context): - """Lists all models in the specified dataset. Requires the READER dataset - role. - """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PatchModel(self, request, context): - """Patch specific fields in the specified model. - """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def DeleteModel(self, request, context): - """Deletes the model specified by modelId from the dataset. 
- """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_ModelServiceServicer_to_server(servicer, server): - rpc_method_handlers = { - "GetModel": grpc.unary_unary_rpc_method_handler( - servicer.GetModel, - request_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.GetModelRequest.FromString, - response_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.Model.SerializeToString, - ), - "ListModels": grpc.unary_unary_rpc_method_handler( - servicer.ListModels, - request_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.ListModelsRequest.FromString, - response_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.ListModelsResponse.SerializeToString, - ), - "PatchModel": grpc.unary_unary_rpc_method_handler( - servicer.PatchModel, - request_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.PatchModelRequest.FromString, - response_serializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.Model.SerializeToString, - ), - "DeleteModel": grpc.unary_unary_rpc_method_handler( - servicer.DeleteModel, - request_deserializer=google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.DeleteModelRequest.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "google.cloud.bigquery.v2.ModelService", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. 
-class ModelService(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def GetModel( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/google.cloud.bigquery.v2.ModelService/GetModel", - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.GetModelRequest.SerializeToString, - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.Model.FromString, - options, - channel_credentials, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def ListModels( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/google.cloud.bigquery.v2.ModelService/ListModels", - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.ListModelsRequest.SerializeToString, - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.ListModelsResponse.FromString, - options, - channel_credentials, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def PatchModel( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/google.cloud.bigquery.v2.ModelService/PatchModel", - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.PatchModelRequest.SerializeToString, - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.Model.FromString, - options, - channel_credentials, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def DeleteModel( - request, - target, - 
options=(), - channel_credentials=None, - call_credentials=None, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/google.cloud.bigquery.v2.ModelService/DeleteModel", - google_dot_cloud_dot_bigquery__v2_dot_proto_dot_model__pb2.DeleteModelRequest.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) diff --git a/google/cloud/bigquery_v2/proto/model_reference.proto b/google/cloud/bigquery_v2/proto/model_reference.proto deleted file mode 100644 index fadd17514..000000000 --- a/google/cloud/bigquery_v2/proto/model_reference.proto +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2019 Google LLC. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; - -package google.cloud.bigquery.v2; - -import "google/api/field_behavior.proto"; -import "google/api/annotations.proto"; - -option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery"; -option java_outer_classname = "ModelReferenceProto"; -option java_package = "com.google.cloud.bigquery.v2"; - -// Id path of a model. -message ModelReference { - // Required. The ID of the project containing this model. - string project_id = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. The ID of the dataset containing this model. 
- string dataset_id = 2 [(google.api.field_behavior) = REQUIRED]; - - // Required. The ID of the model. The ID must contain only - // letters (a-z, A-Z), numbers (0-9), or underscores (_). The maximum - // length is 1,024 characters. - string model_id = 3 [(google.api.field_behavior) = REQUIRED]; -} diff --git a/google/cloud/bigquery_v2/proto/model_reference_pb2.py b/google/cloud/bigquery_v2/proto/model_reference_pb2.py deleted file mode 100644 index 07d7e4c4b..000000000 --- a/google/cloud/bigquery_v2/proto/model_reference_pb2.py +++ /dev/null @@ -1,142 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: google/cloud/bigquery_v2/proto/model_reference.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.api import field_behavior_pb2 as google_dot_api_dot_field__behavior__pb2 -from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name="google/cloud/bigquery_v2/proto/model_reference.proto", - package="google.cloud.bigquery.v2", - syntax="proto3", - serialized_options=b"\n\034com.google.cloud.bigquery.v2B\023ModelReferenceProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery", - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n4google/cloud/bigquery_v2/proto/model_reference.proto\x12\x18google.cloud.bigquery.v2\x1a\x1fgoogle/api/field_behavior.proto\x1a\x1cgoogle/api/annotations.proto"Y\n\x0eModelReference\x12\x17\n\nproject_id\x18\x01 \x01(\tB\x03\xe0\x41\x02\x12\x17\n\ndataset_id\x18\x02 \x01(\tB\x03\xe0\x41\x02\x12\x15\n\x08model_id\x18\x03 
\x01(\tB\x03\xe0\x41\x02\x42u\n\x1c\x63om.google.cloud.bigquery.v2B\x13ModelReferenceProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigqueryb\x06proto3', - dependencies=[ - google_dot_api_dot_field__behavior__pb2.DESCRIPTOR, - google_dot_api_dot_annotations__pb2.DESCRIPTOR, - ], -) - - -_MODELREFERENCE = _descriptor.Descriptor( - name="ModelReference", - full_name="google.cloud.bigquery.v2.ModelReference", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="project_id", - full_name="google.cloud.bigquery.v2.ModelReference.project_id", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="dataset_id", - full_name="google.cloud.bigquery.v2.ModelReference.dataset_id", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_id", - full_name="google.cloud.bigquery.v2.ModelReference.model_id", - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - 
is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=145, - serialized_end=234, -) - -DESCRIPTOR.message_types_by_name["ModelReference"] = _MODELREFERENCE -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -ModelReference = _reflection.GeneratedProtocolMessageType( - "ModelReference", - (_message.Message,), - { - "DESCRIPTOR": _MODELREFERENCE, - "__module__": "google.cloud.bigquery_v2.proto.model_reference_pb2", - "__doc__": """Id path of a model. - - Attributes: - project_id: - Required. The ID of the project containing this model. - dataset_id: - Required. The ID of the dataset containing this model. - model_id: - Required. The ID of the model. The ID must contain only - letters (a-z, A-Z), numbers (0-9), or underscores (_). The - maximum length is 1,024 characters. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.ModelReference) - }, -) -_sym_db.RegisterMessage(ModelReference) - - -DESCRIPTOR._options = None -_MODELREFERENCE.fields_by_name["project_id"]._options = None -_MODELREFERENCE.fields_by_name["dataset_id"]._options = None -_MODELREFERENCE.fields_by_name["model_id"]._options = None -# @@protoc_insertion_point(module_scope) diff --git a/google/cloud/bigquery_v2/proto/model_reference_pb2_grpc.py b/google/cloud/bigquery_v2/proto/model_reference_pb2_grpc.py deleted file mode 100644 index 8a9393943..000000000 --- a/google/cloud/bigquery_v2/proto/model_reference_pb2_grpc.py +++ /dev/null @@ -1,3 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc diff --git a/google/cloud/bigquery_v2/proto/standard_sql.proto b/google/cloud/bigquery_v2/proto/standard_sql.proto deleted file mode 100644 index ff69dfc4e..000000000 --- a/google/cloud/bigquery_v2/proto/standard_sql.proto +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2019 Google LLC. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; - -package google.cloud.bigquery.v2; - -import "google/api/field_behavior.proto"; -import "google/api/annotations.proto"; - -option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery"; -option java_outer_classname = "StandardSqlProto"; -option java_package = "com.google.cloud.bigquery.v2"; - -// The type of a variable, e.g., a function argument. -// Examples: -// INT64: {type_kind="INT64"} -// ARRAY: {type_kind="ARRAY", array_element_type="STRING"} -// STRUCT>: -// {type_kind="STRUCT", -// struct_type={fields=[ -// {name="x", type={type_kind="STRING"}}, -// {name="y", type={type_kind="ARRAY", array_element_type="DATE"}} -// ]}} -message StandardSqlDataType { - enum TypeKind { - // Invalid type. - TYPE_KIND_UNSPECIFIED = 0; - - // Encoded as a string in decimal format. - INT64 = 2; - - // Encoded as a boolean "false" or "true". - BOOL = 5; - - // Encoded as a number, or string "NaN", "Infinity" or "-Infinity". - FLOAT64 = 7; - - // Encoded as a string value. - STRING = 8; - - // Encoded as a base64 string per RFC 4648, section 4. 
- BYTES = 9; - - // Encoded as an RFC 3339 timestamp with mandatory "Z" time zone string: - // 1985-04-12T23:20:50.52Z - TIMESTAMP = 19; - - // Encoded as RFC 3339 full-date format string: 1985-04-12 - DATE = 10; - - // Encoded as RFC 3339 partial-time format string: 23:20:50.52 - TIME = 20; - - // Encoded as RFC 3339 full-date "T" partial-time: 1985-04-12T23:20:50.52 - DATETIME = 21; - - // Encoded as WKT - GEOGRAPHY = 22; - - // Encoded as a decimal string. - NUMERIC = 23; - - // Encoded as a list with types matching Type.array_type. - ARRAY = 16; - - // Encoded as a list with fields of type Type.struct_type[i]. List is used - // because a JSON object cannot have duplicate field names. - STRUCT = 17; - } - - // Required. The top level type of this field. - // Can be any standard SQL data type (e.g., "INT64", "DATE", "ARRAY"). - TypeKind type_kind = 1 [(google.api.field_behavior) = REQUIRED]; - - oneof sub_type { - // The type of the array's elements, if type_kind = "ARRAY". - StandardSqlDataType array_element_type = 2; - - // The fields of this struct, in order, if type_kind = "STRUCT". - StandardSqlStructType struct_type = 3; - } -} - -// A field or a column. -message StandardSqlField { - // Optional. The name of this field. Can be absent for struct fields. - string name = 1 [(google.api.field_behavior) = OPTIONAL]; - - // Optional. The type of this parameter. Absent if not explicitly - // specified (e.g., CREATE FUNCTION statement can omit the return type; - // in this case the output parameter does not have this "type" field). 
- StandardSqlDataType type = 2 [(google.api.field_behavior) = OPTIONAL]; -} - -message StandardSqlStructType { - repeated StandardSqlField fields = 1; -} diff --git a/google/cloud/bigquery_v2/proto/standard_sql_pb2.py b/google/cloud/bigquery_v2/proto/standard_sql_pb2.py deleted file mode 100644 index 15f6715a2..000000000 --- a/google/cloud/bigquery_v2/proto/standard_sql_pb2.py +++ /dev/null @@ -1,442 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: google/cloud/bigquery_v2/proto/standard_sql.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.api import field_behavior_pb2 as google_dot_api_dot_field__behavior__pb2 -from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name="google/cloud/bigquery_v2/proto/standard_sql.proto", - package="google.cloud.bigquery.v2", - syntax="proto3", - serialized_options=b"\n\034com.google.cloud.bigquery.v2B\020StandardSqlProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery", - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n1google/cloud/bigquery_v2/proto/standard_sql.proto\x12\x18google.cloud.bigquery.v2\x1a\x1fgoogle/api/field_behavior.proto\x1a\x1cgoogle/api/annotations.proto"\xcb\x03\n\x13StandardSqlDataType\x12N\n\ttype_kind\x18\x01 \x01(\x0e\x32\x36.google.cloud.bigquery.v2.StandardSqlDataType.TypeKindB\x03\xe0\x41\x02\x12K\n\x12\x61rray_element_type\x18\x02 \x01(\x0b\x32-.google.cloud.bigquery.v2.StandardSqlDataTypeH\x00\x12\x46\n\x0bstruct_type\x18\x03 
\x01(\x0b\x32/.google.cloud.bigquery.v2.StandardSqlStructTypeH\x00"\xc2\x01\n\x08TypeKind\x12\x19\n\x15TYPE_KIND_UNSPECIFIED\x10\x00\x12\t\n\x05INT64\x10\x02\x12\x08\n\x04\x42OOL\x10\x05\x12\x0b\n\x07\x46LOAT64\x10\x07\x12\n\n\x06STRING\x10\x08\x12\t\n\x05\x42YTES\x10\t\x12\r\n\tTIMESTAMP\x10\x13\x12\x08\n\x04\x44\x41TE\x10\n\x12\x08\n\x04TIME\x10\x14\x12\x0c\n\x08\x44\x41TETIME\x10\x15\x12\r\n\tGEOGRAPHY\x10\x16\x12\x0b\n\x07NUMERIC\x10\x17\x12\t\n\x05\x41RRAY\x10\x10\x12\n\n\x06STRUCT\x10\x11\x42\n\n\x08sub_type"g\n\x10StandardSqlField\x12\x11\n\x04name\x18\x01 \x01(\tB\x03\xe0\x41\x01\x12@\n\x04type\x18\x02 \x01(\x0b\x32-.google.cloud.bigquery.v2.StandardSqlDataTypeB\x03\xe0\x41\x01"S\n\x15StandardSqlStructType\x12:\n\x06\x66ields\x18\x01 \x03(\x0b\x32*.google.cloud.bigquery.v2.StandardSqlFieldBr\n\x1c\x63om.google.cloud.bigquery.v2B\x10StandardSqlProtoZ@google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigqueryb\x06proto3', - dependencies=[ - google_dot_api_dot_field__behavior__pb2.DESCRIPTOR, - google_dot_api_dot_annotations__pb2.DESCRIPTOR, - ], -) - - -_STANDARDSQLDATATYPE_TYPEKIND = _descriptor.EnumDescriptor( - name="TypeKind", - full_name="google.cloud.bigquery.v2.StandardSqlDataType.TypeKind", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="TYPE_KIND_UNSPECIFIED", - index=0, - number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="INT64", - index=1, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="BOOL", - index=2, - number=5, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="FLOAT64", - index=3, - number=7, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, 
- ), - _descriptor.EnumValueDescriptor( - name="STRING", - index=4, - number=8, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="BYTES", - index=5, - number=9, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="TIMESTAMP", - index=6, - number=19, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="DATE", - index=7, - number=10, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="TIME", - index=8, - number=20, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="DATETIME", - index=9, - number=21, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="GEOGRAPHY", - index=10, - number=22, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="NUMERIC", - index=11, - number=23, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="ARRAY", - index=12, - number=16, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="STRUCT", - index=13, - number=17, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=396, - serialized_end=590, -) -_sym_db.RegisterEnumDescriptor(_STANDARDSQLDATATYPE_TYPEKIND) - - -_STANDARDSQLDATATYPE = _descriptor.Descriptor( - name="StandardSqlDataType", - full_name="google.cloud.bigquery.v2.StandardSqlDataType", 
- filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="type_kind", - full_name="google.cloud.bigquery.v2.StandardSqlDataType.type_kind", - index=0, - number=1, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\002", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="array_element_type", - full_name="google.cloud.bigquery.v2.StandardSqlDataType.array_element_type", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="struct_type", - full_name="google.cloud.bigquery.v2.StandardSqlDataType.struct_type", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[_STANDARDSQLDATATYPE_TYPEKIND,], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name="sub_type", - full_name="google.cloud.bigquery.v2.StandardSqlDataType.sub_type", - index=0, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[], - ), - ], - serialized_start=143, - serialized_end=602, -) - - -_STANDARDSQLFIELD = _descriptor.Descriptor( - name="StandardSqlField", - 
full_name="google.cloud.bigquery.v2.StandardSqlField", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="name", - full_name="google.cloud.bigquery.v2.StandardSqlField.name", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="type", - full_name="google.cloud.bigquery.v2.StandardSqlField.type", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\340A\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=604, - serialized_end=707, -) - - -_STANDARDSQLSTRUCTTYPE = _descriptor.Descriptor( - name="StandardSqlStructType", - full_name="google.cloud.bigquery.v2.StandardSqlStructType", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="fields", - full_name="google.cloud.bigquery.v2.StandardSqlStructType.fields", - index=0, - number=1, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], 
- enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto3", - extension_ranges=[], - oneofs=[], - serialized_start=709, - serialized_end=792, -) - -_STANDARDSQLDATATYPE.fields_by_name[ - "type_kind" -].enum_type = _STANDARDSQLDATATYPE_TYPEKIND -_STANDARDSQLDATATYPE.fields_by_name[ - "array_element_type" -].message_type = _STANDARDSQLDATATYPE -_STANDARDSQLDATATYPE.fields_by_name["struct_type"].message_type = _STANDARDSQLSTRUCTTYPE -_STANDARDSQLDATATYPE_TYPEKIND.containing_type = _STANDARDSQLDATATYPE -_STANDARDSQLDATATYPE.oneofs_by_name["sub_type"].fields.append( - _STANDARDSQLDATATYPE.fields_by_name["array_element_type"] -) -_STANDARDSQLDATATYPE.fields_by_name[ - "array_element_type" -].containing_oneof = _STANDARDSQLDATATYPE.oneofs_by_name["sub_type"] -_STANDARDSQLDATATYPE.oneofs_by_name["sub_type"].fields.append( - _STANDARDSQLDATATYPE.fields_by_name["struct_type"] -) -_STANDARDSQLDATATYPE.fields_by_name[ - "struct_type" -].containing_oneof = _STANDARDSQLDATATYPE.oneofs_by_name["sub_type"] -_STANDARDSQLFIELD.fields_by_name["type"].message_type = _STANDARDSQLDATATYPE -_STANDARDSQLSTRUCTTYPE.fields_by_name["fields"].message_type = _STANDARDSQLFIELD -DESCRIPTOR.message_types_by_name["StandardSqlDataType"] = _STANDARDSQLDATATYPE -DESCRIPTOR.message_types_by_name["StandardSqlField"] = _STANDARDSQLFIELD -DESCRIPTOR.message_types_by_name["StandardSqlStructType"] = _STANDARDSQLSTRUCTTYPE -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -StandardSqlDataType = _reflection.GeneratedProtocolMessageType( - "StandardSqlDataType", - (_message.Message,), - { - "DESCRIPTOR": _STANDARDSQLDATATYPE, - "__module__": "google.cloud.bigquery_v2.proto.standard_sql_pb2", - "__doc__": """The type of a variable, e.g., a function argument. 
Examples: INT64: - {type_kind=``INT64``} ARRAY: {type_kind=``ARRAY``, - array_element_type=``STRING``} STRUCT: - {type_kind=``STRUCT``, struct_type={fields=[ {name=``x``, - type={type_kind=``STRING``}}, {name=``y``, type={type_kind=``ARRAY``, - array_element_type=``DATE``}} ]}} - - Attributes: - type_kind: - Required. The top level type of this field. Can be any - standard SQL data type (e.g., ``INT64``, ``DATE``, ``ARRAY``). - array_element_type: - The type of the array’s elements, if type_kind = ``ARRAY``. - struct_type: - The fields of this struct, in order, if type_kind = ``STRUCT``. - """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.StandardSqlDataType) - }, -) -_sym_db.RegisterMessage(StandardSqlDataType) - -StandardSqlField = _reflection.GeneratedProtocolMessageType( - "StandardSqlField", - (_message.Message,), - { - "DESCRIPTOR": _STANDARDSQLFIELD, - "__module__": "google.cloud.bigquery_v2.proto.standard_sql_pb2", - "__doc__": """A field or a column. - - Attributes: - name: - Optional. The name of this field. Can be absent for struct - fields. - type: - Optional. The type of this parameter. Absent if not explicitly - specified (e.g., CREATE FUNCTION statement can omit the return - type; in this case the output parameter does not have this - ``type`` field). 
- """, - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.StandardSqlField) - }, -) -_sym_db.RegisterMessage(StandardSqlField) - -StandardSqlStructType = _reflection.GeneratedProtocolMessageType( - "StandardSqlStructType", - (_message.Message,), - { - "DESCRIPTOR": _STANDARDSQLSTRUCTTYPE, - "__module__": "google.cloud.bigquery_v2.proto.standard_sql_pb2" - # @@protoc_insertion_point(class_scope:google.cloud.bigquery.v2.StandardSqlStructType) - }, -) -_sym_db.RegisterMessage(StandardSqlStructType) - - -DESCRIPTOR._options = None -_STANDARDSQLDATATYPE.fields_by_name["type_kind"]._options = None -_STANDARDSQLFIELD.fields_by_name["name"]._options = None -_STANDARDSQLFIELD.fields_by_name["type"]._options = None -# @@protoc_insertion_point(module_scope) diff --git a/google/cloud/bigquery_v2/proto/standard_sql_pb2_grpc.py b/google/cloud/bigquery_v2/proto/standard_sql_pb2_grpc.py deleted file mode 100644 index 8a9393943..000000000 --- a/google/cloud/bigquery_v2/proto/standard_sql_pb2_grpc.py +++ /dev/null @@ -1,3 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc diff --git a/google/cloud/bigquery_v2/py.typed b/google/cloud/bigquery_v2/py.typed new file mode 100644 index 000000000..e73777993 --- /dev/null +++ b/google/cloud/bigquery_v2/py.typed @@ -0,0 +1,2 @@ +# Marker file for PEP 561. +# The google-cloud-bigquery package uses inline types. diff --git a/google/cloud/bigquery_v2/types.py b/google/cloud/bigquery_v2/types.py deleted file mode 100644 index 7d4f9b732..000000000 --- a/google/cloud/bigquery_v2/types.py +++ /dev/null @@ -1,58 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import absolute_import -import sys - -from google.api_core.protobuf_helpers import get_messages - -from google.cloud.bigquery_v2.proto import encryption_config_pb2 -from google.cloud.bigquery_v2.proto import model_pb2 -from google.cloud.bigquery_v2.proto import model_reference_pb2 -from google.cloud.bigquery_v2.proto import standard_sql_pb2 -from google.protobuf import empty_pb2 -from google.protobuf import timestamp_pb2 -from google.protobuf import wrappers_pb2 - - -_shared_modules = [ - empty_pb2, - timestamp_pb2, - wrappers_pb2, -] - -_local_modules = [ - encryption_config_pb2, - model_pb2, - model_reference_pb2, - standard_sql_pb2, -] - -names = [] - -for module in _shared_modules: # pragma: NO COVER - for name, message in get_messages(module).items(): - setattr(sys.modules[__name__], name, message) - names.append(name) -for module in _local_modules: - for name, message in get_messages(module).items(): - message.__module__ = "google.cloud.bigquery_v2.types" - setattr(sys.modules[__name__], name, message) - names.append(name) - - -__all__ = tuple(sorted(names)) diff --git a/google/cloud/bigquery_v2/types/__init__.py b/google/cloud/bigquery_v2/types/__init__.py new file mode 100644 index 000000000..83bbb3a54 --- /dev/null +++ b/google/cloud/bigquery_v2/types/__init__.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from .encryption_config import EncryptionConfiguration +from .model import ( + DeleteModelRequest, + GetModelRequest, + ListModelsRequest, + ListModelsResponse, + Model, + PatchModelRequest, +) +from .model_reference import ModelReference +from .standard_sql import ( + StandardSqlDataType, + StandardSqlField, + StandardSqlStructType, + StandardSqlTableType, +) +from .table_reference import TableReference + +__all__ = ( + "EncryptionConfiguration", + "DeleteModelRequest", + "GetModelRequest", + "ListModelsRequest", + "ListModelsResponse", + "Model", + "PatchModelRequest", + "ModelReference", + "StandardSqlDataType", + "StandardSqlField", + "StandardSqlStructType", + "StandardSqlTableType", + "TableReference", +) diff --git a/google/cloud/bigquery_v2/types/encryption_config.py b/google/cloud/bigquery_v2/types/encryption_config.py new file mode 100644 index 000000000..4b9139733 --- /dev/null +++ b/google/cloud/bigquery_v2/types/encryption_config.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +import proto # type: ignore + +from google.protobuf import wrappers_pb2 # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.bigquery.v2", manifest={"EncryptionConfiguration",}, +) + + +class EncryptionConfiguration(proto.Message): + r"""Cloud KMS encryption settings for a destination BigQuery table. + Attributes: + kms_key_name (google.protobuf.wrappers_pb2.StringValue): + Optional. Describes the Cloud KMS encryption + key that will be used to protect destination + BigQuery table. The BigQuery Service Account + associated with your project requires access to + this encryption key. + """ + + kms_key_name = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.StringValue, + ) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/google/cloud/bigquery_v2/types/model.py b/google/cloud/bigquery_v2/types/model.py new file mode 100644 index 000000000..706418401 --- /dev/null +++ b/google/cloud/bigquery_v2/types/model.py @@ -0,0 +1,1507 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# +import proto # type: ignore + +from google.cloud.bigquery_v2.types import encryption_config +from google.cloud.bigquery_v2.types import model_reference as gcb_model_reference +from google.cloud.bigquery_v2.types import standard_sql +from google.cloud.bigquery_v2.types import table_reference +from google.protobuf import timestamp_pb2 # type: ignore +from google.protobuf import wrappers_pb2 # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.bigquery.v2", + manifest={ + "Model", + "GetModelRequest", + "PatchModelRequest", + "DeleteModelRequest", + "ListModelsRequest", + "ListModelsResponse", + }, +) + + +class Model(proto.Message): + r""" + Attributes: + etag (str): + Output only. A hash of this resource. + model_reference (google.cloud.bigquery_v2.types.ModelReference): + Required. Unique identifier for this model. + creation_time (int): + Output only. The time when this model was + created, in millisecs since the epoch. + last_modified_time (int): + Output only. The time when this model was + last modified, in millisecs since the epoch. + description (str): + Optional. A user-friendly description of this + model. + friendly_name (str): + Optional. A descriptive name for this model. + labels (Sequence[google.cloud.bigquery_v2.types.Model.LabelsEntry]): + The labels associated with this model. You + can use these to organize and group your models. + Label keys and values can be no longer than 63 + characters, can only contain lowercase letters, + numeric characters, underscores and dashes. + International characters are allowed. Label + values are optional. Label keys must start with + a letter and each label in the list must have a + different key. + expiration_time (int): + Optional. The time when this model expires, + in milliseconds since the epoch. If not present, + the model will persist indefinitely. Expired + models will be deleted and their storage + reclaimed. 
The defaultTableExpirationMs + property of the encapsulating dataset can be + used to set a default expirationTime on newly + created models. + location (str): + Output only. The geographic location where + the model resides. This value is inherited from + the dataset. + encryption_configuration (google.cloud.bigquery_v2.types.EncryptionConfiguration): + Custom encryption configuration (e.g., Cloud + KMS keys). This shows the encryption + configuration of the model data while stored in + BigQuery storage. This field can be used with + PatchModel to update encryption key for an + already encrypted model. + model_type (google.cloud.bigquery_v2.types.Model.ModelType): + Output only. Type of the model resource. + training_runs (Sequence[google.cloud.bigquery_v2.types.Model.TrainingRun]): + Output only. Information for all training runs in increasing + order of start_time. + feature_columns (Sequence[google.cloud.bigquery_v2.types.StandardSqlField]): + Output only. Input feature columns that were + used to train this model. + label_columns (Sequence[google.cloud.bigquery_v2.types.StandardSqlField]): + Output only. Label columns that were used to train this + model. The output of the model will have a `predicted_` + prefix to these columns. + best_trial_id (int): + The best trial_id across all training runs. + """ + + class ModelType(proto.Enum): + r"""Indicates the type of the Model.""" + MODEL_TYPE_UNSPECIFIED = 0 + LINEAR_REGRESSION = 1 + LOGISTIC_REGRESSION = 2 + KMEANS = 3 + MATRIX_FACTORIZATION = 4 + DNN_CLASSIFIER = 5 + TENSORFLOW = 6 + DNN_REGRESSOR = 7 + BOOSTED_TREE_REGRESSOR = 9 + BOOSTED_TREE_CLASSIFIER = 10 + ARIMA = 11 + AUTOML_REGRESSOR = 12 + AUTOML_CLASSIFIER = 13 + ARIMA_PLUS = 19 + + class LossType(proto.Enum): + r"""Loss metric to evaluate model training performance.""" + LOSS_TYPE_UNSPECIFIED = 0 + MEAN_SQUARED_LOSS = 1 + MEAN_LOG_LOSS = 2 + + class DistanceType(proto.Enum): + r"""Distance metric used to compute the distance between two + points. 
+ """ + DISTANCE_TYPE_UNSPECIFIED = 0 + EUCLIDEAN = 1 + COSINE = 2 + + class DataSplitMethod(proto.Enum): + r"""Indicates the method to split input data into multiple + tables. + """ + DATA_SPLIT_METHOD_UNSPECIFIED = 0 + RANDOM = 1 + CUSTOM = 2 + SEQUENTIAL = 3 + NO_SPLIT = 4 + AUTO_SPLIT = 5 + + class DataFrequency(proto.Enum): + r"""Type of supported data frequency for time series forecasting + models. + """ + DATA_FREQUENCY_UNSPECIFIED = 0 + AUTO_FREQUENCY = 1 + YEARLY = 2 + QUARTERLY = 3 + MONTHLY = 4 + WEEKLY = 5 + DAILY = 6 + HOURLY = 7 + PER_MINUTE = 8 + + class HolidayRegion(proto.Enum): + r"""Type of supported holiday regions for time series forecasting + models. + """ + HOLIDAY_REGION_UNSPECIFIED = 0 + GLOBAL = 1 + NA = 2 + JAPAC = 3 + EMEA = 4 + LAC = 5 + AE = 6 + AR = 7 + AT = 8 + AU = 9 + BE = 10 + BR = 11 + CA = 12 + CH = 13 + CL = 14 + CN = 15 + CO = 16 + CS = 17 + CZ = 18 + DE = 19 + DK = 20 + DZ = 21 + EC = 22 + EE = 23 + EG = 24 + ES = 25 + FI = 26 + FR = 27 + GB = 28 + GR = 29 + HK = 30 + HU = 31 + ID = 32 + IE = 33 + IL = 34 + IN = 35 + IR = 36 + IT = 37 + JP = 38 + KR = 39 + LV = 40 + MA = 41 + MX = 42 + MY = 43 + NG = 44 + NL = 45 + NO = 46 + NZ = 47 + PE = 48 + PH = 49 + PK = 50 + PL = 51 + PT = 52 + RO = 53 + RS = 54 + RU = 55 + SA = 56 + SE = 57 + SG = 58 + SI = 59 + SK = 60 + TH = 61 + TR = 62 + TW = 63 + UA = 64 + US = 65 + VE = 66 + VN = 67 + ZA = 68 + + class LearnRateStrategy(proto.Enum): + r"""Indicates the learning rate optimization strategy to use.""" + LEARN_RATE_STRATEGY_UNSPECIFIED = 0 + LINE_SEARCH = 1 + CONSTANT = 2 + + class OptimizationStrategy(proto.Enum): + r"""Indicates the optimization strategy used for training.""" + OPTIMIZATION_STRATEGY_UNSPECIFIED = 0 + BATCH_GRADIENT_DESCENT = 1 + NORMAL_EQUATION = 2 + + class FeedbackType(proto.Enum): + r"""Indicates the training algorithm to use for matrix + factorization models. 
+ """ + FEEDBACK_TYPE_UNSPECIFIED = 0 + IMPLICIT = 1 + EXPLICIT = 2 + + class SeasonalPeriod(proto.Message): + r""" """ + + class SeasonalPeriodType(proto.Enum): + r"""""" + SEASONAL_PERIOD_TYPE_UNSPECIFIED = 0 + NO_SEASONALITY = 1 + DAILY = 2 + WEEKLY = 3 + MONTHLY = 4 + QUARTERLY = 5 + YEARLY = 6 + + class KmeansEnums(proto.Message): + r""" """ + + class KmeansInitializationMethod(proto.Enum): + r"""Indicates the method used to initialize the centroids for + KMeans clustering algorithm. + """ + KMEANS_INITIALIZATION_METHOD_UNSPECIFIED = 0 + RANDOM = 1 + CUSTOM = 2 + KMEANS_PLUS_PLUS = 3 + + class RegressionMetrics(proto.Message): + r"""Evaluation metrics for regression and explicit feedback type + matrix factorization models. + + Attributes: + mean_absolute_error (google.protobuf.wrappers_pb2.DoubleValue): + Mean absolute error. + mean_squared_error (google.protobuf.wrappers_pb2.DoubleValue): + Mean squared error. + mean_squared_log_error (google.protobuf.wrappers_pb2.DoubleValue): + Mean squared log error. + median_absolute_error (google.protobuf.wrappers_pb2.DoubleValue): + Median absolute error. + r_squared (google.protobuf.wrappers_pb2.DoubleValue): + R^2 score. This corresponds to r2_score in ML.EVALUATE. + """ + + mean_absolute_error = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.DoubleValue, + ) + mean_squared_error = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.DoubleValue, + ) + mean_squared_log_error = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.DoubleValue, + ) + median_absolute_error = proto.Field( + proto.MESSAGE, number=4, message=wrappers_pb2.DoubleValue, + ) + r_squared = proto.Field( + proto.MESSAGE, number=5, message=wrappers_pb2.DoubleValue, + ) + + class AggregateClassificationMetrics(proto.Message): + r"""Aggregate metrics for classification/classifier models. For + multi-class models, the metrics are either macro-averaged or + micro-averaged. 
When macro-averaged, the metrics are calculated + for each label and then an unweighted average is taken of those + values. When micro-averaged, the metric is calculated globally + by counting the total number of correctly predicted rows. + + Attributes: + precision (google.protobuf.wrappers_pb2.DoubleValue): + Precision is the fraction of actual positive + predictions that had positive actual labels. For + multiclass this is a macro-averaged metric + treating each class as a binary classifier. + recall (google.protobuf.wrappers_pb2.DoubleValue): + Recall is the fraction of actual positive + labels that were given a positive prediction. + For multiclass this is a macro-averaged metric. + accuracy (google.protobuf.wrappers_pb2.DoubleValue): + Accuracy is the fraction of predictions given + the correct label. For multiclass this is a + micro-averaged metric. + threshold (google.protobuf.wrappers_pb2.DoubleValue): + Threshold at which the metrics are computed. + For binary classification models this is the + positive class threshold. For multi-class + classification models this is the confidence + threshold. + f1_score (google.protobuf.wrappers_pb2.DoubleValue): + The F1 score is an average of recall and + precision. For multiclass this is a macro- + averaged metric. + log_loss (google.protobuf.wrappers_pb2.DoubleValue): + Logarithmic Loss. For multiclass this is a + macro-averaged metric. + roc_auc (google.protobuf.wrappers_pb2.DoubleValue): + Area Under a ROC Curve. For multiclass this + is a macro-averaged metric.
+ """ + + precision = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.DoubleValue, + ) + recall = proto.Field(proto.MESSAGE, number=2, message=wrappers_pb2.DoubleValue,) + accuracy = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.DoubleValue, + ) + threshold = proto.Field( + proto.MESSAGE, number=4, message=wrappers_pb2.DoubleValue, + ) + f1_score = proto.Field( + proto.MESSAGE, number=5, message=wrappers_pb2.DoubleValue, + ) + log_loss = proto.Field( + proto.MESSAGE, number=6, message=wrappers_pb2.DoubleValue, + ) + roc_auc = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.DoubleValue, + ) + + class BinaryClassificationMetrics(proto.Message): + r"""Evaluation metrics for binary classification/classifier + models. + + Attributes: + aggregate_classification_metrics (google.cloud.bigquery_v2.types.Model.AggregateClassificationMetrics): + Aggregate classification metrics. + binary_confusion_matrix_list (Sequence[google.cloud.bigquery_v2.types.Model.BinaryClassificationMetrics.BinaryConfusionMatrix]): + Binary confusion matrix at multiple + thresholds. + positive_label (str): + Label representing the positive class. + negative_label (str): + Label representing the negative class. + """ + + class BinaryConfusionMatrix(proto.Message): + r"""Confusion matrix for binary classification models. + Attributes: + positive_class_threshold (google.protobuf.wrappers_pb2.DoubleValue): + Threshold value used when computing each of + the following metrics. + true_positives (google.protobuf.wrappers_pb2.Int64Value): + Number of true samples predicted as true. + false_positives (google.protobuf.wrappers_pb2.Int64Value): + Number of false samples predicted as true. + true_negatives (google.protobuf.wrappers_pb2.Int64Value): + Number of false samples predicted as false. + false_negatives (google.protobuf.wrappers_pb2.Int64Value): + Number of true samples predicted as false.
+ precision (google.protobuf.wrappers_pb2.DoubleValue): + The fraction of actual positive predictions + that had positive actual labels. + recall (google.protobuf.wrappers_pb2.DoubleValue): + The fraction of actual positive labels that + were given a positive prediction. + f1_score (google.protobuf.wrappers_pb2.DoubleValue): + The equally weighted average of recall and + precision. + accuracy (google.protobuf.wrappers_pb2.DoubleValue): + The fraction of predictions given the correct + label. + """ + + positive_class_threshold = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.DoubleValue, + ) + true_positives = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.Int64Value, + ) + false_positives = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.Int64Value, + ) + true_negatives = proto.Field( + proto.MESSAGE, number=4, message=wrappers_pb2.Int64Value, + ) + false_negatives = proto.Field( + proto.MESSAGE, number=5, message=wrappers_pb2.Int64Value, + ) + precision = proto.Field( + proto.MESSAGE, number=6, message=wrappers_pb2.DoubleValue, + ) + recall = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.DoubleValue, + ) + f1_score = proto.Field( + proto.MESSAGE, number=8, message=wrappers_pb2.DoubleValue, + ) + accuracy = proto.Field( + proto.MESSAGE, number=9, message=wrappers_pb2.DoubleValue, + ) + + aggregate_classification_metrics = proto.Field( + proto.MESSAGE, number=1, message="Model.AggregateClassificationMetrics", + ) + binary_confusion_matrix_list = proto.RepeatedField( + proto.MESSAGE, + number=2, + message="Model.BinaryClassificationMetrics.BinaryConfusionMatrix", + ) + positive_label = proto.Field(proto.STRING, number=3,) + negative_label = proto.Field(proto.STRING, number=4,) + + class MultiClassClassificationMetrics(proto.Message): + r"""Evaluation metrics for multi-class classification/classifier + models. 
+ + Attributes: + aggregate_classification_metrics (google.cloud.bigquery_v2.types.Model.AggregateClassificationMetrics): + Aggregate classification metrics. + confusion_matrix_list (Sequence[google.cloud.bigquery_v2.types.Model.MultiClassClassificationMetrics.ConfusionMatrix]): + Confusion matrix at different thresholds. + """ + + class ConfusionMatrix(proto.Message): + r"""Confusion matrix for multi-class classification models. + Attributes: + confidence_threshold (google.protobuf.wrappers_pb2.DoubleValue): + Confidence threshold used when computing the + entries of the confusion matrix. + rows (Sequence[google.cloud.bigquery_v2.types.Model.MultiClassClassificationMetrics.ConfusionMatrix.Row]): + One row per actual label. + """ + + class Entry(proto.Message): + r"""A single entry in the confusion matrix. + Attributes: + predicted_label (str): + The predicted label. For confidence_threshold > 0, we will + also add an entry indicating the number of items under the + confidence threshold. + item_count (google.protobuf.wrappers_pb2.Int64Value): + Number of items being predicted as this + label. + """ + + predicted_label = proto.Field(proto.STRING, number=1,) + item_count = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.Int64Value, + ) + + class Row(proto.Message): + r"""A single row in the confusion matrix. + Attributes: + actual_label (str): + The original label of this row. + entries (Sequence[google.cloud.bigquery_v2.types.Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry]): + Info describing predicted label distribution. 
+ """ + + actual_label = proto.Field(proto.STRING, number=1,) + entries = proto.RepeatedField( + proto.MESSAGE, + number=2, + message="Model.MultiClassClassificationMetrics.ConfusionMatrix.Entry", + ) + + confidence_threshold = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.DoubleValue, + ) + rows = proto.RepeatedField( + proto.MESSAGE, + number=2, + message="Model.MultiClassClassificationMetrics.ConfusionMatrix.Row", + ) + + aggregate_classification_metrics = proto.Field( + proto.MESSAGE, number=1, message="Model.AggregateClassificationMetrics", + ) + confusion_matrix_list = proto.RepeatedField( + proto.MESSAGE, + number=2, + message="Model.MultiClassClassificationMetrics.ConfusionMatrix", + ) + + class ClusteringMetrics(proto.Message): + r"""Evaluation metrics for clustering models. + Attributes: + davies_bouldin_index (google.protobuf.wrappers_pb2.DoubleValue): + Davies-Bouldin index. + mean_squared_distance (google.protobuf.wrappers_pb2.DoubleValue): + Mean of squared distances between each sample + to its cluster centroid. + clusters (Sequence[google.cloud.bigquery_v2.types.Model.ClusteringMetrics.Cluster]): + Information for all clusters. + """ + + class Cluster(proto.Message): + r"""Message containing the information about one cluster. + Attributes: + centroid_id (int): + Centroid id. + feature_values (Sequence[google.cloud.bigquery_v2.types.Model.ClusteringMetrics.Cluster.FeatureValue]): + Values of highly variant features for this + cluster. + count (google.protobuf.wrappers_pb2.Int64Value): + Count of training data rows that were + assigned to this cluster. + """ + + class FeatureValue(proto.Message): + r"""Representative value of a single feature within the cluster. + Attributes: + feature_column (str): + The feature column name. + numerical_value (google.protobuf.wrappers_pb2.DoubleValue): + The numerical feature value. This is the + centroid value for this feature. 
+ categorical_value (google.cloud.bigquery_v2.types.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue): + The categorical feature value. + """ + + class CategoricalValue(proto.Message): + r"""Representative value of a categorical feature. + Attributes: + category_counts (Sequence[google.cloud.bigquery_v2.types.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount]): + Counts of all categories for the categorical feature. If + there are more than ten categories, we return top ten (by + count) and return one more CategoryCount with category + "*OTHER*" and count as aggregate counts of remaining + categories. + """ + + class CategoryCount(proto.Message): + r"""Represents the count of a single category within the cluster. + Attributes: + category (str): + The name of category. + count (google.protobuf.wrappers_pb2.Int64Value): + The count of training samples matching the + category within the cluster. + """ + + category = proto.Field(proto.STRING, number=1,) + count = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.Int64Value, + ) + + category_counts = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount", + ) + + feature_column = proto.Field(proto.STRING, number=1,) + numerical_value = proto.Field( + proto.MESSAGE, + number=2, + oneof="value", + message=wrappers_pb2.DoubleValue, + ) + categorical_value = proto.Field( + proto.MESSAGE, + number=3, + oneof="value", + message="Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue", + ) + + centroid_id = proto.Field(proto.INT64, number=1,) + feature_values = proto.RepeatedField( + proto.MESSAGE, + number=2, + message="Model.ClusteringMetrics.Cluster.FeatureValue", + ) + count = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.Int64Value, + ) + + davies_bouldin_index = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.DoubleValue, + ) + mean_squared_distance = 
proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.DoubleValue, + ) + clusters = proto.RepeatedField( + proto.MESSAGE, number=3, message="Model.ClusteringMetrics.Cluster", + ) + + class RankingMetrics(proto.Message): + r"""Evaluation metrics used by weighted-ALS models specified by + feedback_type=implicit. + + Attributes: + mean_average_precision (google.protobuf.wrappers_pb2.DoubleValue): + Calculates a precision per user for all the + items by ranking them and then averages all the + precisions across all the users. + mean_squared_error (google.protobuf.wrappers_pb2.DoubleValue): + Similar to the mean squared error computed in + regression and explicit recommendation models + except instead of computing the rating directly, + the output from evaluate is computed against a + preference which is 1 or 0 depending on if the + rating exists or not. + normalized_discounted_cumulative_gain (google.protobuf.wrappers_pb2.DoubleValue): + A metric to determine the goodness of a + ranking calculated from the predicted confidence + by comparing it to an ideal rank measured by the + original ratings. + average_rank (google.protobuf.wrappers_pb2.DoubleValue): + Determines the goodness of a ranking by + computing the percentile rank from the predicted + confidence and dividing it by the original rank. + """ + + mean_average_precision = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.DoubleValue, + ) + mean_squared_error = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.DoubleValue, + ) + normalized_discounted_cumulative_gain = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.DoubleValue, + ) + average_rank = proto.Field( + proto.MESSAGE, number=4, message=wrappers_pb2.DoubleValue, + ) + + class ArimaForecastingMetrics(proto.Message): + r"""Model evaluation metrics for ARIMA forecasting models. + Attributes: + non_seasonal_order (Sequence[google.cloud.bigquery_v2.types.Model.ArimaOrder]): + Non-seasonal order. 
+ arima_fitting_metrics (Sequence[google.cloud.bigquery_v2.types.Model.ArimaFittingMetrics]): + Arima model fitting metrics. + seasonal_periods (Sequence[google.cloud.bigquery_v2.types.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + has_drift (Sequence[bool]): + Whether Arima model fitted with drift or not. + It is always false when d is not 1. + time_series_id (Sequence[str]): + Id to differentiate different time series for + the large-scale case. + arima_single_model_forecasting_metrics (Sequence[google.cloud.bigquery_v2.types.Model.ArimaForecastingMetrics.ArimaSingleModelForecastingMetrics]): + Repeated as there can be many metric sets + (one for each model) in auto-arima and the + large-scale case. + """ + + class ArimaSingleModelForecastingMetrics(proto.Message): + r"""Model evaluation metrics for a single ARIMA forecasting + model. + + Attributes: + non_seasonal_order (google.cloud.bigquery_v2.types.Model.ArimaOrder): + Non-seasonal order. + arima_fitting_metrics (google.cloud.bigquery_v2.types.Model.ArimaFittingMetrics): + Arima fitting metrics. + has_drift (bool): + Is arima model fitted with drift or not. It + is always false when d is not 1. + time_series_id (str): + The time_series_id value for this time series. It will be + one of the unique values from the time_series_id_column + specified during ARIMA model training. Only present when + time_series_id_column training option was used. + time_series_ids (Sequence[str]): + The tuple of time_series_ids identifying this time series. + It will be one of the unique tuples of values present in the + time_series_id_columns specified during ARIMA model + training. Only present when time_series_id_columns training + option was used and the order of values here are same as the + order of time_series_id_columns. 
+ seasonal_periods (Sequence[google.cloud.bigquery_v2.types.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + has_holiday_effect (google.protobuf.wrappers_pb2.BoolValue): + If true, holiday_effect is a part of time series + decomposition result. + has_spikes_and_dips (google.protobuf.wrappers_pb2.BoolValue): + If true, spikes_and_dips is a part of time series + decomposition result. + has_step_changes (google.protobuf.wrappers_pb2.BoolValue): + If true, step_changes is a part of time series decomposition + result. + """ + + non_seasonal_order = proto.Field( + proto.MESSAGE, number=1, message="Model.ArimaOrder", + ) + arima_fitting_metrics = proto.Field( + proto.MESSAGE, number=2, message="Model.ArimaFittingMetrics", + ) + has_drift = proto.Field(proto.BOOL, number=3,) + time_series_id = proto.Field(proto.STRING, number=4,) + time_series_ids = proto.RepeatedField(proto.STRING, number=9,) + seasonal_periods = proto.RepeatedField( + proto.ENUM, number=5, enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + has_holiday_effect = proto.Field( + proto.MESSAGE, number=6, message=wrappers_pb2.BoolValue, + ) + has_spikes_and_dips = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.BoolValue, + ) + has_step_changes = proto.Field( + proto.MESSAGE, number=8, message=wrappers_pb2.BoolValue, + ) + + non_seasonal_order = proto.RepeatedField( + proto.MESSAGE, number=1, message="Model.ArimaOrder", + ) + arima_fitting_metrics = proto.RepeatedField( + proto.MESSAGE, number=2, message="Model.ArimaFittingMetrics", + ) + seasonal_periods = proto.RepeatedField( + proto.ENUM, number=3, enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + has_drift = proto.RepeatedField(proto.BOOL, number=4,) + time_series_id = proto.RepeatedField(proto.STRING, number=5,) + arima_single_model_forecasting_metrics = proto.RepeatedField( + proto.MESSAGE, + number=6, + 
message="Model.ArimaForecastingMetrics.ArimaSingleModelForecastingMetrics", + ) + + class EvaluationMetrics(proto.Message): + r"""Evaluation metrics of a model. These are either computed on + all training data or just the eval data based on whether eval + data was used during training. These are not present for + imported models. + + Attributes: + regression_metrics (google.cloud.bigquery_v2.types.Model.RegressionMetrics): + Populated for regression models and explicit + feedback type matrix factorization models. + binary_classification_metrics (google.cloud.bigquery_v2.types.Model.BinaryClassificationMetrics): + Populated for binary + classification/classifier models. + multi_class_classification_metrics (google.cloud.bigquery_v2.types.Model.MultiClassClassificationMetrics): + Populated for multi-class + classification/classifier models. + clustering_metrics (google.cloud.bigquery_v2.types.Model.ClusteringMetrics): + Populated for clustering models. + ranking_metrics (google.cloud.bigquery_v2.types.Model.RankingMetrics): + Populated for implicit feedback type matrix + factorization models. + arima_forecasting_metrics (google.cloud.bigquery_v2.types.Model.ArimaForecastingMetrics): + Populated for ARIMA models. 
+ """ + + regression_metrics = proto.Field( + proto.MESSAGE, number=1, oneof="metrics", message="Model.RegressionMetrics", + ) + binary_classification_metrics = proto.Field( + proto.MESSAGE, + number=2, + oneof="metrics", + message="Model.BinaryClassificationMetrics", + ) + multi_class_classification_metrics = proto.Field( + proto.MESSAGE, + number=3, + oneof="metrics", + message="Model.MultiClassClassificationMetrics", + ) + clustering_metrics = proto.Field( + proto.MESSAGE, number=4, oneof="metrics", message="Model.ClusteringMetrics", + ) + ranking_metrics = proto.Field( + proto.MESSAGE, number=5, oneof="metrics", message="Model.RankingMetrics", + ) + arima_forecasting_metrics = proto.Field( + proto.MESSAGE, + number=6, + oneof="metrics", + message="Model.ArimaForecastingMetrics", + ) + + class DataSplitResult(proto.Message): + r"""Data split result. This contains references to the training + and evaluation data tables that were used to train the model. + + Attributes: + training_table (google.cloud.bigquery_v2.types.TableReference): + Table reference of the training data after + split. + evaluation_table (google.cloud.bigquery_v2.types.TableReference): + Table reference of the evaluation data after + split. + """ + + training_table = proto.Field( + proto.MESSAGE, number=1, message=table_reference.TableReference, + ) + evaluation_table = proto.Field( + proto.MESSAGE, number=2, message=table_reference.TableReference, + ) + + class ArimaOrder(proto.Message): + r"""Arima order, can be used for both non-seasonal and seasonal + parts. + + Attributes: + p (int): + Order of the autoregressive part. + d (int): + Order of the differencing part. + q (int): + Order of the moving-average part. + """ + + p = proto.Field(proto.INT64, number=1,) + d = proto.Field(proto.INT64, number=2,) + q = proto.Field(proto.INT64, number=3,) + + class ArimaFittingMetrics(proto.Message): + r"""ARIMA model fitting metrics. + Attributes: + log_likelihood (float): + Log-likelihood. 
+ aic (float): + AIC. + variance (float): + Variance. + """ + + log_likelihood = proto.Field(proto.DOUBLE, number=1,) + aic = proto.Field(proto.DOUBLE, number=2,) + variance = proto.Field(proto.DOUBLE, number=3,) + + class GlobalExplanation(proto.Message): + r"""Global explanations containing the top most important + features after training. + + Attributes: + explanations (Sequence[google.cloud.bigquery_v2.types.Model.GlobalExplanation.Explanation]): + A list of the top global explanations. Sorted + by absolute value of attribution in descending + order. + class_label (str): + Class label for this set of global + explanations. Will be empty/null for binary + logistic and linear regression models. Sorted + alphabetically in descending order. + """ + + class Explanation(proto.Message): + r"""Explanation for a single feature. + Attributes: + feature_name (str): + Full name of the feature. For non-numerical features, will + be formatted like <column_name>.<encoded_feature_name>. + Overall size of feature name will always be truncated to + first 120 characters. + attribution (google.protobuf.wrappers_pb2.DoubleValue): + Attribution of feature. + """ + + feature_name = proto.Field(proto.STRING, number=1,) + attribution = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.DoubleValue, + ) + + explanations = proto.RepeatedField( + proto.MESSAGE, number=1, message="Model.GlobalExplanation.Explanation", + ) + class_label = proto.Field(proto.STRING, number=2,) + + class TrainingRun(proto.Message): + r"""Information about a single training query run for the model. + Attributes: + training_options (google.cloud.bigquery_v2.types.Model.TrainingRun.TrainingOptions): + Options that were used for this training run, + includes user specified and default options that + were used. + start_time (google.protobuf.timestamp_pb2.Timestamp): + The start time of this training run.
+ results (Sequence[google.cloud.bigquery_v2.types.Model.TrainingRun.IterationResult]): + Output of each iteration run, results.size() <= + max_iterations. + evaluation_metrics (google.cloud.bigquery_v2.types.Model.EvaluationMetrics): + The evaluation metrics over training/eval + data that were computed at the end of training. + data_split_result (google.cloud.bigquery_v2.types.Model.DataSplitResult): + Data split result of the training run. Only + set when the input data is actually split. + global_explanations (Sequence[google.cloud.bigquery_v2.types.Model.GlobalExplanation]): + Global explanations for important features of + the model. For multi-class models, there is one + entry for each label class. For other models, + there is only one entry in the list. + """ + + class TrainingOptions(proto.Message): + r"""Options used in model training. + Attributes: + max_iterations (int): + The maximum number of iterations in training. + Used only for iterative training algorithms. + loss_type (google.cloud.bigquery_v2.types.Model.LossType): + Type of loss function used during training + run. + learn_rate (float): + Learning rate in training. Used only for + iterative training algorithms. + l1_regularization (google.protobuf.wrappers_pb2.DoubleValue): + L1 regularization coefficient. + l2_regularization (google.protobuf.wrappers_pb2.DoubleValue): + L2 regularization coefficient. + min_relative_progress (google.protobuf.wrappers_pb2.DoubleValue): + When early_stop is true, stops training when accuracy + improvement is less than 'min_relative_progress'. Used only + for iterative training algorithms. + warm_start (google.protobuf.wrappers_pb2.BoolValue): + Whether to train a model from the last + checkpoint. + early_stop (google.protobuf.wrappers_pb2.BoolValue): + Whether to stop early when the loss doesn't improve + significantly any more (compared to min_relative_progress). + Used only for iterative training algorithms. 
+ input_label_columns (Sequence[str]): + Name of input label columns in training data. + data_split_method (google.cloud.bigquery_v2.types.Model.DataSplitMethod): + The data split type for training and + evaluation, e.g. RANDOM. + data_split_eval_fraction (float): + The fraction of evaluation data over the + whole input data. The rest of data will be used + as training data. The format should be double. + Accurate to two decimal places. + Default value is 0.2. + data_split_column (str): + The column to split data with. This column won't be used as + a feature. + + 1. When data_split_method is CUSTOM, the corresponding + column should be boolean. The rows with true value tag + are eval data, and the false are training data. + 2. When data_split_method is SEQ, the first + DATA_SPLIT_EVAL_FRACTION rows (from smallest to largest) + in the corresponding column are used as training data, + and the rest are eval data. It respects the order in + Orderable data types: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data-type-properties + learn_rate_strategy (google.cloud.bigquery_v2.types.Model.LearnRateStrategy): + The strategy to determine learn rate for the + current iteration. + initial_learn_rate (float): + Specifies the initial learning rate for the + line search learn rate strategy. + label_class_weights (Sequence[google.cloud.bigquery_v2.types.Model.TrainingRun.TrainingOptions.LabelClassWeightsEntry]): + Weights associated with each label class, for + rebalancing the training data. Only applicable + for classification models. + user_column (str): + User column specified for matrix + factorization models. + item_column (str): + Item column specified for matrix + factorization models. + distance_type (google.cloud.bigquery_v2.types.Model.DistanceType): + Distance type for clustering models. + num_clusters (int): + Number of clusters for clustering models. + model_uri (str): + Google Cloud Storage URI from which the model + was imported. 
Only applicable for imported + models. + optimization_strategy (google.cloud.bigquery_v2.types.Model.OptimizationStrategy): + Optimization strategy for training linear + regression models. + hidden_units (Sequence[int]): + Hidden units for dnn models. + batch_size (int): + Batch size for dnn models. + dropout (google.protobuf.wrappers_pb2.DoubleValue): + Dropout probability for dnn models. + max_tree_depth (int): + Maximum depth of a tree for boosted tree + models. + subsample (float): + Subsample fraction of the training data to + grow tree to prevent overfitting for boosted + tree models. + min_split_loss (google.protobuf.wrappers_pb2.DoubleValue): + Minimum split loss for boosted tree models. + num_factors (int): + Num factors specified for matrix + factorization models. + feedback_type (google.cloud.bigquery_v2.types.Model.FeedbackType): + Feedback type that specifies which algorithm + to run for matrix factorization. + wals_alpha (google.protobuf.wrappers_pb2.DoubleValue): + Hyperparameter for matrix factorization when + implicit feedback type is specified. + kmeans_initialization_method (google.cloud.bigquery_v2.types.Model.KmeansEnums.KmeansInitializationMethod): + The method used to initialize the centroids + for kmeans algorithm. + kmeans_initialization_column (str): + The column used to provide the initial centroids for kmeans + algorithm when kmeans_initialization_method is CUSTOM. + time_series_timestamp_column (str): + Column to be designated as time series + timestamp for ARIMA model. + time_series_data_column (str): + Column to be designated as time series data + for ARIMA model. + auto_arima (bool): + Whether to enable auto ARIMA or not. + non_seasonal_order (google.cloud.bigquery_v2.types.Model.ArimaOrder): + A specification of the non-seasonal part of + the ARIMA model: the three components (p, d, q) + are the AR order, the degree of differencing, + and the MA order.
+ data_frequency (google.cloud.bigquery_v2.types.Model.DataFrequency): + The data frequency of a time series. + include_drift (bool): + Include drift when fitting an ARIMA model. + holiday_region (google.cloud.bigquery_v2.types.Model.HolidayRegion): + The geographical region based on which the + holidays are considered in time series modeling. + If a valid value is specified, then holiday + effects modeling is enabled. + time_series_id_column (str): + The time series id column that was used + during ARIMA model training. + time_series_id_columns (Sequence[str]): + The time series id columns that were used + during ARIMA model training. + horizon (int): + The number of periods ahead that need to be + forecasted. + preserve_input_structs (bool): + Whether to preserve the input structs in output feature + names. Suppose there is a struct A with field b. When false + (default), the output feature name is A_b. When true, the + output feature name is A.b. + auto_arima_max_order (int): + The max value of non-seasonal p and q. + decompose_time_series (google.protobuf.wrappers_pb2.BoolValue): + If true, perform decompose time series and + save the results. + clean_spikes_and_dips (google.protobuf.wrappers_pb2.BoolValue): + If true, clean spikes and dips in the input + time series. + adjust_step_changes (google.protobuf.wrappers_pb2.BoolValue): + If true, detect step changes and make data + adjustment in the input time series. 
+ """ + + max_iterations = proto.Field(proto.INT64, number=1,) + loss_type = proto.Field(proto.ENUM, number=2, enum="Model.LossType",) + learn_rate = proto.Field(proto.DOUBLE, number=3,) + l1_regularization = proto.Field( + proto.MESSAGE, number=4, message=wrappers_pb2.DoubleValue, + ) + l2_regularization = proto.Field( + proto.MESSAGE, number=5, message=wrappers_pb2.DoubleValue, + ) + min_relative_progress = proto.Field( + proto.MESSAGE, number=6, message=wrappers_pb2.DoubleValue, + ) + warm_start = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.BoolValue, + ) + early_stop = proto.Field( + proto.MESSAGE, number=8, message=wrappers_pb2.BoolValue, + ) + input_label_columns = proto.RepeatedField(proto.STRING, number=9,) + data_split_method = proto.Field( + proto.ENUM, number=10, enum="Model.DataSplitMethod", + ) + data_split_eval_fraction = proto.Field(proto.DOUBLE, number=11,) + data_split_column = proto.Field(proto.STRING, number=12,) + learn_rate_strategy = proto.Field( + proto.ENUM, number=13, enum="Model.LearnRateStrategy", + ) + initial_learn_rate = proto.Field(proto.DOUBLE, number=16,) + label_class_weights = proto.MapField(proto.STRING, proto.DOUBLE, number=17,) + user_column = proto.Field(proto.STRING, number=18,) + item_column = proto.Field(proto.STRING, number=19,) + distance_type = proto.Field( + proto.ENUM, number=20, enum="Model.DistanceType", + ) + num_clusters = proto.Field(proto.INT64, number=21,) + model_uri = proto.Field(proto.STRING, number=22,) + optimization_strategy = proto.Field( + proto.ENUM, number=23, enum="Model.OptimizationStrategy", + ) + hidden_units = proto.RepeatedField(proto.INT64, number=24,) + batch_size = proto.Field(proto.INT64, number=25,) + dropout = proto.Field( + proto.MESSAGE, number=26, message=wrappers_pb2.DoubleValue, + ) + max_tree_depth = proto.Field(proto.INT64, number=27,) + subsample = proto.Field(proto.DOUBLE, number=28,) + min_split_loss = proto.Field( + proto.MESSAGE, number=29, 
message=wrappers_pb2.DoubleValue, + ) + num_factors = proto.Field(proto.INT64, number=30,) + feedback_type = proto.Field( + proto.ENUM, number=31, enum="Model.FeedbackType", + ) + wals_alpha = proto.Field( + proto.MESSAGE, number=32, message=wrappers_pb2.DoubleValue, + ) + kmeans_initialization_method = proto.Field( + proto.ENUM, + number=33, + enum="Model.KmeansEnums.KmeansInitializationMethod", + ) + kmeans_initialization_column = proto.Field(proto.STRING, number=34,) + time_series_timestamp_column = proto.Field(proto.STRING, number=35,) + time_series_data_column = proto.Field(proto.STRING, number=36,) + auto_arima = proto.Field(proto.BOOL, number=37,) + non_seasonal_order = proto.Field( + proto.MESSAGE, number=38, message="Model.ArimaOrder", + ) + data_frequency = proto.Field( + proto.ENUM, number=39, enum="Model.DataFrequency", + ) + include_drift = proto.Field(proto.BOOL, number=41,) + holiday_region = proto.Field( + proto.ENUM, number=42, enum="Model.HolidayRegion", + ) + time_series_id_column = proto.Field(proto.STRING, number=43,) + time_series_id_columns = proto.RepeatedField(proto.STRING, number=51,) + horizon = proto.Field(proto.INT64, number=44,) + preserve_input_structs = proto.Field(proto.BOOL, number=45,) + auto_arima_max_order = proto.Field(proto.INT64, number=46,) + decompose_time_series = proto.Field( + proto.MESSAGE, number=50, message=wrappers_pb2.BoolValue, + ) + clean_spikes_and_dips = proto.Field( + proto.MESSAGE, number=52, message=wrappers_pb2.BoolValue, + ) + adjust_step_changes = proto.Field( + proto.MESSAGE, number=53, message=wrappers_pb2.BoolValue, + ) + + class IterationResult(proto.Message): + r"""Information about a single iteration of the training run. + Attributes: + index (google.protobuf.wrappers_pb2.Int32Value): + Index of the iteration, 0 based. + duration_ms (google.protobuf.wrappers_pb2.Int64Value): + Time taken to run the iteration in + milliseconds. 
+ training_loss (google.protobuf.wrappers_pb2.DoubleValue): + Loss computed on the training data at the end + of iteration. + eval_loss (google.protobuf.wrappers_pb2.DoubleValue): + Loss computed on the eval data at the end of + iteration. + learn_rate (float): + Learn rate used for this iteration. + cluster_infos (Sequence[google.cloud.bigquery_v2.types.Model.TrainingRun.IterationResult.ClusterInfo]): + Information about top clusters for clustering + models. + arima_result (google.cloud.bigquery_v2.types.Model.TrainingRun.IterationResult.ArimaResult): + + """ + + class ClusterInfo(proto.Message): + r"""Information about a single cluster for clustering model. + Attributes: + centroid_id (int): + Centroid id. + cluster_radius (google.protobuf.wrappers_pb2.DoubleValue): + Cluster radius, the average distance from + centroid to each point assigned to the cluster. + cluster_size (google.protobuf.wrappers_pb2.Int64Value): + Cluster size, the total number of points + assigned to the cluster. + """ + + centroid_id = proto.Field(proto.INT64, number=1,) + cluster_radius = proto.Field( + proto.MESSAGE, number=2, message=wrappers_pb2.DoubleValue, + ) + cluster_size = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.Int64Value, + ) + + class ArimaResult(proto.Message): + r"""(Auto-)arima fitting result. Wrap everything in ArimaResult + for easier refactoring if we want to use model-specific + iteration results. + + Attributes: + arima_model_info (Sequence[google.cloud.bigquery_v2.types.Model.TrainingRun.IterationResult.ArimaResult.ArimaModelInfo]): + This message is repeated because there are + multiple arima models fitted in auto-arima. For + non-auto-arima model, its size is one. + seasonal_periods (Sequence[google.cloud.bigquery_v2.types.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + """ + + class ArimaCoefficients(proto.Message): + r"""Arima coefficients. 
+ Attributes: + auto_regressive_coefficients (Sequence[float]): + Auto-regressive coefficients, an array of + double. + moving_average_coefficients (Sequence[float]): + Moving-average coefficients, an array of + double. + intercept_coefficient (float): + Intercept coefficient, just a double not an + array. + """ + + auto_regressive_coefficients = proto.RepeatedField( + proto.DOUBLE, number=1, + ) + moving_average_coefficients = proto.RepeatedField( + proto.DOUBLE, number=2, + ) + intercept_coefficient = proto.Field(proto.DOUBLE, number=3,) + + class ArimaModelInfo(proto.Message): + r"""Arima model information. + Attributes: + non_seasonal_order (google.cloud.bigquery_v2.types.Model.ArimaOrder): + Non-seasonal order. + arima_coefficients (google.cloud.bigquery_v2.types.Model.TrainingRun.IterationResult.ArimaResult.ArimaCoefficients): + Arima coefficients. + arima_fitting_metrics (google.cloud.bigquery_v2.types.Model.ArimaFittingMetrics): + Arima fitting metrics. + has_drift (bool): + Whether Arima model fitted with drift or not. + It is always false when d is not 1. + time_series_id (str): + The time_series_id value for this time series. It will be + one of the unique values from the time_series_id_column + specified during ARIMA model training. Only present when + time_series_id_column training option was used. + time_series_ids (Sequence[str]): + The tuple of time_series_ids identifying this time series. + It will be one of the unique tuples of values present in the + time_series_id_columns specified during ARIMA model + training. Only present when time_series_id_columns training + option was used and the order of values here are same as the + order of time_series_id_columns. + seasonal_periods (Sequence[google.cloud.bigquery_v2.types.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. 
+ has_holiday_effect (google.protobuf.wrappers_pb2.BoolValue): + If true, holiday_effect is a part of time series + decomposition result. + has_spikes_and_dips (google.protobuf.wrappers_pb2.BoolValue): + If true, spikes_and_dips is a part of time series + decomposition result. + has_step_changes (google.protobuf.wrappers_pb2.BoolValue): + If true, step_changes is a part of time series decomposition + result. + """ + + non_seasonal_order = proto.Field( + proto.MESSAGE, number=1, message="Model.ArimaOrder", + ) + arima_coefficients = proto.Field( + proto.MESSAGE, + number=2, + message="Model.TrainingRun.IterationResult.ArimaResult.ArimaCoefficients", + ) + arima_fitting_metrics = proto.Field( + proto.MESSAGE, number=3, message="Model.ArimaFittingMetrics", + ) + has_drift = proto.Field(proto.BOOL, number=4,) + time_series_id = proto.Field(proto.STRING, number=5,) + time_series_ids = proto.RepeatedField(proto.STRING, number=10,) + seasonal_periods = proto.RepeatedField( + proto.ENUM, + number=6, + enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + has_holiday_effect = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.BoolValue, + ) + has_spikes_and_dips = proto.Field( + proto.MESSAGE, number=8, message=wrappers_pb2.BoolValue, + ) + has_step_changes = proto.Field( + proto.MESSAGE, number=9, message=wrappers_pb2.BoolValue, + ) + + arima_model_info = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Model.TrainingRun.IterationResult.ArimaResult.ArimaModelInfo", + ) + seasonal_periods = proto.RepeatedField( + proto.ENUM, + number=2, + enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + + index = proto.Field( + proto.MESSAGE, number=1, message=wrappers_pb2.Int32Value, + ) + duration_ms = proto.Field( + proto.MESSAGE, number=4, message=wrappers_pb2.Int64Value, + ) + training_loss = proto.Field( + proto.MESSAGE, number=5, message=wrappers_pb2.DoubleValue, + ) + eval_loss = proto.Field( + proto.MESSAGE, number=6, message=wrappers_pb2.DoubleValue, + ) 
+ learn_rate = proto.Field(proto.DOUBLE, number=7,) + cluster_infos = proto.RepeatedField( + proto.MESSAGE, + number=8, + message="Model.TrainingRun.IterationResult.ClusterInfo", + ) + arima_result = proto.Field( + proto.MESSAGE, + number=9, + message="Model.TrainingRun.IterationResult.ArimaResult", + ) + + training_options = proto.Field( + proto.MESSAGE, number=1, message="Model.TrainingRun.TrainingOptions", + ) + start_time = proto.Field( + proto.MESSAGE, number=8, message=timestamp_pb2.Timestamp, + ) + results = proto.RepeatedField( + proto.MESSAGE, number=6, message="Model.TrainingRun.IterationResult", + ) + evaluation_metrics = proto.Field( + proto.MESSAGE, number=7, message="Model.EvaluationMetrics", + ) + data_split_result = proto.Field( + proto.MESSAGE, number=9, message="Model.DataSplitResult", + ) + global_explanations = proto.RepeatedField( + proto.MESSAGE, number=10, message="Model.GlobalExplanation", + ) + + etag = proto.Field(proto.STRING, number=1,) + model_reference = proto.Field( + proto.MESSAGE, number=2, message=gcb_model_reference.ModelReference, + ) + creation_time = proto.Field(proto.INT64, number=5,) + last_modified_time = proto.Field(proto.INT64, number=6,) + description = proto.Field(proto.STRING, number=12,) + friendly_name = proto.Field(proto.STRING, number=14,) + labels = proto.MapField(proto.STRING, proto.STRING, number=15,) + expiration_time = proto.Field(proto.INT64, number=16,) + location = proto.Field(proto.STRING, number=13,) + encryption_configuration = proto.Field( + proto.MESSAGE, number=17, message=encryption_config.EncryptionConfiguration, + ) + model_type = proto.Field(proto.ENUM, number=7, enum=ModelType,) + training_runs = proto.RepeatedField(proto.MESSAGE, number=9, message=TrainingRun,) + feature_columns = proto.RepeatedField( + proto.MESSAGE, number=10, message=standard_sql.StandardSqlField, + ) + label_columns = proto.RepeatedField( + proto.MESSAGE, number=11, message=standard_sql.StandardSqlField, + ) + best_trial_id = 
proto.Field(proto.INT64, number=19,) + + +class GetModelRequest(proto.Message): + r""" + Attributes: + project_id (str): + Required. Project ID of the requested model. + dataset_id (str): + Required. Dataset ID of the requested model. + model_id (str): + Required. Model ID of the requested model. + """ + + project_id = proto.Field(proto.STRING, number=1,) + dataset_id = proto.Field(proto.STRING, number=2,) + model_id = proto.Field(proto.STRING, number=3,) + + +class PatchModelRequest(proto.Message): + r""" + Attributes: + project_id (str): + Required. Project ID of the model to patch. + dataset_id (str): + Required. Dataset ID of the model to patch. + model_id (str): + Required. Model ID of the model to patch. + model (google.cloud.bigquery_v2.types.Model): + Required. Patched model. + Follows RFC5789 patch semantics. Missing fields + are not updated. To clear a field, explicitly + set to default value. + """ + + project_id = proto.Field(proto.STRING, number=1,) + dataset_id = proto.Field(proto.STRING, number=2,) + model_id = proto.Field(proto.STRING, number=3,) + model = proto.Field(proto.MESSAGE, number=4, message="Model",) + + +class DeleteModelRequest(proto.Message): + r""" + Attributes: + project_id (str): + Required. Project ID of the model to delete. + dataset_id (str): + Required. Dataset ID of the model to delete. + model_id (str): + Required. Model ID of the model to delete. + """ + + project_id = proto.Field(proto.STRING, number=1,) + dataset_id = proto.Field(proto.STRING, number=2,) + model_id = proto.Field(proto.STRING, number=3,) + + +class ListModelsRequest(proto.Message): + r""" + Attributes: + project_id (str): + Required. Project ID of the models to list. + dataset_id (str): + Required. Dataset ID of the models to list. + max_results (google.protobuf.wrappers_pb2.UInt32Value): + The maximum number of results to return in a + single response page. Leverage the page tokens + to iterate through the entire collection. 
+ page_token (str): + Page token, returned by a previous call to + request the next page of results + """ + + project_id = proto.Field(proto.STRING, number=1,) + dataset_id = proto.Field(proto.STRING, number=2,) + max_results = proto.Field( + proto.MESSAGE, number=3, message=wrappers_pb2.UInt32Value, + ) + page_token = proto.Field(proto.STRING, number=4,) + + +class ListModelsResponse(proto.Message): + r""" + Attributes: + models (Sequence[google.cloud.bigquery_v2.types.Model]): + Models in the requested dataset. Only the following fields + are populated: model_reference, model_type, creation_time, + last_modified_time and labels. + next_page_token (str): + A token to request the next page of results. + """ + + @property + def raw_page(self): + return self + + models = proto.RepeatedField(proto.MESSAGE, number=1, message="Model",) + next_page_token = proto.Field(proto.STRING, number=2,) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/google/cloud/bigquery_v2/types/model_reference.py b/google/cloud/bigquery_v2/types/model_reference.py new file mode 100644 index 000000000..a9ebad613 --- /dev/null +++ b/google/cloud/bigquery_v2/types/model_reference.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import proto # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.bigquery.v2", manifest={"ModelReference",}, +) + + +class ModelReference(proto.Message): + r"""Id path of a model. + Attributes: + project_id (str): + Required. The ID of the project containing + this model. + dataset_id (str): + Required. The ID of the dataset containing + this model. + model_id (str): + Required. The ID of the model. The ID must contain only + letters (a-z, A-Z), numbers (0-9), or underscores (_). The + maximum length is 1,024 characters. + """ + + project_id = proto.Field(proto.STRING, number=1,) + dataset_id = proto.Field(proto.STRING, number=2,) + model_id = proto.Field(proto.STRING, number=3,) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/google/cloud/bigquery_v2/types/standard_sql.py b/google/cloud/bigquery_v2/types/standard_sql.py new file mode 100644 index 000000000..7a845fc48 --- /dev/null +++ b/google/cloud/bigquery_v2/types/standard_sql.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import proto # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.bigquery.v2", + manifest={ + "StandardSqlDataType", + "StandardSqlField", + "StandardSqlStructType", + "StandardSqlTableType", + }, +) + + +class StandardSqlDataType(proto.Message): + r"""The type of a variable, e.g., a function argument. 
Examples: INT64: + {type_kind="INT64"} ARRAY: {type_kind="ARRAY", + array_element_type="STRING"} STRUCT: + {type_kind="STRUCT", struct_type={fields=[ {name="x", + type={type_kind="STRING"}}, {name="y", type={type_kind="ARRAY", + array_element_type="DATE"}} ]}} + + Attributes: + type_kind (google.cloud.bigquery_v2.types.StandardSqlDataType.TypeKind): + Required. The top level type of this field. + Can be any standard SQL data type (e.g., + "INT64", "DATE", "ARRAY"). + array_element_type (google.cloud.bigquery_v2.types.StandardSqlDataType): + The type of the array's elements, if type_kind = "ARRAY". + struct_type (google.cloud.bigquery_v2.types.StandardSqlStructType): + The fields of this struct, in order, if type_kind = + "STRUCT". + """ + + class TypeKind(proto.Enum): + r"""""" + TYPE_KIND_UNSPECIFIED = 0 + INT64 = 2 + BOOL = 5 + FLOAT64 = 7 + STRING = 8 + BYTES = 9 + TIMESTAMP = 19 + DATE = 10 + TIME = 20 + DATETIME = 21 + INTERVAL = 26 + GEOGRAPHY = 22 + NUMERIC = 23 + BIGNUMERIC = 24 + JSON = 25 + ARRAY = 16 + STRUCT = 17 + + type_kind = proto.Field(proto.ENUM, number=1, enum=TypeKind,) + array_element_type = proto.Field( + proto.MESSAGE, number=2, oneof="sub_type", message="StandardSqlDataType", + ) + struct_type = proto.Field( + proto.MESSAGE, number=3, oneof="sub_type", message="StandardSqlStructType", + ) + + +class StandardSqlField(proto.Message): + r"""A field or a column. + Attributes: + name (str): + Optional. The name of this field. Can be + absent for struct fields. + type (google.cloud.bigquery_v2.types.StandardSqlDataType): + Optional. The type of this parameter. Absent + if not explicitly specified (e.g., CREATE + FUNCTION statement can omit the return type; in + this case the output parameter does not have + this "type" field). 
+ """ + + name = proto.Field(proto.STRING, number=1,) + type = proto.Field(proto.MESSAGE, number=2, message="StandardSqlDataType",) + + +class StandardSqlStructType(proto.Message): + r""" + Attributes: + fields (Sequence[google.cloud.bigquery_v2.types.StandardSqlField]): + + """ + + fields = proto.RepeatedField(proto.MESSAGE, number=1, message="StandardSqlField",) + + +class StandardSqlTableType(proto.Message): + r"""A table type + Attributes: + columns (Sequence[google.cloud.bigquery_v2.types.StandardSqlField]): + The columns in this table type + """ + + columns = proto.RepeatedField(proto.MESSAGE, number=1, message="StandardSqlField",) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/google/cloud/bigquery_v2/types/table_reference.py b/google/cloud/bigquery_v2/types/table_reference.py new file mode 100644 index 000000000..d56e5b09f --- /dev/null +++ b/google/cloud/bigquery_v2/types/table_reference.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import proto # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.bigquery.v2", manifest={"TableReference",}, +) + + +class TableReference(proto.Message): + r""" + Attributes: + project_id (str): + Required. The ID of the project containing + this table. + dataset_id (str): + Required. The ID of the dataset containing + this table. + table_id (str): + Required. The ID of the table. 
The ID must contain only + letters (a-z, A-Z), numbers (0-9), or underscores (_). The + maximum length is 1,024 characters. Certain operations allow + suffixing of the table ID with a partition decorator, such + as ``sample_table$20190123``. + project_id_alternative (Sequence[str]): + The alternative field that will be used when ESF is not able + to translate the received data to the project_id field. + dataset_id_alternative (Sequence[str]): + The alternative field that will be used when ESF is not able + to translate the received data to the project_id field. + table_id_alternative (Sequence[str]): + The alternative field that will be used when ESF is not able + to translate the received data to the project_id field. + """ + + project_id = proto.Field(proto.STRING, number=1,) + dataset_id = proto.Field(proto.STRING, number=2,) + table_id = proto.Field(proto.STRING, number=3,) + project_id_alternative = proto.RepeatedField(proto.STRING, number=4,) + dataset_id_alternative = proto.RepeatedField(proto.STRING, number=5,) + table_id_alternative = proto.RepeatedField(proto.STRING, number=6,) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..4505b4854 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.6 +namespace_packages = True diff --git a/noxfile.py b/noxfile.py index 90f023add..9077924e9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -21,12 +21,31 @@ import nox +PYTYPE_VERSION = "pytype==2021.4.9" BLACK_VERSION = "black==19.10b0" BLACK_PATHS = ("docs", "google", "samples", "tests", "noxfile.py", "setup.py") -CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() +DEFAULT_PYTHON_VERSION = "3.8" +SYSTEM_TEST_PYTHON_VERSIONS = ["3.8"] +UNIT_TEST_PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] +CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() -def default(session): +# 'docfx' is excluded since it only needs to run in 'docs-presubmit' +nox.options.sessions = [ + 
"unit_noextras", + "unit", + "system", + "snippets", + "cover", + "lint", + "lint_setup_py", + "blacken", + "pytype", + "docs", +] + + +def default(session, install_extras=True): """Default unit test session. This is intended to be run **without** an interpreter set, so @@ -49,16 +68,8 @@ def default(session): constraints_path, ) - if session.python == "2.7": - # The [all] extra is not installable on Python 2.7. - session.install("-e", ".[pandas,pyarrow]", "-c", constraints_path) - elif session.python == "3.5": - session.install("-e", ".[all]", "-c", constraints_path) - else: - # fastparquet is not included in .[all] because, in general, it's - # redundant with pyarrow. We still want to run some unit tests with - # fastparquet serialization, though. - session.install("-e", ".[all,fastparquet]", "-c", constraints_path) + install_target = ".[all]" if install_extras else "." + session.install("-e", install_target, "-c", constraints_path) session.install("ipython", "-c", constraints_path) @@ -66,8 +77,8 @@ def default(session): session.run( "py.test", "--quiet", - "--cov=google.cloud.bigquery", - "--cov=tests.unit", + "--cov=google/cloud/bigquery", + "--cov=tests/unit", "--cov-append", "--cov-config=.coveragerc", "--cov-report=", @@ -77,13 +88,32 @@ def default(session): ) -@nox.session(python=["2.7", "3.5", "3.6", "3.7", "3.8"]) +@nox.session(python=UNIT_TEST_PYTHON_VERSIONS) def unit(session): """Run the unit test suite.""" default(session) -@nox.session(python=["2.7", "3.8"]) +@nox.session(python=UNIT_TEST_PYTHON_VERSIONS[-1]) +def unit_noextras(session): + """Run the unit test suite.""" + default(session, install_extras=False) + + +@nox.session(python=DEFAULT_PYTHON_VERSION) +def pytype(session): + """Run type checks.""" + # An indirect dependency attrs==21.1.0 breaks the check, and installing a less + # recent version avoids the error until a possibly better fix is found.
+ # https://github.com/googleapis/python-bigquery/issues/655 + session.install("attrs==20.3.0") + session.install("-e", ".[all]") + session.install("ipython") + session.install(PYTYPE_VERSION) + session.run("pytype") + + +@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def system(session): """Run the system test suite.""" @@ -106,53 +136,55 @@ def system(session): session.install( "mock", "pytest", "psutil", "google-cloud-testutils", "-c", constraints_path ) - session.install("google-cloud-storage", "-c", constraints_path) - - if session.python == "2.7": - # The [all] extra is not installable on Python 2.7. - session.install("-e", ".[pandas]", "-c", constraints_path) + if os.environ.get("GOOGLE_API_USE_CLIENT_CERTIFICATE", "") == "true": + # mTLS test requires pyopenssl and latest google-cloud-storage + session.install("google-cloud-storage", "pyopenssl") else: - session.install("-e", ".[all]", "-c", constraints_path) + session.install("google-cloud-storage", "-c", constraints_path) + # Data Catalog needed for the column ACL test with a real Policy Tag. + session.install("google-cloud-datacatalog", "-c", constraints_path) + + session.install("-e", ".[all]", "-c", constraints_path) session.install("ipython", "-c", constraints_path) # Run py.test against the system tests. - session.run( - "py.test", "--quiet", os.path.join("tests", "system.py"), *session.posargs - ) + session.run("py.test", "--quiet", os.path.join("tests", "system"), *session.posargs) -@nox.session(python=["2.7", "3.8"]) +@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def snippets(session): """Run the snippets test suite.""" + # Check the value of `RUN_SNIPPETS_TESTS` env var. It defaults to true. + if os.environ.get("RUN_SNIPPETS_TESTS", "true") == "false": + session.skip("RUN_SNIPPETS_TESTS is set to false, skipping") + constraints_path = str( CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) - # Sanity check: Only run snippets tests if the environment variable is set. 
- if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""): - session.skip("Credentials must be set via environment variable.") - # Install all test dependencies, then install local packages in place. session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path) session.install("google-cloud-storage", "-c", constraints_path) session.install("grpcio", "-c", constraints_path) - if session.python == "2.7": - # The [all] extra is not installable on Python 2.7. - session.install("-e", ".[pandas]", "-c", constraints_path) - else: - session.install("-e", ".[all]", "-c", constraints_path) + session.install("-e", ".[all]", "-c", constraints_path) # Run py.test against the snippets tests. # Skip tests in samples/snippets, as those are run in a different session # using the nox config from that directory. session.run("py.test", os.path.join("docs", "snippets.py"), *session.posargs) - session.run("py.test", "samples", "--ignore=samples/snippets", *session.posargs) + session.run( + "py.test", + "samples", + "--ignore=samples/snippets", + "--ignore=samples/geography", + *session.posargs, + ) -@nox.session(python="3.8") +@nox.session(python=DEFAULT_PYTHON_VERSION) def cover(session): """Run the final coverage report. @@ -164,7 +196,43 @@ def cover(session): session.run("coverage", "erase") -@nox.session(python="3.8") +@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) +def prerelease_deps(session): + """Run all tests with prerelease versions of dependencies installed. + + https://github.com/googleapis/python-bigquery/issues/95 + """ + # PyArrow prerelease packages are published to an alternative PyPI host. 
+ # https://arrow.apache.org/docs/python/install.html#installing-nightly-packages + session.install( + "--extra-index-url", "https://pypi.fury.io/arrow-nightlies/", "--pre", "pyarrow" + ) + session.install("--pre", "grpcio", "pandas") + session.install( + "freezegun", + "google-cloud-datacatalog", + "google-cloud-storage", + "google-cloud-testutils", + "IPython", + "mock", + "psutil", + "pytest", + "pytest-cov", + ) + session.install("-e", ".[all]") + + # Print out prerelease package versions. + session.run("python", "-c", "import grpc; print(grpc.__version__)") + session.run("python", "-c", "import pandas; print(pandas.__version__)") + session.run("python", "-c", "import pyarrow; print(pyarrow.__version__)") + + # Run all tests, except a few samples tests which require extra dependencies. + session.run("py.test", "tests/unit") + session.run("py.test", "tests/system") + session.run("py.test", "samples/tests") + + +@nox.session(python=DEFAULT_PYTHON_VERSION) def lint(session): """Run linters. @@ -181,7 +249,7 @@ def lint(session): session.run("black", "--check", *BLACK_PATHS) -@nox.session(python="3.8") +@nox.session(python=DEFAULT_PYTHON_VERSION) def lint_setup_py(session): """Verify that setup.py is valid (including RST check).""" @@ -189,24 +257,21 @@ def lint_setup_py(session): session.run("python", "setup.py", "check", "--restructuredtext", "--strict") -@nox.session(python="3.6") +@nox.session(python=DEFAULT_PYTHON_VERSION) def blacken(session): """Run black. Format code to uniform standard. - - This currently uses Python 3.6 due to the automated Kokoro run of synthtool. - That run uses an image that doesn't have 3.6 installed. Before updating this - check the state of the `gcp_ubuntu_config` we use for that Kokoro run. 
""" + session.install(BLACK_VERSION) session.run("black", *BLACK_PATHS) -@nox.session(python="3.8") +@nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs.""" - session.install("ipython", "recommonmark", "sphinx", "sphinx_rtd_theme") + session.install("ipython", "recommonmark", "sphinx==4.0.1", "sphinx_rtd_theme") session.install("google-cloud-storage") session.install("-e", ".[all]") @@ -225,12 +290,14 @@ def docs(session): ) -@nox.session(python="3.8") +@nox.session(python=DEFAULT_PYTHON_VERSION) def docfx(session): """Build the docfx yaml files for this library.""" session.install("-e", ".") - session.install("sphinx", "alabaster", "recommonmark", "sphinx-docfx-yaml") + session.install( + "sphinx==4.0.1", "alabaster", "recommonmark", "gcp-sphinx-docfx-yaml" + ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) session.run( diff --git a/owlbot.py b/owlbot.py new file mode 100644 index 000000000..8664b658a --- /dev/null +++ b/owlbot.py @@ -0,0 +1,172 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This script is used to synthesize generated parts of this library.""" +import textwrap + +import synthtool as s +from synthtool import gcp +from synthtool.languages import python + +common = gcp.CommonTemplates() + +default_version = "v2" + +for library in s.get_staging_dirs(default_version): + # Do not expose ModelServiceClient and ModelServiceAsyncClient, as there + # is no public API endpoint for the models service. + s.replace( + library / f"google/cloud/bigquery_{library.name}/__init__.py", + r"from \.services\.model_service import ModelServiceClient", + "", + ) + + s.replace( + library / f"google/cloud/bigquery_{library.name}/__init__.py", + r"from \.services\.model_service import ModelServiceAsyncClient", + "", + ) + + s.replace( + library / f"google/cloud/bigquery_{library.name}/__init__.py", + r"""["']ModelServiceClient["'],""", + "", + ) + + s.replace( + library / f"google/cloud/bigquery_{library.name}/__init__.py", + r"""["']ModelServiceAsyncClient["'],""", + "", + ) + + # Adjust Model docstring so that Sphinx does not think that "predicted_" is + # a reference to something, issuing a false warning. + s.replace( + library / f"google/cloud/bigquery_{library.name}/types/model.py", + r'will have a "predicted_"', + "will have a `predicted_`", + ) + + # Avoid breaking change due to change in field renames. 
+ # https://github.com/googleapis/python-bigquery/issues/319 + s.replace( + library / f"google/cloud/bigquery_{library.name}/types/standard_sql.py", + r"type_ ", + "type ", + ) + + s.move( + library, + excludes=[ + "*.tar.gz", + ".coveragerc", + "docs/index.rst", + f"docs/bigquery_{library.name}/*_service.rst", + f"docs/bigquery_{library.name}/services.rst", + "README.rst", + "noxfile.py", + "setup.py", + f"scripts/fixup_bigquery_{library.name}_keywords.py", + "google/cloud/bigquery/__init__.py", + "google/cloud/bigquery/py.typed", + # There are no public API endpoints for the generated ModelServiceClient, + # thus there's no point in generating it and its tests. + f"google/cloud/bigquery_{library.name}/services/**", + f"tests/unit/gapic/bigquery_{library.name}/**", + ], + ) + +s.remove_staging_dirs() + +# ---------------------------------------------------------------------------- +# Add templated files +# ---------------------------------------------------------------------------- +templated_files = common.py_library( + cov_level=100, + samples=True, + microgenerator=True, + split_system_tests=True, + intersphinx_dependencies={ + "pandas": "http://pandas.pydata.org/pandas-docs/dev", + "geopandas": "https://geopandas.org/", + }, +) + +# BigQuery has a custom multiprocessing note +s.move( + templated_files, + excludes=[ + "noxfile.py", + "docs/multiprocessing.rst", + ".coveragerc", + # Include custom SNIPPETS_TESTS job for performance. + # https://github.com/googleapis/python-bigquery/issues/191 + ".kokoro/presubmit/presubmit.cfg", + # Group all renovate PRs together. If this works well, remove this and + # update the shared templates (possibly with configuration option to + # py_library.) 
+ "renovate.json", + ], +) + +# ---------------------------------------------------------------------------- +# Samples templates +# ---------------------------------------------------------------------------- + +python.py_samples() + +s.replace( + "docs/conf.py", + r'\{"members": True\}', + '{"members": True, "inherited-members": True}', +) + +# Tell Sphinx to ignore autogenerated docs files. +s.replace( + "docs/conf.py", + r'"samples/snippets/README\.rst",', + '\\g<0>\n "bigquery_v2/services.rst", # generated by the code generator', +) + +# ---------------------------------------------------------------------------- +# pytype-related changes +# ---------------------------------------------------------------------------- + +# Add .pytype to .gitignore +s.replace(".gitignore", r"\.pytest_cache", "\\g<0>\n.pytype") + +# Add pytype config to setup.cfg +s.replace( + "setup.cfg", + r"universal = 1", + textwrap.dedent( + """ \\g<0> + + [pytype] + python_version = 3.8 + inputs = + google/cloud/ + exclude = + tests/ + google/cloud/bigquery_v2/ + output = .pytype/ + disable = + # There's some issue with finding some pyi files, thus disabling. + # The issue https://github.com/google/pytype/issues/150 is closed, but the + # error still occurs for some reason.
+ pyi-error""" + ), +) + +s.shell.run(["nox", "-s", "blacken"], hide_output=False) diff --git a/renovate.json b/renovate.json index 4fa949311..713c60bb4 100644 --- a/renovate.json +++ b/renovate.json @@ -1,5 +1,9 @@ { "extends": [ - "config:base", ":preserveSemverRanges" - ] + "config:base", "group:all", ":preserveSemverRanges" + ], + "ignorePaths": [".pre-commit-config.yaml"], + "pip_requirements": { + "fileMatch": ["requirements-test.txt", "samples/[\\S/]*constraints.txt", "samples/[\\S/]*constraints-test.txt"] + } } diff --git a/samples/client_query_destination_table_clustered.py b/samples/client_query_destination_table_clustered.py new file mode 100644 index 000000000..5a109ed10 --- /dev/null +++ b/samples/client_query_destination_table_clustered.py @@ -0,0 +1,43 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def client_query_destination_table_clustered(table_id): + + # [START bigquery_query_clustered_table] + from google.cloud import bigquery + + # Construct a BigQuery client object. + client = bigquery.Client() + + # TODO(developer): Set table_id to the ID of the destination table. + # table_id = "your-project.your_dataset.your_table_name" + + sql = "SELECT * FROM `bigquery-public-data.samples.shakespeare`" + cluster_fields = ["corpus"] + + job_config = bigquery.QueryJobConfig( + clustering_fields=cluster_fields, destination=table_id + ) + + # Start the query, passing in the extra configuration. 
+ query_job = client.query(sql, job_config=job_config) # Make an API request. + query_job.result() # Wait for the job to complete. + + table = client.get_table(table_id) # Make an API request. + if table.clustering_fields == cluster_fields: + print( + "The destination table is written using the cluster_fields configuration." + ) + # [END bigquery_query_clustered_table] diff --git a/samples/client_query_w_timestamp_params.py b/samples/client_query_w_timestamp_params.py index ca8eec0b5..41a27770e 100644 --- a/samples/client_query_w_timestamp_params.py +++ b/samples/client_query_w_timestamp_params.py @@ -18,7 +18,6 @@ def client_query_w_timestamp_params(): # [START bigquery_query_params_timestamps] import datetime - import pytz from google.cloud import bigquery # Construct a BigQuery client object. @@ -30,7 +29,7 @@ def client_query_w_timestamp_params(): bigquery.ScalarQueryParameter( "ts_value", "TIMESTAMP", - datetime.datetime(2016, 12, 7, 8, 0, tzinfo=pytz.UTC), + datetime.datetime(2016, 12, 7, 8, 0, tzinfo=datetime.timezone.utc), ) ] ) diff --git a/samples/create_routine.py b/samples/create_routine.py index d9b221a4f..1cb4a80b4 100644 --- a/samples/create_routine.py +++ b/samples/create_routine.py @@ -22,7 +22,7 @@ def create_routine(routine_id): # Construct a BigQuery client object. client = bigquery.Client() - # TODO(developer): Choose a fully-qualified ID for the routine. + # TODO(developer): Choose a fully qualified ID for the routine. 
# routine_id = "my-project.my_dataset.my_routine" routine = bigquery.Routine( @@ -34,7 +34,7 @@ def create_routine(routine_id): bigquery.RoutineArgument( name="x", data_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ), ) ], diff --git a/samples/create_table_clustered.py b/samples/create_table_clustered.py new file mode 100644 index 000000000..2b45b747e --- /dev/null +++ b/samples/create_table_clustered.py @@ -0,0 +1,42 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def create_table_clustered(table_id): + + # [START bigquery_create_table_clustered] + from google.cloud import bigquery + + # Construct a BigQuery client object. + client = bigquery.Client() + + # TODO(developer): Set table_id to the ID of the table to create. + # table_id = "your-project.your_dataset.your_table_name" + + schema = [ + bigquery.SchemaField("full_name", "STRING"), + bigquery.SchemaField("city", "STRING"), + bigquery.SchemaField("zipcode", "INTEGER"), + ] + + table = bigquery.Table(table_id, schema=schema) + table.clustering_fields = ["city", "zipcode"] + table = client.create_table(table) # Make an API request. 
+ print( + "Created clustered table {}.{}.{}".format( + table.project, table.dataset_id, table.table_id + ) + ) + # [END bigquery_create_table_clustered] + return table diff --git a/samples/geography/__init__.py b/samples/geography/__init__.py new file mode 100644 index 000000000..c6334245a --- /dev/null +++ b/samples/geography/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/samples/geography/conftest.py b/samples/geography/conftest.py new file mode 100644 index 000000000..265900f5a --- /dev/null +++ b/samples/geography/conftest.py @@ -0,0 +1,55 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +import uuid + +from google.cloud import bigquery +import pytest + + +def temp_suffix(): + now = datetime.datetime.now() + return f"{now.strftime('%Y%m%d%H%M%S')}_{uuid.uuid4().hex[:8]}" + + +@pytest.fixture(scope="session") +def bigquery_client(): + bigquery_client = bigquery.Client() + return bigquery_client + + +@pytest.fixture(scope="session") +def project_id(bigquery_client): + return bigquery_client.project + + +@pytest.fixture +def dataset_id(bigquery_client): + dataset_id = f"geography_{temp_suffix()}" + bigquery_client.create_dataset(dataset_id) + yield dataset_id + bigquery_client.delete_dataset(dataset_id, delete_contents=True) + + +@pytest.fixture +def table_id(bigquery_client, project_id, dataset_id): + table_id = f"{project_id}.{dataset_id}.geography_{temp_suffix()}" + table = bigquery.Table(table_id) + table.schema = [ + bigquery.SchemaField("geo", bigquery.SqlTypeNames.GEOGRAPHY), + ] + bigquery_client.create_table(table) + yield table_id + bigquery_client.delete_table(table_id) diff --git a/samples/geography/insert_geojson.py b/samples/geography/insert_geojson.py new file mode 100644 index 000000000..23f249c15 --- /dev/null +++ b/samples/geography/insert_geojson.py @@ -0,0 +1,49 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def insert_geojson(override_values={}): + # [START bigquery_insert_geojson] + import geojson + from google.cloud import bigquery + + bigquery_client = bigquery.Client() + + # This example uses a table containing a column named "geo" with the + # GEOGRAPHY data type. + table_id = "my-project.my_dataset.my_table" + # [END bigquery_insert_geojson] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + table_id = override_values.get("table_id", table_id) + # [START bigquery_insert_geojson] + + # Use the python-geojson library to generate GeoJSON of a line from LAX to + # JFK airports. Alternatively, you may define GeoJSON data directly, but it + # must be converted to a string before loading it into BigQuery. + my_geography = geojson.LineString([(-118.4085, 33.9416), (-73.7781, 40.6413)]) + rows = [ + # Convert GeoJSON data into a string. + {"geo": geojson.dumps(my_geography)} + ] + + # table already exists and has a column + # named "geo" with data type GEOGRAPHY. + errors = bigquery_client.insert_rows_json(table_id, rows) + if errors: + raise RuntimeError(f"row insert failed: {errors}") + else: + print(f"wrote 1 row to {table_id}") + # [END bigquery_insert_geojson] + return errors diff --git a/samples/geography/insert_geojson_test.py b/samples/geography/insert_geojson_test.py new file mode 100644 index 000000000..5ef15ee13 --- /dev/null +++ b/samples/geography/insert_geojson_test.py @@ -0,0 +1,20 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from . import insert_geojson + + +def test_insert_geojson(table_id): + errors = insert_geojson.insert_geojson(override_values={"table_id": table_id}) + assert not errors diff --git a/samples/geography/insert_wkt.py b/samples/geography/insert_wkt.py new file mode 100644 index 000000000..d7d3accde --- /dev/null +++ b/samples/geography/insert_wkt.py @@ -0,0 +1,51 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def insert_wkt(override_values={}): + # [START bigquery_insert_geography_wkt] + from google.cloud import bigquery + import shapely.geometry + import shapely.wkt + + bigquery_client = bigquery.Client() + + # This example uses a table containing a column named "geo" with the + # GEOGRAPHY data type. + table_id = "my-project.my_dataset.my_table" + # [END bigquery_insert_geography_wkt] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + table_id = override_values.get("table_id", table_id) + # [START bigquery_insert_geography_wkt] + + # Use the Shapely library to generate WKT of a line from LAX to + # JFK airports. Alternatively, you may define WKT data directly. + my_geography = shapely.geometry.LineString( + [(-118.4085, 33.9416), (-73.7781, 40.6413)] + ) + rows = [ + # Convert data into a WKT string. 
+ {"geo": shapely.wkt.dumps(my_geography)}, + ] + + # table already exists and has a column + # named "geo" with data type GEOGRAPHY. + errors = bigquery_client.insert_rows_json(table_id, rows) + if errors: + raise RuntimeError(f"row insert failed: {errors}") + else: + print(f"wrote 1 row to {table_id}") + # [END bigquery_insert_geography_wkt] + return errors diff --git a/samples/geography/insert_wkt_test.py b/samples/geography/insert_wkt_test.py new file mode 100644 index 000000000..8bcb62cec --- /dev/null +++ b/samples/geography/insert_wkt_test.py @@ -0,0 +1,20 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import insert_wkt + + +def test_insert_wkt(table_id): + errors = insert_wkt.insert_wkt(override_values={"table_id": table_id}) + assert not errors diff --git a/samples/geography/noxfile.py b/samples/geography/noxfile.py new file mode 100644 index 000000000..b008613f0 --- /dev/null +++ b/samples/geography/noxfile.py @@ -0,0 +1,266 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys +from typing import Callable, Dict, List, Optional + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +BLACK_VERSION = "black==19.10b0" + +# Copy `noxfile_config.py` to your directory and modify it instead. + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + "ignored_versions": [], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": False, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. 
+ sys.path.append(".") + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars() -> Dict[str, str]: + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG["gcloud_project_env"] + # This should error out if not set. + ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG["envs"]) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to test samples. +ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in ( + "True", + "true", +) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir: str) -> List[str]: + """Determines all import names that should be considered "local". + + This is used when running the linter to ensure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8.
+# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session: nox.sessions.Session) -> None: + if not TEST_CONFIG["enforce_type_hints"]: + session.install("flake8", "flake8-import-order") + else: + session.install("flake8", "flake8-import-order", "flake8-annotations") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + ".", + ] + session.run("flake8", *args) + + +# +# Black +# + + +@nox.session +def blacken(session: nox.sessions.Session) -> None: + session.install(BLACK_VERSION) + python_files = [path for path in os.listdir(".") if path.endswith(".py")] + + session.run("black", *python_files) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests( + session: nox.sessions.Session, post_install: Callable = None +) -> None: + if TEST_CONFIG["pip_version_override"]: + pip_version = TEST_CONFIG["pip_version_override"] + session.install(f"pip=={pip_version}") + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + if os.path.exists("constraints.txt"): + session.install("-r", "requirements.txt", "-c", "constraints.txt") + else: + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + if os.path.exists("constraints-test.txt"): + session.install("-r", 
"requirements-test.txt", "-c", "constraints-test.txt") + else: + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars(), + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session: nox.sessions.Session) -> None: + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip( + "SKIPPED: {} tests are disabled for this sample.".format(session.python) + ) + + +# +# Readmegen +# + + +def _get_repo_root() -> Optional[str]: + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + # .git is not available in repos cloned via Cloud Build + # setup.py is always in the library's root, so use that instead + # https://github.com/googleapis/synthtool/issues/792 + if Path(p / "setup.py").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session: nox.sessions.Session, path: str) -> None: + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/samples/geography/noxfile_config.py b/samples/geography/noxfile_config.py new file mode 100644 index 000000000..7d2e02346 --- /dev/null +++ b/samples/geography/noxfile_config.py @@ -0,0 +1,35 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be imported from +# the noxfile.py.
+ +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7"], + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # "gcloud_project_env": "BUILD_SPECIFIC_GCLOUD_PROJECT", + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} diff --git a/samples/geography/requirements-test.txt b/samples/geography/requirements-test.txt new file mode 100644 index 000000000..b0cf76724 --- /dev/null +++ b/samples/geography/requirements-test.txt @@ -0,0 +1,2 @@ +pytest==6.2.4 +mock==4.0.3 diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt new file mode 100644 index 000000000..b5fe247cb --- /dev/null +++ b/samples/geography/requirements.txt @@ -0,0 +1,50 @@ +attrs==21.2.0 +cachetools==4.2.2 +certifi==2021.5.30 +cffi==1.14.6 +charset-normalizer==2.0.4 +click==8.0.1 +click-plugins==1.1.1 +cligj==0.7.2 +dataclasses==0.6; python_version < '3.7' +Fiona==1.8.20 +geojson==2.5.0 +geopandas==0.9.0 +google-api-core==1.31.2 +google-auth==1.35.0 +google-cloud-bigquery==2.25.0 +google-cloud-bigquery-storage==2.6.3 +google-cloud-core==2.0.0 +google-crc32c==1.1.2 +google-resumable-media==1.3.3 +googleapis-common-protos==1.53.0 +grpcio==1.39.0 +idna==3.2 +importlib-metadata==4.6.4 +libcst==0.3.20 +munch==2.5.0 +mypy-extensions==0.4.3 +numpy==1.19.5; python_version < "3.7" +numpy==1.21.2; python_version > "3.6" +packaging==21.0 +pandas==1.1.5; python_version < '3.7' +pandas==1.3.2; python_version >= '3.7' +proto-plus==1.19.0 +protobuf==3.17.3 +pyarrow==5.0.0 +pyasn1==0.4.8 
+pyasn1-modules==0.2.8 +pycparser==2.20 +pyparsing==2.4.7 +pyproj==3.0.1 +python-dateutil==2.8.2 +pytz==2021.1 +PyYAML==5.4.1 +requests==2.26.0 +rsa==4.7.2 +Shapely==1.7.1 +six==1.16.0 +typing-extensions==3.10.0.0 +typing-inspect==0.7.1 +urllib3==1.26.6 +zipp==3.5.0 diff --git a/samples/geography/to_geodataframe.py b/samples/geography/to_geodataframe.py new file mode 100644 index 000000000..fa8073fef --- /dev/null +++ b/samples/geography/to_geodataframe.py @@ -0,0 +1,32 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.cloud import bigquery + +client = bigquery.Client() + + +def get_austin_service_requests_as_geography(): + # [START bigquery_query_results_geodataframe] + + sql = """ + SELECT created_date, complaint_description, + ST_GEOGPOINT(longitude, latitude) as location + FROM bigquery-public-data.austin_311.311_service_requests + LIMIT 10 + """ + + df = client.query(sql).to_geodataframe() + # [END bigquery_query_results_geodataframe] + return df diff --git a/samples/geography/to_geodataframe_test.py b/samples/geography/to_geodataframe_test.py new file mode 100644 index 000000000..7a2ba6937 --- /dev/null +++ b/samples/geography/to_geodataframe_test.py @@ -0,0 +1,25 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .to_geodataframe import get_austin_service_requests_as_geography + + +def test_get_austin_service_requests_as_geography(): + geopandas = pytest.importorskip("geopandas") + df = get_austin_service_requests_as_geography() + assert isinstance(df, geopandas.GeoDataFrame) + assert len(list(df)) == 3 # verify the number of columns + assert len(df) == 10 # verify the number of rows diff --git a/samples/load_table_clustered.py b/samples/load_table_clustered.py new file mode 100644 index 000000000..20d412cb3 --- /dev/null +++ b/samples/load_table_clustered.py @@ -0,0 +1,55 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def load_table_clustered(table_id): + + # [START bigquery_load_table_clustered] + from google.cloud import bigquery + + # Construct a BigQuery client object. + client = bigquery.Client() + + # TODO(developer): Set table_id to the ID of the table to create. 
+ # table_id = "your-project.your_dataset.your_table_name" + + job_config = bigquery.LoadJobConfig( + skip_leading_rows=1, + source_format=bigquery.SourceFormat.CSV, + schema=[ + bigquery.SchemaField("timestamp", bigquery.SqlTypeNames.TIMESTAMP), + bigquery.SchemaField("origin", bigquery.SqlTypeNames.STRING), + bigquery.SchemaField("destination", bigquery.SqlTypeNames.STRING), + bigquery.SchemaField("amount", bigquery.SqlTypeNames.NUMERIC), + ], + time_partitioning=bigquery.TimePartitioning(field="timestamp"), + clustering_fields=["origin", "destination"], + ) + + job = client.load_table_from_uri( + ["gs://cloud-samples-data/bigquery/sample-transactions/transactions.csv"], + table_id, + job_config=job_config, + ) + + job.result() # Waits for the job to complete. + + table = client.get_table(table_id) # Make an API request. + print( + "Loaded {} rows and {} columns to {}".format( + table.num_rows, len(table.schema), table_id + ) + ) + # [END bigquery_load_table_clustered] + return table diff --git a/samples/load_table_uri_truncate_avro.py b/samples/load_table_uri_truncate_avro.py index 98a791477..1aa0aa49c 100644 --- a/samples/load_table_uri_truncate_avro.py +++ b/samples/load_table_uri_truncate_avro.py @@ -16,7 +16,7 @@ def load_table_uri_truncate_avro(table_id): # [START bigquery_load_table_gcs_avro_truncate] - import six + import io from google.cloud import bigquery @@ -33,7 +33,7 @@ def load_table_uri_truncate_avro(table_id): ], ) - body = six.BytesIO(b"Washington,WA") + body = io.BytesIO(b"Washington,WA") client.load_table_from_file(body, table_id, job_config=job_config).result() previous_rows = client.get_table(table_id).num_rows assert previous_rows > 0 diff --git a/samples/load_table_uri_truncate_csv.py b/samples/load_table_uri_truncate_csv.py index 73de7a8c1..198cdc281 100644 --- a/samples/load_table_uri_truncate_csv.py +++ b/samples/load_table_uri_truncate_csv.py @@ -16,7 +16,7 @@ def load_table_uri_truncate_csv(table_id): # [START 
bigquery_load_table_gcs_csv_truncate] - import six + import io from google.cloud import bigquery @@ -33,7 +33,7 @@ def load_table_uri_truncate_csv(table_id): ], ) - body = six.BytesIO(b"Washington,WA") + body = io.BytesIO(b"Washington,WA") client.load_table_from_file(body, table_id, job_config=job_config).result() previous_rows = client.get_table(table_id).num_rows assert previous_rows > 0 diff --git a/samples/load_table_uri_truncate_json.py b/samples/load_table_uri_truncate_json.py index a30fae736..d67d93e7b 100644 --- a/samples/load_table_uri_truncate_json.py +++ b/samples/load_table_uri_truncate_json.py @@ -16,7 +16,7 @@ def load_table_uri_truncate_json(table_id): # [START bigquery_load_table_gcs_json_truncate] - import six + import io from google.cloud import bigquery @@ -33,7 +33,7 @@ def load_table_uri_truncate_json(table_id): ], ) - body = six.BytesIO(b"Washington,WA") + body = io.BytesIO(b"Washington,WA") client.load_table_from_file(body, table_id, job_config=job_config).result() previous_rows = client.get_table(table_id).num_rows assert previous_rows > 0 diff --git a/samples/load_table_uri_truncate_orc.py b/samples/load_table_uri_truncate_orc.py index 18f963be2..90543b791 100644 --- a/samples/load_table_uri_truncate_orc.py +++ b/samples/load_table_uri_truncate_orc.py @@ -16,7 +16,7 @@ def load_table_uri_truncate_orc(table_id): # [START bigquery_load_table_gcs_orc_truncate] - import six + import io from google.cloud import bigquery @@ -33,7 +33,7 @@ def load_table_uri_truncate_orc(table_id): ], ) - body = six.BytesIO(b"Washington,WA") + body = io.BytesIO(b"Washington,WA") client.load_table_from_file(body, table_id, job_config=job_config).result() previous_rows = client.get_table(table_id).num_rows assert previous_rows > 0 diff --git a/samples/load_table_uri_truncate_parquet.py b/samples/load_table_uri_truncate_parquet.py index 28692d840..e036fc180 100644 --- a/samples/load_table_uri_truncate_parquet.py +++ b/samples/load_table_uri_truncate_parquet.py @@ 
-16,7 +16,7 @@ def load_table_uri_truncate_parquet(table_id): # [START bigquery_load_table_gcs_parquet_truncate] - import six + import io from google.cloud import bigquery @@ -33,7 +33,7 @@ def load_table_uri_truncate_parquet(table_id): ], ) - body = six.BytesIO(b"Washington,WA") + body = io.BytesIO(b"Washington,WA") client.load_table_from_file(body, table_id, job_config=job_config).result() previous_rows = client.get_table(table_id).num_rows assert previous_rows > 0 diff --git a/samples/snippets/authenticate_service_account.py b/samples/snippets/authenticate_service_account.py index 58cd2b542..c07848bee 100644 --- a/samples/snippets/authenticate_service_account.py +++ b/samples/snippets/authenticate_service_account.py @@ -30,6 +30,11 @@ def main(): key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"], ) + # Alternatively, use service_account.Credentials.from_service_account_info() + # to set credentials directly via a json object rather than set a filepath + # TODO(developer): Set key_json to the content of the service account key file. + # credentials = service_account.Credentials.from_service_account_info(key_json) + client = bigquery.Client(credentials=credentials, project=credentials.project_id,) # [END bigquery_client_json_credentials] return client diff --git a/samples/snippets/authorized_view_tutorial.py b/samples/snippets/authorized_view_tutorial.py index 6b5cc378f..b6a20c6ec 100644 --- a/samples/snippets/authorized_view_tutorial.py +++ b/samples/snippets/authorized_view_tutorial.py @@ -27,16 +27,18 @@ def run_authorized_view_tutorial(override_values={}): client = bigquery.Client() source_dataset_id = "github_source_data" + source_dataset_id_full = "{}.{}".format(client.project, source_dataset_id) # [END bigquery_authorized_view_tutorial] # [END bigquery_avt_create_source_dataset] # To facilitate testing, we replace values with alternatives # provided by the testing harness. 
source_dataset_id = override_values.get("source_dataset_id", source_dataset_id) + source_dataset_id_full = "{}.{}".format(client.project, source_dataset_id) # [START bigquery_authorized_view_tutorial] # [START bigquery_avt_create_source_dataset] - source_dataset = bigquery.Dataset(client.dataset(source_dataset_id)) + source_dataset = bigquery.Dataset(source_dataset_id_full) # Specify the geographic location where the dataset should reside. source_dataset.location = "US" source_dataset = client.create_dataset(source_dataset) # API request @@ -66,16 +68,18 @@ def run_authorized_view_tutorial(override_values={}): # Create a separate dataset to store your view # [START bigquery_avt_create_shared_dataset] shared_dataset_id = "shared_views" + shared_dataset_id_full = "{}.{}".format(client.project, shared_dataset_id) # [END bigquery_authorized_view_tutorial] # [END bigquery_avt_create_shared_dataset] # To facilitate testing, we replace values with alternatives # provided by the testing harness. 
shared_dataset_id = override_values.get("shared_dataset_id", shared_dataset_id) + shared_dataset_id_full = "{}.{}".format(client.project, shared_dataset_id) # [START bigquery_authorized_view_tutorial] # [START bigquery_avt_create_shared_dataset] - shared_dataset = bigquery.Dataset(client.dataset(shared_dataset_id)) + shared_dataset = bigquery.Dataset(shared_dataset_id_full) shared_dataset.location = "US" shared_dataset = client.create_dataset(shared_dataset) # API request # [END bigquery_avt_create_shared_dataset] diff --git a/samples/snippets/authorized_view_tutorial_test.py b/samples/snippets/authorized_view_tutorial_test.py index 4c74020bd..eb247c5eb 100644 --- a/samples/snippets/authorized_view_tutorial_test.py +++ b/samples/snippets/authorized_view_tutorial_test.py @@ -30,7 +30,7 @@ def datasets_to_delete(client): doomed = [] yield doomed for item in doomed: - client.delete_dataset(item, delete_contents=True) + client.delete_dataset(item, delete_contents=True, not_found_ok=True) def test_authorized_view_tutorial(client, datasets_to_delete): @@ -42,8 +42,12 @@ def test_authorized_view_tutorial(client, datasets_to_delete): str(uuid.uuid4()).replace("-", "_") ), } - source_dataset_ref = client.dataset(override_values["source_dataset_id"]) - shared_dataset_ref = client.dataset(override_values["shared_dataset_id"]) + source_dataset_ref = "{}.{}".format( + client.project, override_values["source_dataset_id"] + ) + shared_dataset_ref = "{}.{}".format( + client.project, override_values["shared_dataset_id"] + ) datasets_to_delete.extend( [override_values["source_dataset_id"], override_values["shared_dataset_id"]] ) diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py new file mode 100644 index 000000000..74984f902 --- /dev/null +++ b/samples/snippets/conftest.py @@ -0,0 +1,91 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.cloud import bigquery +import pytest +import test_utils.prefixer + + +prefixer = test_utils.prefixer.Prefixer("python-bigquery", "samples/snippets") + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client): + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + +@pytest.fixture(scope="session") +def bigquery_client(): + bigquery_client = bigquery.Client() + return bigquery_client + + +@pytest.fixture(scope="session") +def project_id(bigquery_client): + return bigquery_client.project + + +@pytest.fixture(scope="session") +def dataset_id(bigquery_client: bigquery.Client, project_id: str): + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + +@pytest.fixture(scope="session") +def dataset_id_us_east1(bigquery_client: bigquery.Client, project_id: str): + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + dataset.location = "us-east1" + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + +@pytest.fixture(scope="session") +def table_id_us_east1( + bigquery_client: 
bigquery.Client, project_id: str, dataset_id_us_east1: str +): + table_id = prefixer.create_prefix() + full_table_id = f"{project_id}.{dataset_id_us_east1}.{table_id}" + table = bigquery.Table( + full_table_id, schema=[bigquery.SchemaField("string_col", "STRING")] + ) + bigquery_client.create_table(table) + yield full_table_id + bigquery_client.delete_table(table, not_found_ok=True) + + +@pytest.fixture +def random_table_id(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + """Create a new table ID each time, so random_table_id can be used as + target for load jobs. + """ + random_table_id = prefixer.create_prefix() + full_table_id = f"{project_id}.{dataset_id}.{random_table_id}" + yield full_table_id + bigquery_client.delete_table(full_table_id, not_found_ok=True) + + +@pytest.fixture +def bigquery_client_patch(monkeypatch, bigquery_client): + monkeypatch.setattr(bigquery, "Client", lambda: bigquery_client) diff --git a/samples/snippets/delete_job.py b/samples/snippets/delete_job.py new file mode 100644 index 000000000..abed0c90d --- /dev/null +++ b/samples/snippets/delete_job.py @@ -0,0 +1,44 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def delete_job_metadata(job_id: str, location: str): + orig_job_id = job_id + orig_location = location + # [START bigquery_delete_job] + from google.cloud import bigquery + from google.api_core import exceptions + + # TODO(developer): Set the job ID to the ID of the job whose metadata you + # wish to delete. + job_id = "abcd-efgh-ijkl-mnop" + + # TODO(developer): Set the location to the region or multi-region + # containing the job. + location = "us-east1" + + # [END bigquery_delete_job] + job_id = orig_job_id + location = orig_location + + # [START bigquery_delete_job] + client = bigquery.Client() + + client.delete_job_metadata(job_id, location=location) + + try: + client.get_job(job_id, location=location) + except exceptions.NotFound: + print(f"Job metadata for job {location}:{job_id} was deleted.") + # [END bigquery_delete_job] diff --git a/samples/snippets/delete_job_test.py b/samples/snippets/delete_job_test.py new file mode 100644 index 000000000..c9baa817d --- /dev/null +++ b/samples/snippets/delete_job_test.py @@ -0,0 +1,33 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from google.cloud import bigquery + +import delete_job + + +def test_delete_job_metadata( + capsys, bigquery_client: bigquery.Client, table_id_us_east1: str +): + query_job: bigquery.QueryJob = bigquery_client.query( + f"SELECT COUNT(*) FROM `{table_id_us_east1}`", location="us-east1", + ) + query_job.result() + assert query_job.job_id is not None + + delete_job.delete_job_metadata(query_job.job_id, "us-east1") + + out, _ = capsys.readouterr() + assert "deleted" in out + assert f"us-east1:{query_job.job_id}" in out diff --git a/samples/snippets/load_table_uri_firestore.py b/samples/snippets/load_table_uri_firestore.py new file mode 100644 index 000000000..bf9d01349 --- /dev/null +++ b/samples/snippets/load_table_uri_firestore.py @@ -0,0 +1,55 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def load_table_uri_firestore(table_id): + orig_table_id = table_id + # [START bigquery_load_table_gcs_firestore] + # TODO(developer): Set table_id to the ID of the table to create. + table_id = "your-project.your_dataset.your_table_name" + + # TODO(developer): Set uri to the path of the kind export metadata + uri = ( + "gs://cloud-samples-data/bigquery/us-states" + "/2021-07-02T16:04:48_70344/all_namespaces/kind_us-states" + "/all_namespaces_kind_us-states.export_metadata" + ) + + # TODO(developer): Set projection_fields to a list of document properties + # to import. Leave unset or set to `None` for all fields. 
+ projection_fields = ["name", "post_abbr"] + + # [END bigquery_load_table_gcs_firestore] + table_id = orig_table_id + + # [START bigquery_load_table_gcs_firestore] + from google.cloud import bigquery + + # Construct a BigQuery client object. + client = bigquery.Client() + + job_config = bigquery.LoadJobConfig( + source_format=bigquery.SourceFormat.DATASTORE_BACKUP, + projection_fields=projection_fields, + ) + + load_job = client.load_table_from_uri( + uri, table_id, job_config=job_config + ) # Make an API request. + + load_job.result() # Waits for the job to complete. + + destination_table = client.get_table(table_id) + print("Loaded {} rows.".format(destination_table.num_rows)) + # [END bigquery_load_table_gcs_firestore] diff --git a/samples/snippets/load_table_uri_firestore_test.py b/samples/snippets/load_table_uri_firestore_test.py new file mode 100644 index 000000000..ffa02cdf9 --- /dev/null +++ b/samples/snippets/load_table_uri_firestore_test.py @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import load_table_uri_firestore + + +def test_load_table_uri_firestore(capsys, random_table_id): + load_table_uri_firestore.load_table_uri_firestore(random_table_id) + out, _ = capsys.readouterr() + assert "Loaded 50 rows." 
in out diff --git a/samples/snippets/materialized_view.py b/samples/snippets/materialized_view.py new file mode 100644 index 000000000..429bd98b4 --- /dev/null +++ b/samples/snippets/materialized_view.py @@ -0,0 +1,86 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def create_materialized_view(override_values={}): + # [START bigquery_create_materialized_view] + from google.cloud import bigquery + + bigquery_client = bigquery.Client() + + view_id = "my-project.my_dataset.my_materialized_view" + base_table_id = "my-project.my_dataset.my_base_table" + # [END bigquery_create_materialized_view] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_id = override_values.get("view_id", view_id) + base_table_id = override_values.get("base_table_id", base_table_id) + # [START bigquery_create_materialized_view] + view = bigquery.Table(view_id) + view.mview_query = f""" + SELECT product_id, SUM(clicks) AS sum_clicks + FROM `{base_table_id}` + GROUP BY 1 + """ + + # Make an API request to create the materialized view. 
+ view = bigquery_client.create_table(view) + print(f"Created {view.table_type}: {str(view.reference)}") + # [END bigquery_create_materialized_view] + return view + + +def update_materialized_view(override_values={}): + # [START bigquery_update_materialized_view] + import datetime + from google.cloud import bigquery + + bigquery_client = bigquery.Client() + + view_id = "my-project.my_dataset.my_materialized_view" + # [END bigquery_update_materialized_view] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_id = override_values.get("view_id", view_id) + # [START bigquery_update_materialized_view] + view = bigquery.Table(view_id) + view.mview_enable_refresh = True + view.mview_refresh_interval = datetime.timedelta(hours=1) + + # Make an API request to update the materialized view. + view = bigquery_client.update_table( + view, + # Pass in a list of any fields you need to modify. + ["mview_enable_refresh", "mview_refresh_interval"], + ) + print(f"Updated {view.table_type}: {str(view.reference)}") + # [END bigquery_update_materialized_view] + return view + + +def delete_materialized_view(override_values={}): + # [START bigquery_delete_materialized_view] + from google.cloud import bigquery + + bigquery_client = bigquery.Client() + + view_id = "my-project.my_dataset.my_materialized_view" + # [END bigquery_delete_materialized_view] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_id = override_values.get("view_id", view_id) + # [START bigquery_delete_materialized_view] + # Make an API request to delete the materialized view. 
+ bigquery_client.delete_table(view_id) + # [END bigquery_delete_materialized_view] diff --git a/samples/snippets/materialized_view_test.py b/samples/snippets/materialized_view_test.py new file mode 100644 index 000000000..75c6b2106 --- /dev/null +++ b/samples/snippets/materialized_view_test.py @@ -0,0 +1,83 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import uuid + +from google.api_core import exceptions +from google.cloud import bigquery +import pytest + +import materialized_view + + +def temp_suffix(): + now = datetime.datetime.now() + return f"{now.strftime('%Y%m%d%H%M%S')}_{uuid.uuid4().hex[:8]}" + + +@pytest.fixture(autouse=True) +def bigquery_client_patch(monkeypatch, bigquery_client): + monkeypatch.setattr(bigquery, "Client", lambda: bigquery_client) + + +@pytest.fixture(scope="module") +def dataset_id(bigquery_client): + dataset_id = f"mvdataset_{temp_suffix()}" + bigquery_client.create_dataset(dataset_id) + yield dataset_id + bigquery_client.delete_dataset(dataset_id, delete_contents=True) + + +@pytest.fixture(scope="module") +def base_table_id(bigquery_client, project_id, dataset_id): + base_table_id = f"{project_id}.{dataset_id}.base_{temp_suffix()}" + # Schema from materialized views guide: + # https://cloud.google.com/bigquery/docs/materialized-views#create + base_table = bigquery.Table(base_table_id) + base_table.schema = [ + bigquery.SchemaField("product_id", bigquery.SqlTypeNames.INT64), + 
bigquery.SchemaField("clicks", bigquery.SqlTypeNames.INT64), + ] + bigquery_client.create_table(base_table) + yield base_table_id + bigquery_client.delete_table(base_table_id) + + +@pytest.fixture(scope="module") +def view_id(bigquery_client, project_id, dataset_id): + view_id = f"{project_id}.{dataset_id}.mview_{temp_suffix()}" + yield view_id + bigquery_client.delete_table(view_id, not_found_ok=True) + + +def test_materialized_view(capsys, bigquery_client, base_table_id, view_id): + override_values = { + "base_table_id": base_table_id, + "view_id": view_id, + } + view = materialized_view.create_materialized_view(override_values) + assert base_table_id in view.mview_query + out, _ = capsys.readouterr() + assert view_id in out + + view = materialized_view.update_materialized_view(override_values) + assert view.mview_enable_refresh + assert view.mview_refresh_interval == datetime.timedelta(hours=1) + out, _ = capsys.readouterr() + assert view_id in out + + materialized_view.delete_materialized_view(override_values) + with pytest.raises(exceptions.NotFound): + bigquery_client.get_table(view_id) diff --git a/samples/snippets/natality_tutorial.py b/samples/snippets/natality_tutorial.py index b2b607b0d..a8d90501a 100644 --- a/samples/snippets/natality_tutorial.py +++ b/samples/snippets/natality_tutorial.py @@ -38,13 +38,15 @@ def run_natality_tutorial(override_values={}): # Prepare a reference to a new dataset for storing the query results. dataset_id = "natality_regression" + dataset_id_full = "{}.{}".format(client.project, dataset_id) # [END bigquery_query_natality_tutorial] # To facilitate testing, we replace values with alternatives # provided by the testing harness. dataset_id = override_values.get("dataset_id", dataset_id) + dataset_id_full = "{}.{}".format(client.project, dataset_id) # [START bigquery_query_natality_tutorial] - dataset = bigquery.Dataset(client.dataset(dataset_id)) + dataset = bigquery.Dataset(dataset_id_full) # Create the new BigQuery dataset. 
dataset = client.create_dataset(dataset) diff --git a/samples/snippets/natality_tutorial_test.py b/samples/snippets/natality_tutorial_test.py index fae72fa46..d9c89bef2 100644 --- a/samples/snippets/natality_tutorial_test.py +++ b/samples/snippets/natality_tutorial_test.py @@ -43,8 +43,8 @@ def test_natality_tutorial(client, datasets_to_delete): natality_tutorial.run_natality_tutorial(override_values) - table_ref = bigquery.Dataset(client.dataset(override_values["dataset_id"])).table( - "regression_input" + table_ref = "{}.{}.{}".format( + client.project, override_values["dataset_id"], "regression_input" ) table = client.get_table(table_ref) assert table.num_rows > 0 diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 5660f08be..b008613f0 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -17,6 +17,7 @@ import os from pathlib import Path import sys +from typing import Callable, Dict, List, Optional import nox @@ -27,8 +28,9 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -# Copy `noxfile_config.py` to your directory and modify it instead. +BLACK_VERSION = "black==19.10b0" +# Copy `noxfile_config.py` to your directory and modify it instead. # `TEST_CONFIG` dict is a configuration hook that allows users to # modify the test configurations. The values here should be in sync @@ -37,13 +39,20 @@ TEST_CONFIG = { # You can opt out from the test for specific Python versions. - "ignored_versions": ["2.7"], + "ignored_versions": [], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": False, # An envvar key for determining the project id to use. Change it # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a # build specific Cloud project. You can also use your own string # to use your own Cloud project. 
"gcloud_project_env": "GOOGLE_CLOUD_PROJECT", # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, # A dictionary you want to inject into your test. Don't put any # secrets here. These values will override predefined values. "envs": {}, @@ -62,7 +71,7 @@ TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) -def get_pytest_env_vars(): +def get_pytest_env_vars() -> Dict[str, str]: """Returns a dict for pytest invocation.""" ret = {} @@ -77,21 +86,24 @@ def get_pytest_env_vars(): # DO NOT EDIT - automatically generated. -# All versions used to tested samples. -ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] +# All versions used to test samples. +ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) -INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in ( + "True", + "true", +) # # Style Checks # -def _determine_local_import_names(start_dir): +def _determine_local_import_names(start_dir: str) -> List[str]: """Determines all import names that should be considered "local". 
This is used when running the linter to insure that import order is @@ -129,8 +141,11 @@ def _determine_local_import_names(start_dir): @nox.session -def lint(session): - session.install("flake8", "flake8-import-order") +def lint(session: nox.sessions.Session) -> None: + if not TEST_CONFIG["enforce_type_hints"]: + session.install("flake8", "flake8-import-order") + else: + session.install("flake8", "flake8-import-order", "flake8-annotations") local_names = _determine_local_import_names(".") args = FLAKE8_COMMON_ARGS + [ @@ -141,6 +156,19 @@ def lint(session): session.run("flake8", *args) +# +# Black +# + + +@nox.session +def blacken(session: nox.sessions.Session) -> None: + session.install(BLACK_VERSION) + python_files = [path for path in os.listdir(".") if path.endswith(".py")] + + session.run("black", *python_files) + + # # Sample Tests # @@ -149,13 +177,24 @@ def lint(session): PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] -def _session_tests(session, post_install=None): +def _session_tests( + session: nox.sessions.Session, post_install: Callable = None +) -> None: + if TEST_CONFIG["pip_version_override"]: + pip_version = TEST_CONFIG["pip_version_override"] + session.install(f"pip=={pip_version}") """Runs py.test for a particular project.""" if os.path.exists("requirements.txt"): - session.install("-r", "requirements.txt") + if os.path.exists("constraints.txt"): + session.install("-r", "requirements.txt", "-c", "constraints.txt") + else: + session.install("-r", "requirements.txt") if os.path.exists("requirements-test.txt"): - session.install("-r", "requirements-test.txt") + if os.path.exists("constraints-test.txt"): + session.install("-r", "requirements-test.txt", "-c", "constraints-test.txt") + else: + session.install("-r", "requirements-test.txt") if INSTALL_LIBRARY_FROM_SOURCE: session.install("-e", _get_repo_root()) @@ -170,12 +209,12 @@ def _session_tests(session, post_install=None): # on travis where slow and flaky tests are excluded. 
# See http://doc.pytest.org/en/latest/_modules/_pytest/main.html success_codes=[0, 5], - env=get_pytest_env_vars() + env=get_pytest_env_vars(), ) @nox.session(python=ALL_VERSIONS) -def py(session): +def py(session: nox.sessions.Session) -> None: """Runs py.test for a sample using the specified version of Python.""" if session.python in TESTED_VERSIONS: _session_tests(session) @@ -190,7 +229,7 @@ def py(session): # -def _get_repo_root(): +def _get_repo_root() -> Optional[str]: """ Returns the root folder of the project. """ # Get root of this repository. Assume we don't have directories nested deeper than 10 items. p = Path(os.getcwd()) @@ -199,6 +238,11 @@ def _get_repo_root(): break if Path(p / ".git").exists(): return str(p) + # .git is not available in repos cloned via Cloud Build + # setup.py is always in the library's root, so use that instead + # https://github.com/googleapis/synthtool/issues/792 + if Path(p / "setup.py").exists(): + return str(p) p = p.parent raise Exception("Unable to detect repository root.") @@ -208,7 +252,7 @@ def _get_repo_root(): @nox.session @nox.parametrize("path", GENERATED_READMES) -def readmegen(session, path): +def readmegen(session: nox.sessions.Session, path: str) -> None: """(Re-)generates the readme for a sample.""" session.install("jinja2", "pyyaml") dir_ = os.path.dirname(path) diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index 56d6fd843..1b0ef5b3a 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -33,8 +33,8 @@ def run_quickstart(override_values={}): # [START bigquery_quickstart] # Prepares a reference to the new dataset - dataset_ref = bigquery_client.dataset(dataset_id) - dataset = bigquery.Dataset(dataset_ref) + dataset_id_full = "{}.{}".format(bigquery_client.project, dataset_id) + dataset = bigquery.Dataset(dataset_id_full) # Creates the new dataset dataset = bigquery_client.create_dataset(dataset) diff --git a/samples/snippets/requirements-test.txt 
b/samples/snippets/requirements-test.txt index 676ff949e..b8dee50d0 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,2 +1,3 @@ -pytest==5.4.3 -mock==4.0.2 +google-cloud-testutils==1.0.0 +pytest==6.2.4 +mock==4.0.3 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 7fe839119..d75c747fb 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,6 +1,12 @@ -google-cloud-bigquery[pandas,bqstorage,pyarrow]==1.26.1 -google-auth-oauthlib==0.4.1 +google-cloud-bigquery==2.25.0 +google-cloud-bigquery-storage==2.6.3 +google-auth-oauthlib==0.4.5 +grpcio==1.39.0 ipython==7.16.1; python_version < '3.7' ipython==7.17.0; python_version >= '3.7' -matplotlib==3.3.1 -pytz==2020.1 +matplotlib==3.3.4; python_version < '3.7' +matplotlib==3.4.1; python_version >= '3.7' +pandas==1.1.5; python_version < '3.7' +pandas==1.3.2; python_version >= '3.7' +pyarrow==5.0.0 +pytz==2021.1 diff --git a/samples/snippets/test_update_with_dml.py b/samples/snippets/test_update_with_dml.py new file mode 100644 index 000000000..912fd76e2 --- /dev/null +++ b/samples/snippets/test_update_with_dml.py @@ -0,0 +1,36 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from google.cloud import bigquery +import pytest + +from conftest import prefixer +import update_with_dml + + +@pytest.fixture +def table_id(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + table_id = f"{prefixer.create_prefix()}_update_with_dml" + yield table_id + full_table_id = f"{project_id}.{dataset_id}.{table_id}" + bigquery_client.delete_table(full_table_id, not_found_ok=True) + + +def test_update_with_dml(bigquery_client_patch, dataset_id, table_id): + override_values = { + "dataset_id": dataset_id, + "table_id": table_id, + } + num_rows = update_with_dml.run_sample(override_values=override_values) + assert num_rows > 0 diff --git a/samples/snippets/update_with_dml.py b/samples/snippets/update_with_dml.py new file mode 100644 index 000000000..7fd09dd80 --- /dev/null +++ b/samples/snippets/update_with_dml.py @@ -0,0 +1,82 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START bigquery_update_with_dml] +import pathlib + +from google.cloud import bigquery +from google.cloud.bigquery import enums + + +def load_from_newline_delimited_json( + client: bigquery.Client, + filepath: pathlib.Path, + project_id: str, + dataset_id: str, + table_id: str, +): + full_table_id = f"{project_id}.{dataset_id}.{table_id}" + job_config = bigquery.LoadJobConfig() + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + job_config.schema = [ + bigquery.SchemaField("id", enums.SqlTypeNames.STRING), + bigquery.SchemaField("user_id", enums.SqlTypeNames.INTEGER), + bigquery.SchemaField("login_time", enums.SqlTypeNames.TIMESTAMP), + bigquery.SchemaField("logout_time", enums.SqlTypeNames.TIMESTAMP), + bigquery.SchemaField("ip_address", enums.SqlTypeNames.STRING), + ] + + with open(filepath, "rb") as json_file: + load_job = client.load_table_from_file( + json_file, full_table_id, job_config=job_config + ) + + # Wait for load job to finish. + load_job.result() + + +def update_with_dml( + client: bigquery.Client, project_id: str, dataset_id: str, table_id: str +): + query_text = f""" + UPDATE `{project_id}.{dataset_id}.{table_id}` + SET ip_address = REGEXP_REPLACE(ip_address, r"(\\.[0-9]+)$", ".0") + WHERE TRUE + """ + query_job = client.query(query_text) + + # Wait for query job to finish. + query_job.result() + + print(f"DML query modified {query_job.num_dml_affected_rows} rows.") + return query_job.num_dml_affected_rows + + +def run_sample(override_values={}): + client = bigquery.Client() + filepath = pathlib.Path(__file__).parent / "user_sessions_data.json" + project_id = client.project + dataset_id = "sample_db" + table_id = "UserSessions" + # [END bigquery_update_with_dml] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. 
+ dataset_id = override_values.get("dataset_id", dataset_id) + table_id = override_values.get("table_id", table_id) + # [START bigquery_update_with_dml] + load_from_newline_delimited_json(client, filepath, project_id, dataset_id, table_id) + return update_with_dml(client, project_id, dataset_id, table_id) + + +# [END bigquery_update_with_dml] diff --git a/samples/snippets/user_sessions_data.json b/samples/snippets/user_sessions_data.json new file mode 100644 index 000000000..7ea3715ad --- /dev/null +++ b/samples/snippets/user_sessions_data.json @@ -0,0 +1,10 @@ +{"id":"2ad525d6-c832-4c3d-b7fe-59d104885519","user_id":"38","login_time":"1.47766087E9","logout_time":"1.477661109E9","ip_address":"192.0.2.12"} +{"id":"53d65e20-6ea9-4650-98d9-a2111fbd1122","user_id":"88","login_time":"1.47707544E9","logout_time":"1.477075519E9","ip_address":"192.0.2.88"} +{"id":"5e6c3021-d5e7-4ccd-84b2-adfa9176d13d","user_id":"39","login_time":"1.474022869E9","logout_time":"1.474022961E9","ip_address":"203.0.113.52"} +{"id":"6196eefa-1498-4567-8ef0-498845b888d9","user_id":"52","login_time":"1.478604612E9","logout_time":"1.478604691E9","ip_address":"203.0.113.169"} +{"id":"70656dc5-7e0f-49cf-9e00-f06ed93c1f5b","user_id":"46","login_time":"1.474089924E9","logout_time":"1.474090227E9","ip_address":"192.0.2.10"} +{"id":"aafa5eef-ad49-49a7-9a0f-fbc7fd639bd3","user_id":"40","login_time":"1.478031161E9","logout_time":"1.478031388E9","ip_address":"203.0.113.18"} +{"id":"d2792fc2-24dd-4260-9456-3fbe6cdfdd90","user_id":"5","login_time":"1.481259081E9","logout_time":"1.481259247E9","ip_address":"192.0.2.140"} +{"id":"d835dc49-32f9-4790-b4eb-dddee62e0dcc","user_id":"62","login_time":"1.478892977E9","logout_time":"1.478893219E9","ip_address":"203.0.113.83"} +{"id":"f4a0d3c7-351f-471c-8e11-e093e7a6ce75","user_id":"89","login_time":"1.459031555E9","logout_time":"1.459031831E9","ip_address":"203.0.113.233"} 
+{"id":"f6e9f526-5b22-4679-9c3e-56a636e815bb","user_id":"97","login_time":"1.482426034E9","logout_time":"1.482426415E9","ip_address":"203.0.113.167"} diff --git a/samples/snippets/view.py b/samples/snippets/view.py new file mode 100644 index 000000000..ad3f11717 --- /dev/null +++ b/samples/snippets/view.py @@ -0,0 +1,164 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def create_view(override_values={}): + # [START bigquery_create_view] + from google.cloud import bigquery + + client = bigquery.Client() + + view_id = "my-project.my_dataset.my_view" + source_id = "my-project.my_dataset.my_table" + # [END bigquery_create_view] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_id = override_values.get("view_id", view_id) + source_id = override_values.get("source_id", source_id) + # [START bigquery_create_view] + view = bigquery.Table(view_id) + + # The source table in this example is created from a CSV file in Google + # Cloud Storage located at + # `gs://cloud-samples-data/bigquery/us-states/us-states.csv`. It contains + # 50 US states, while the view returns only those states with names + # starting with the letter 'W'. + view.view_query = f"SELECT name, post_abbr FROM `{source_id}` WHERE name LIKE 'W%'" + + # Make an API request to create the view. 
+ view = client.create_table(view) + print(f"Created {view.table_type}: {str(view.reference)}") + # [END bigquery_create_view] + return view + + +def get_view(override_values={}): + # [START bigquery_get_view] + from google.cloud import bigquery + + client = bigquery.Client() + + view_id = "my-project.my_dataset.my_view" + # [END bigquery_get_view] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_id = override_values.get("view_id", view_id) + # [START bigquery_get_view] + # Make an API request to get the table resource. + view = client.get_table(view_id) + + # Display view properties + print(f"Retrieved {view.table_type}: {str(view.reference)}") + print(f"View Query:\n{view.view_query}") + # [END bigquery_get_view] + return view + + +def update_view(override_values={}): + # [START bigquery_update_view_query] + from google.cloud import bigquery + + client = bigquery.Client() + + view_id = "my-project.my_dataset.my_view" + source_id = "my-project.my_dataset.my_table" + # [END bigquery_update_view_query] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_id = override_values.get("view_id", view_id) + source_id = override_values.get("source_id", source_id) + # [START bigquery_update_view_query] + view = bigquery.Table(view_id) + + # The source table in this example is created from a CSV file in Google + # Cloud Storage located at + # `gs://cloud-samples-data/bigquery/us-states/us-states.csv`. It contains + # 50 US states, while the view returns only those states with names + # starting with the letter 'M'. + view.view_query = f"SELECT name, post_abbr FROM `{source_id}` WHERE name LIKE 'M%'" + + # Make an API request to update the query property of the view. 
+ view = client.update_table(view, ["view_query"]) + print(f"Updated {view.table_type}: {str(view.reference)}") + # [END bigquery_update_view_query] + return view + + +def grant_access(override_values={}): + # [START bigquery_grant_view_access] + from google.cloud import bigquery + + client = bigquery.Client() + + # To use a view, the analyst requires ACLs to both the view and the source + # table. Create an authorized view to allow an analyst to use a view + # without direct access permissions to the source table. + view_dataset_id = "my-project.my_view_dataset" + # [END bigquery_grant_view_access] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_dataset_id = override_values.get("view_dataset_id", view_dataset_id) + # [START bigquery_grant_view_access] + # Make an API request to get the view dataset ACLs. + view_dataset = client.get_dataset(view_dataset_id) + + analyst_group_email = "data_analysts@example.com" + # [END bigquery_grant_view_access] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + analyst_group_email = override_values.get( + "analyst_group_email", analyst_group_email + ) + # [START bigquery_grant_view_access] + access_entries = view_dataset.access_entries + access_entries.append( + bigquery.AccessEntry("READER", "groupByEmail", analyst_group_email) + ) + view_dataset.access_entries = access_entries + + # Make an API request to update the ACLs property of the view dataset. + view_dataset = client.update_dataset(view_dataset, ["access_entries"]) + print(f"Access to view: {view_dataset.access_entries}") + + # Group members of "data_analysts@example.com" now have access to the view, + # but they require access to the source table to use it. To remove this + # restriction, authorize the view to access the source dataset. 
+ source_dataset_id = "my-project.my_source_dataset" + # [END bigquery_grant_view_access] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + source_dataset_id = override_values.get("source_dataset_id", source_dataset_id) + # [START bigquery_grant_view_access] + # Make an API request to set the source dataset ACLs. + source_dataset = client.get_dataset(source_dataset_id) + + view_reference = { + "projectId": "my-project", + "datasetId": "my_view_dataset", + "tableId": "my_authorized_view", + } + # [END bigquery_grant_view_access] + # To facilitate testing, we replace values with alternatives + # provided by the testing harness. + view_reference = override_values.get("view_reference", view_reference) + # [START bigquery_grant_view_access] + access_entries = source_dataset.access_entries + access_entries.append(bigquery.AccessEntry(None, "view", view_reference)) + source_dataset.access_entries = access_entries + + # Make an API request to update the ACLs property of the source dataset. + source_dataset = client.update_dataset(source_dataset, ["access_entries"]) + print(f"Access to source: {source_dataset.access_entries}") + # [END bigquery_grant_view_access] + return view_dataset, source_dataset diff --git a/samples/snippets/view_test.py b/samples/snippets/view_test.py new file mode 100644 index 000000000..77105b61a --- /dev/null +++ b/samples/snippets/view_test.py @@ -0,0 +1,117 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import uuid + +from google.cloud import bigquery +import pytest + +import view + + +def temp_suffix(): + now = datetime.datetime.now() + return f"{now.strftime('%Y%m%d%H%M%S')}_{uuid.uuid4().hex[:8]}" + + +@pytest.fixture(autouse=True) +def bigquery_client_patch(monkeypatch, bigquery_client): + monkeypatch.setattr(bigquery, "Client", lambda: bigquery_client) + + +@pytest.fixture(scope="module") +def view_dataset_id(bigquery_client, project_id): + dataset_id = f"{project_id}.view_{temp_suffix()}" + bigquery_client.create_dataset(dataset_id) + yield dataset_id + bigquery_client.delete_dataset(dataset_id, delete_contents=True) + + +@pytest.fixture(scope="module") +def view_id(bigquery_client, view_dataset_id): + view_id = f"{view_dataset_id}.my_view" + yield view_id + bigquery_client.delete_table(view_id, not_found_ok=True) + + +@pytest.fixture(scope="module") +def source_dataset_id(bigquery_client, project_id): + dataset_id = f"{project_id}.view_{temp_suffix()}" + bigquery_client.create_dataset(dataset_id) + yield dataset_id + bigquery_client.delete_dataset(dataset_id, delete_contents=True) + + +@pytest.fixture(scope="module") +def source_table_id(bigquery_client, source_dataset_id): + source_table_id = f"{source_dataset_id}.us_states" + job_config = bigquery.LoadJobConfig( + schema=[ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("post_abbr", "STRING"), + ], + skip_leading_rows=1, + ) + load_job = bigquery_client.load_table_from_uri( + "gs://cloud-samples-data/bigquery/us-states/us-states.csv", + source_table_id, + job_config=job_config, + ) + load_job.result() + yield source_table_id + bigquery_client.delete_table(source_table_id, not_found_ok=True) + + +def test_view(capsys, view_id, view_dataset_id, source_table_id, source_dataset_id): + override_values = { + "view_id": view_id, + "source_id": source_table_id, + } + got = 
view.create_view(override_values) + assert source_table_id in got.view_query + out, _ = capsys.readouterr() + assert view_id in out + + got = view.get_view(override_values) + assert source_table_id in got.view_query + assert "'W%'" in got.view_query + out, _ = capsys.readouterr() + assert view_id in out + assert source_table_id in out + assert "'W%'" in out + + got = view.update_view(override_values) + assert source_table_id in got.view_query + assert "'M%'" in got.view_query + out, _ = capsys.readouterr() + assert view_id in out + + project_id, dataset_id, table_id = view_id.split(".") + override_values = { + "analyst_group_email": "cloud-dpes-bigquery@google.com", + "view_dataset_id": view_dataset_id, + "source_dataset_id": source_dataset_id, + "view_reference": { + "projectId": project_id, + "datasetId": dataset_id, + "tableId": table_id, + }, + } + view_dataset, source_dataset = view.grant_access(override_values) + assert len(view_dataset.access_entries) != 0 + assert len(source_dataset.access_entries) != 0 + out, _ = capsys.readouterr() + assert "cloud-dpes-bigquery@google.com" in out + assert table_id in out diff --git a/samples/tests/conftest.py b/samples/tests/conftest.py index d80085dd3..0fdacaaec 100644 --- a/samples/tests/conftest.py +++ b/samples/tests/conftest.py @@ -126,7 +126,7 @@ def routine_id(client, dataset_id): bigquery.RoutineArgument( name="x", data_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ), ) ] diff --git a/samples/tests/test_client_query_destination_table_clustered.py b/samples/tests/test_client_query_destination_table_clustered.py new file mode 100644 index 000000000..b4bdd588c --- /dev/null +++ b/samples/tests/test_client_query_destination_table_clustered.py @@ -0,0 +1,27 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except 
in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import client_query_destination_table_clustered + + +def test_client_query_destination_table_clustered(capsys, random_table_id): + + client_query_destination_table_clustered.client_query_destination_table_clustered( + random_table_id + ) + out, err = capsys.readouterr() + assert ( + "The destination table is written using the cluster_fields configuration." + in out + ) diff --git a/samples/tests/test_copy_table_multiple_source.py b/samples/tests/test_copy_table_multiple_source.py index 45c6d34f5..5bc4668b0 100644 --- a/samples/tests/test_copy_table_multiple_source.py +++ b/samples/tests/test_copy_table_multiple_source.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import six +import io from google.cloud import bigquery from .. 
import copy_table_multiple_source @@ -32,7 +32,7 @@ def test_copy_table_multiple_source(capsys, random_table_id, random_dataset_id, bigquery.SchemaField("post_abbr", "STRING"), ] ) - body = six.BytesIO(data) + body = io.BytesIO(data) client.load_table_from_file( body, table_ref, location="US", job_config=job_config ).result() diff --git a/samples/tests/test_create_table_clustered.py b/samples/tests/test_create_table_clustered.py new file mode 100644 index 000000000..8eab5d48b --- /dev/null +++ b/samples/tests/test_create_table_clustered.py @@ -0,0 +1,22 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import create_table_clustered + + +def test_create_table_clustered(capsys, random_table_id): + table = create_table_clustered.create_table_clustered(random_table_id) + out, _ = capsys.readouterr() + assert "Created clustered table {}".format(random_table_id) in out + assert table.clustering_fields == ["city", "zipcode"] diff --git a/samples/tests/test_load_table_clustered.py b/samples/tests/test_load_table_clustered.py new file mode 100644 index 000000000..bafdc2051 --- /dev/null +++ b/samples/tests/test_load_table_clustered.py @@ -0,0 +1,27 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import load_table_clustered + + +def test_load_table_clustered(capsys, random_table_id, client): + + table = load_table_clustered.load_table_clustered(random_table_id) + + out, _ = capsys.readouterr() + assert "rows and 4 columns" in out + + rows = list(client.list_rows(table)) # Make an API request. + assert len(rows) > 0 + assert table.clustering_fields == ["origin", "destination"] diff --git a/samples/tests/test_routine_samples.py b/samples/tests/test_routine_samples.py index a4467c59a..59ec1fae9 100644 --- a/samples/tests/test_routine_samples.py +++ b/samples/tests/test_routine_samples.py @@ -39,21 +39,21 @@ def test_create_routine_ddl(capsys, random_routine_id, client): bigquery.RoutineArgument( name="arr", data_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.ARRAY, + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.ARRAY, array_element_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.STRUCT, + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.STRUCT, struct_type=bigquery_v2.types.StandardSqlStructType( fields=[ bigquery_v2.types.StandardSqlField( name="name", type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.STRING + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.STRING ), ), bigquery_v2.types.StandardSqlField( name="val", type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + 
type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ), ), ] diff --git a/scripts/decrypt-secrets.sh b/scripts/decrypt-secrets.sh index ff599eb2a..21f6d2a26 100755 --- a/scripts/decrypt-secrets.sh +++ b/scripts/decrypt-secrets.sh @@ -20,14 +20,27 @@ ROOT=$( dirname "$DIR" ) # Work from the project root. cd $ROOT +# Prevent it from overriding files. +# We recommend that sample authors use their own service account files and cloud project. +# In that case, they are supposed to prepare these files by themselves. +if [[ -f "testing/test-env.sh" ]] || \ + [[ -f "testing/service-account.json" ]] || \ + [[ -f "testing/client-secrets.json" ]]; then + echo "One or more target files exist, aborting." + exit 1 +fi + # Use SECRET_MANAGER_PROJECT if set, fallback to cloud-devrel-kokoro-resources. PROJECT_ID="${SECRET_MANAGER_PROJECT:-cloud-devrel-kokoro-resources}" gcloud secrets versions access latest --secret="python-docs-samples-test-env" \ + --project="${PROJECT_ID}" \ > testing/test-env.sh gcloud secrets versions access latest \ --secret="python-docs-samples-service-account" \ + --project="${PROJECT_ID}" \ > testing/service-account.json gcloud secrets versions access latest \ --secret="python-docs-samples-client-secrets" \ - > testing/client-secrets.json \ No newline at end of file + --project="${PROJECT_ID}" \ + > testing/client-secrets.json diff --git a/scripts/readme-gen/templates/install_deps.tmpl.rst b/scripts/readme-gen/templates/install_deps.tmpl.rst index a0406dba8..275d64989 100644 --- a/scripts/readme-gen/templates/install_deps.tmpl.rst +++ b/scripts/readme-gen/templates/install_deps.tmpl.rst @@ -12,7 +12,7 @@ Install Dependencies .. _Python Development Environment Setup Guide: https://cloud.google.com/python/setup -#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+. +#. Create a virtualenv. Samples are compatible with Python 3.6+. .. 
code-block:: bash diff --git a/setup.cfg b/setup.cfg index c3a2b39f6..8eefc4435 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,3 +17,17 @@ # Generated by synthtool. DO NOT EDIT! [bdist_wheel] universal = 1 + +[pytype] +python_version = 3.8 +inputs = + google/cloud/ +exclude = + tests/ + google/cloud/bigquery_v2/ +output = .pytype/ +disable = + # There's some issue with finding some pyi files, thus disabling. + # The issue https://github.com/google/pytype/issues/150 is closed, but the + # error still occurs for some reason. + pyi-error diff --git a/setup.py b/setup.py index 73d9a03ca..e7515493d 100644 --- a/setup.py +++ b/setup.py @@ -22,63 +22,58 @@ name = "google-cloud-bigquery" description = "Google BigQuery API client library" -version = "1.28.0" + # Should be one of: # 'Development Status :: 3 - Alpha' # 'Development Status :: 4 - Beta' # 'Development Status :: 5 - Production/Stable' release_status = "Development Status :: 5 - Production/Stable" dependencies = [ - 'enum34; python_version < "3.4"', - "google-api-core >= 1.21.0, < 2.0dev", - "google-cloud-core >= 1.4.1, < 2.0dev", - "google-resumable-media >= 0.6.0, < 2.0dev", - "six >=1.13.0,< 2.0.0dev", + "grpcio >= 1.38.1, < 2.0dev", # https://github.com/googleapis/python-bigquery/issues/695 + # NOTE: Maintainers, please do not require google-api-core>=2.x.x + # Until this issue is closed + # https://github.com/googleapis/google-cloud-python/issues/10566 + "google-api-core[grpc] >= 1.29.0, <3.0.0dev", + "proto-plus >= 1.10.0", + # NOTE: Maintainers, please do not require google-cloud-core>=2.x.x + # Until this issue is closed + # https://github.com/googleapis/google-cloud-python/issues/10566 + "google-cloud-core >= 1.4.1, <3.0.0dev", + "google-resumable-media >= 0.6.0, < 3.0dev", + "packaging >= 14.3", + "protobuf >= 3.12.0", + "requests >= 2.18.0, < 3.0.0dev", ] extras = { "bqstorage": [ - "google-cloud-bigquery-storage >= 1.0.0, <2.0.0dev", + "google-cloud-bigquery-storage >= 2.0.0, <3.0.0dev", # Due to an 
issue in pip's dependency resolver, the `grpc` extra is not # installed, even though `google-cloud-bigquery-storage` specifies it # as `google-api-core[grpc]`. We thus need to explicitly specify it here. # See: https://github.com/googleapis/python-bigquery/issues/83 The # grpc.Channel.close() method isn't added until 1.32.0. # https://github.com/grpc/grpc/pull/15254 - "grpcio >= 1.32.0, < 2.0dev", - "pyarrow >= 1.0.0, < 2.0dev", - ], - "pandas": ["pandas>=0.23.0"], - "pyarrow": [ - # pyarrow 1.0.0 is required for the use of timestamp_as_object keyword. - "pyarrow >= 1.0.0, < 2.0de ; python_version>='3.5'", - "pyarrow >= 0.16.0, < 0.17.0dev ; python_version<'3.5'", + "grpcio >= 1.38.1, < 2.0dev", + "pyarrow >= 3.0.0, < 6.0dev", ], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], + "pandas": ["pandas>=0.23.0", "pyarrow >= 3.0.0, < 6.0dev"], + "bignumeric_type": ["pyarrow >= 3.0.0, < 6.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], - "fastparquet": [ - "fastparquet", - "python-snappy", - # llvmlite >= 0.32.0 cannot be installed on Python 3.5 and below - # (building the wheel fails), thus needs to be restricted. - # See: https://github.com/googleapis/python-bigquery/issues/78 - "llvmlite<=0.34.0;python_version>='3.6'", - "llvmlite<=0.31.0;python_version<'3.6'", - ], "opentelemetry": [ - "opentelemetry-api==0.9b0", - "opentelemetry-sdk==0.9b0", - "opentelemetry-instrumentation==0.9b0 ", + "opentelemetry-api >= 0.11b0", + "opentelemetry-sdk >= 0.11b0", + "opentelemetry-instrumentation >= 0.11b0", ], } all_extras = [] for extra in extras: - if extra in ( - # Skip fastparquet from "all" because it is redundant with pyarrow and - # creates a dependency on pre-release versions of numpy. See: - # https://github.com/googleapis/google-cloud-python/issues/8549 - "fastparquet", - ): + # Exclude this extra from all to avoid overly strict dependencies on core + # libraries such as pyarrow. 
+ # https://github.com/googleapis/python-bigquery/issues/563 + if extra in {"bignumeric_type"}: continue all_extras.extend(extras[extra]) @@ -92,10 +87,17 @@ with io.open(readme_filename, encoding="utf-8") as readme_file: readme = readme_file.read() +version = {} +with open(os.path.join(package_root, "google/cloud/bigquery/version.py")) as fp: + exec(fp.read(), version) +version = version["__version__"] + # Only include packages under the 'google' namespace. Do not include tests, # benchmarks, etc. packages = [ - package for package in setuptools.find_packages() if package.startswith("google") + package + for package in setuptools.PEP420PackageFinder.find() + if package.startswith("google") ] # Determine which namespaces are needed. @@ -118,13 +120,11 @@ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Operating System :: OS Independent", "Topic :: Internet", ], @@ -133,7 +133,7 @@ namespace_packages=namespaces, install_requires=dependencies, extras_require=extras, - python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*", + python_requires=">=3.6, <3.10", include_package_data=True, zip_safe=False, ) diff --git a/synth.metadata b/synth.metadata deleted file mode 100644 index efee17785..000000000 --- a/synth.metadata +++ /dev/null @@ -1,127 +0,0 @@ -{ - "sources": [ - { - "git": { - "name": ".", - "remote": "https://github.com/googleapis/python-bigquery.git", - "sha": "a125160696d1453b04a66c967819f90e70e03a52" - } - }, - { - "git": { - "name": "googleapis", - "remote": "https://github.com/googleapis/googleapis.git", - "sha": 
"868615a5c1c1059c636bb3d82a555edb1d5a251e", - "internalRef": "324294521" - } - }, - { - "git": { - "name": "synthtool", - "remote": "https://github.com/googleapis/synthtool.git", - "sha": "32c758f11b8c578f515a746c9d263b82a615a77c" - } - }, - { - "git": { - "name": "synthtool", - "remote": "https://github.com/googleapis/synthtool.git", - "sha": "32c758f11b8c578f515a746c9d263b82a615a77c" - } - } - ], - "destinations": [ - { - "client": { - "source": "googleapis", - "apiName": "bigquery", - "apiVersion": "v2", - "language": "python", - "generator": "bazel" - } - } - ], - "generatedFiles": [ - ".coveragerc", - ".flake8", - ".github/CONTRIBUTING.md", - ".github/ISSUE_TEMPLATE/bug_report.md", - ".github/ISSUE_TEMPLATE/feature_request.md", - ".github/ISSUE_TEMPLATE/support_request.md", - ".github/PULL_REQUEST_TEMPLATE.md", - ".github/release-please.yml", - ".github/snippet-bot.yml", - ".gitignore", - ".kokoro/build.sh", - ".kokoro/continuous/common.cfg", - ".kokoro/continuous/continuous.cfg", - ".kokoro/docker/docs/Dockerfile", - ".kokoro/docker/docs/fetch_gpg_keys.sh", - ".kokoro/docs/common.cfg", - ".kokoro/docs/docs-presubmit.cfg", - ".kokoro/docs/docs.cfg", - ".kokoro/presubmit/common.cfg", - ".kokoro/presubmit/presubmit.cfg", - ".kokoro/presubmit/system-2.7.cfg", - ".kokoro/presubmit/system-3.8.cfg", - ".kokoro/publish-docs.sh", - ".kokoro/release.sh", - ".kokoro/release/common.cfg", - ".kokoro/release/release.cfg", - ".kokoro/samples/lint/common.cfg", - ".kokoro/samples/lint/continuous.cfg", - ".kokoro/samples/lint/periodic.cfg", - ".kokoro/samples/lint/presubmit.cfg", - ".kokoro/samples/python3.6/common.cfg", - ".kokoro/samples/python3.6/continuous.cfg", - ".kokoro/samples/python3.6/periodic.cfg", - ".kokoro/samples/python3.6/presubmit.cfg", - ".kokoro/samples/python3.7/common.cfg", - ".kokoro/samples/python3.7/continuous.cfg", - ".kokoro/samples/python3.7/periodic.cfg", - ".kokoro/samples/python3.7/presubmit.cfg", - ".kokoro/samples/python3.8/common.cfg", - 
".kokoro/samples/python3.8/continuous.cfg", - ".kokoro/samples/python3.8/periodic.cfg", - ".kokoro/samples/python3.8/presubmit.cfg", - ".kokoro/test-samples.sh", - ".kokoro/trampoline.sh", - ".kokoro/trampoline_v2.sh", - ".trampolinerc", - "CODE_OF_CONDUCT.md", - "CONTRIBUTING.rst", - "LICENSE", - "MANIFEST.in", - "docs/_static/custom.css", - "docs/_templates/layout.html", - "docs/conf.py", - "google/cloud/bigquery_v2/gapic/enums.py", - "google/cloud/bigquery_v2/proto/encryption_config.proto", - "google/cloud/bigquery_v2/proto/encryption_config_pb2.py", - "google/cloud/bigquery_v2/proto/encryption_config_pb2_grpc.py", - "google/cloud/bigquery_v2/proto/model.proto", - "google/cloud/bigquery_v2/proto/model_pb2.py", - "google/cloud/bigquery_v2/proto/model_pb2_grpc.py", - "google/cloud/bigquery_v2/proto/model_reference.proto", - "google/cloud/bigquery_v2/proto/model_reference_pb2.py", - "google/cloud/bigquery_v2/proto/model_reference_pb2_grpc.py", - "google/cloud/bigquery_v2/proto/standard_sql.proto", - "google/cloud/bigquery_v2/proto/standard_sql_pb2.py", - "google/cloud/bigquery_v2/proto/standard_sql_pb2_grpc.py", - "google/cloud/bigquery_v2/types.py", - "renovate.json", - "samples/AUTHORING_GUIDE.md", - "samples/CONTRIBUTING.md", - "samples/snippets/README.rst", - "samples/snippets/noxfile.py", - "scripts/decrypt-secrets.sh", - "scripts/readme-gen/readme_gen.py", - "scripts/readme-gen/templates/README.tmpl.rst", - "scripts/readme-gen/templates/auth.tmpl.rst", - "scripts/readme-gen/templates/auth_api_key.tmpl.rst", - "scripts/readme-gen/templates/install_deps.tmpl.rst", - "scripts/readme-gen/templates/install_portaudio.tmpl.rst", - "setup.cfg", - "testing/.gitignore" - ] -} \ No newline at end of file diff --git a/synth.py b/synth.py deleted file mode 100644 index ac20c9aec..000000000 --- a/synth.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in 
compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""This script is used to synthesize generated parts of this library.""" - -import synthtool as s -from synthtool import gcp -from synthtool.languages import python - -gapic = gcp.GAPICBazel() -common = gcp.CommonTemplates() -version = 'v2' - -library = gapic.py_library( - service='bigquery', - version=version, - bazel_target=f"//google/cloud/bigquery/{version}:bigquery-{version}-py", - include_protos=True, -) - -s.move( - [ - library / "google/cloud/bigquery_v2/gapic/enums.py", - library / "google/cloud/bigquery_v2/types.py", - library / "google/cloud/bigquery_v2/proto/location*", - library / "google/cloud/bigquery_v2/proto/encryption_config*", - library / "google/cloud/bigquery_v2/proto/model*", - library / "google/cloud/bigquery_v2/proto/standard_sql*", - ], -) - -# Fix up proto docs that are missing summary line. -s.replace( - "google/cloud/bigquery_v2/proto/model_pb2.py", - '"""Attributes:', - '"""Protocol buffer.\n\n Attributes:', -) -s.replace( - "google/cloud/bigquery_v2/proto/encryption_config_pb2.py", - '"""Attributes:', - '"""Encryption configuration.\n\n Attributes:', -) - -# Remove non-ascii characters from docstrings for Python 2.7. -# Format quoted strings as plain text. 
-s.replace("google/cloud/bigquery_v2/proto/*.py", "[“”]", '``') - -# ---------------------------------------------------------------------------- -# Add templated files -# ---------------------------------------------------------------------------- -templated_files = common.py_library(cov_level=100, samples=True, split_system_tests=True) - -# BigQuery has a custom multiprocessing note -s.move(templated_files, excludes=["noxfile.py", "docs/multiprocessing.rst"]) - -# ---------------------------------------------------------------------------- -# Samples templates -# ---------------------------------------------------------------------------- - -python.py_samples() - - -s.replace( - "docs/conf.py", - r'\{"members": True\}', - '{"members": True, "inherited-members": True}' -) - -s.shell.run(["nox", "-s", "blacken"], hide_output=False) diff --git a/testing/constraints-2.7.txt b/testing/constraints-2.7.txt deleted file mode 100644 index fafbaa27f..000000000 --- a/testing/constraints-2.7.txt +++ /dev/null @@ -1,9 +0,0 @@ -google-api-core==1.21.0 -google-cloud-core==1.4.1 -google-cloud-storage==1.30.0 -google-resumable-media==0.6.0 -ipython==5.5 -pandas==0.23.0 -pyarrow==0.16.0 -six==1.13.0 -tqdm==4.7.4 \ No newline at end of file diff --git a/google/cloud/bigquery_v2/gapic/__init__.py b/testing/constraints-3.10.txt similarity index 100% rename from google/cloud/bigquery_v2/gapic/__init__.py rename to testing/constraints-3.10.txt diff --git a/google/cloud/bigquery_v2/proto/__init__.py b/testing/constraints-3.11.txt similarity index 100% rename from google/cloud/bigquery_v2/proto/__init__.py rename to testing/constraints-3.11.txt diff --git a/testing/constraints-3.5.txt b/testing/constraints-3.5.txt deleted file mode 100644 index a262dbe5f..000000000 --- a/testing/constraints-3.5.txt +++ /dev/null @@ -1,12 +0,0 @@ -google-api-core==1.21.0 -google-cloud-bigquery-storage==1.0.0 -google-cloud-core==1.4.1 -google-resumable-media==0.6.0 -google-cloud-storage==1.30.0 
-grpcio==1.32.0 -ipython==5.5 -# pandas 0.23.0 is the first version to work with pyarrow to_pandas. -pandas==0.23.0 -pyarrow==1.0.0 -six==1.13.0 -tqdm==4.7.4 \ No newline at end of file diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index e69de29bb..be1a992fa 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -0,0 +1,24 @@ +# This constraints file is used to check that lower bounds +# are correct in setup.py +# List *all* library dependencies and extras in this file. +# Pin the version to the lower bound. +# +# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", +# Then this file should have foo==1.14.0 +geopandas==0.9.0 +google-api-core==1.29.0 +google-cloud-bigquery-storage==2.0.0 +google-cloud-core==1.4.1 +google-resumable-media==0.6.0 +grpcio==1.38.1 +opentelemetry-api==0.11b0 +opentelemetry-instrumentation==0.11b0 +opentelemetry-sdk==0.11b0 +pandas==0.24.2 +proto-plus==1.10.0 +protobuf==3.12.0 +pyarrow==3.0.0 +requests==2.18.0 +shapely==1.6.0 +six==1.13.0 +tqdm==4.7.4 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt new file mode 100644 index 000000000..39dc6250e --- /dev/null +++ b/testing/constraints-3.9.txt @@ -0,0 +1,7 @@ +# This constraints file is used to make sure that the latest dependency versions +# we claim to support in setup.py are indeed installed in test sessions in the most +# recent Python version supported (3.9 at the time of writing - 2021-05-05). +# +# NOTE: Not comprehensive yet, will eventually be maintained semi-automatically by +# the renovate bot. +pyarrow>=4.0.0 diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..4de65971c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/tests/data/numeric_38_12.parquet b/tests/data/numeric_38_12.parquet new file mode 100644 index 000000000..ef4db91ea Binary files /dev/null and b/tests/data/numeric_38_12.parquet differ diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl new file mode 100644 index 000000000..e06139e5c --- /dev/null +++ b/tests/data/scalars.jsonl @@ -0,0 +1,2 @@ +{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "interval_col": "P7Y11M9DT4H15M37.123456S", "numeric_col": "1.23456789", "bignumeric_col": "10.111213141516171819", "float64_col": "1.25", "rowindex": 0, "string_col": "Hello, World!", "time_col": "11:41:43.07616", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "interval_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "rowindex": 1, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_extreme.jsonl b/tests/data/scalars_extreme.jsonl new file mode 100644 index 000000000..d0a33fdba --- /dev/null +++ b/tests/data/scalars_extreme.jsonl @@ -0,0 +1,5 @@ +{"bool_col": true, "bytes_col": "DQo=\n", "date_col": "9999-12-31", "datetime_col": "9999-12-31 23:59:59.999999", "geography_col": "POINT(-135.0000 90.0000)", "int64_col": "9223372036854775807", "interval_col": "P-10000Y0M-3660000DT-87840000H0M0S", 
"numeric_col": "9.9999999999999999999999999999999999999E+28", "bignumeric_col": "9.999999999999999999999999999999999999999999999999999999999999999999999999999E+37", "float64_col": "+inf", "rowindex": 0, "string_col": "Hello, World", "time_col": "23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z"} +{"bool_col": false, "bytes_col": "8J+Zgw==\n", "date_col": "0001-01-01", "datetime_col": "0001-01-01 00:00:00", "geography_col": "POINT(45.0000 -90.0000)", "int64_col": "-9223372036854775808", "interval_col": "P10000Y0M3660000DT87840000H0M0S", "numeric_col": "-9.9999999999999999999999999999999999999E+28", "bignumeric_col": "-9.999999999999999999999999999999999999999999999999999999999999999999999999999E+37", "float64_col": "-inf", "rowindex": 1, "string_col": "Hello, World", "time_col": "00:00:00", "timestamp_col": "0001-01-01T00:00:00.000000Z"} +{"bool_col": true, "bytes_col": "AA==\n", "date_col": "1900-01-01", "datetime_col": "1900-01-01 00:00:00", "geography_col": "POINT(-180.0000 0.0000)", "int64_col": "-1", "interval_col": "P0Y0M0DT0H0M0.000001S", "numeric_col": "0.000000001", "bignumeric_col": "-0.00000000000000000000000000000000000001", "float64_col": "nan", "rowindex": 2, "string_col": "こんにちは", "time_col": "00:00:00.000001", "timestamp_col": "1900-01-01T00:00:00.000000Z"} +{"bool_col": false, "bytes_col": "", "date_col": "1970-01-01", "datetime_col": "1970-01-01 00:00:00", "geography_col": "POINT(0 0)", "int64_col": "0", "interval_col": "P0Y0M0DT0H0M0S", "numeric_col": "0.0", "bignumeric_col": "0.0", "float64_col": 0.0, "rowindex": 3, "string_col": "", "time_col": "12:00:00", "timestamp_col": "1970-01-01T00:00:00.000000Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "interval_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "rowindex": 4, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_schema.json 
b/tests/data/scalars_schema.json new file mode 100644 index 000000000..676d37d56 --- /dev/null +++ b/tests/data/scalars_schema.json @@ -0,0 +1,72 @@ +[ + { + "mode": "NULLABLE", + "name": "bool_col", + "type": "BOOLEAN" + }, + { + "mode": "NULLABLE", + "name": "bignumeric_col", + "type": "BIGNUMERIC" + }, + { + "mode": "NULLABLE", + "name": "bytes_col", + "type": "BYTES" + }, + { + "mode": "NULLABLE", + "name": "date_col", + "type": "DATE" + }, + { + "mode": "NULLABLE", + "name": "datetime_col", + "type": "DATETIME" + }, + { + "mode": "NULLABLE", + "name": "float64_col", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "geography_col", + "type": "GEOGRAPHY" + }, + { + "mode": "NULLABLE", + "name": "int64_col", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "interval_col", + "type": "INTERVAL" + }, + { + "mode": "NULLABLE", + "name": "numeric_col", + "type": "NUMERIC" + }, + { + "mode": "REQUIRED", + "name": "rowindex", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "string_col", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "time_col", + "type": "TIME" + }, + { + "mode": "NULLABLE", + "name": "timestamp_col", + "type": "TIMESTAMP" + } +] diff --git a/tests/system/__init__.py b/tests/system/__init__.py new file mode 100644 index 000000000..4fbd93bb2 --- /dev/null +++ b/tests/system/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/system/conftest.py b/tests/system/conftest.py new file mode 100644 index 000000000..cc2c2a4dc --- /dev/null +++ b/tests/system/conftest.py @@ -0,0 +1,100 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib + +import pytest +import test_utils.prefixer + +from google.cloud import bigquery +from google.cloud.bigquery import enums +from . import helpers + + +prefixer = test_utils.prefixer.Prefixer("python-bigquery", "tests/system") + +DATA_DIR = pathlib.Path(__file__).parent.parent / "data" + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client): + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + +@pytest.fixture(scope="session") +def bigquery_client(): + return bigquery.Client() + + +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client): + return bigquery_client.project + + +@pytest.fixture(scope="session") +def bqstorage_client(bigquery_client): + from google.cloud import bigquery_storage + + return bigquery_storage.BigQueryReadClient(credentials=bigquery_client._credentials) + + +@pytest.fixture(scope="session") +def dataset_id(bigquery_client): + dataset_id = prefixer.create_prefix() + bigquery_client.create_dataset(dataset_id) + yield dataset_id + 
bigquery_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True) + + +@pytest.fixture +def table_id(dataset_id): + return f"{dataset_id}.table_{helpers.temp_suffix()}" + + +@pytest.fixture(scope="session") +def scalars_table(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars" + with open(DATA_DIR / "scalars.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) + + +@pytest.fixture(scope="session") +def scalars_extreme_table( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars_extreme" + with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) diff --git a/tests/system/helpers.py b/tests/system/helpers.py new file mode 100644 index 000000000..76e609345 --- /dev/null +++ b/tests/system/helpers.py @@ -0,0 +1,94 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import decimal +import uuid + +import google.api_core.exceptions +import test_utils.retry + +from google.cloud._helpers import UTC + + +_naive = datetime.datetime(2016, 12, 5, 12, 41, 9) +_naive_microseconds = datetime.datetime(2016, 12, 5, 12, 41, 9, 250000) +_stamp = "%s %s" % (_naive.date().isoformat(), _naive.time().isoformat()) +_stamp_microseconds = _stamp + ".250000" +_zoned = _naive.replace(tzinfo=UTC) +_zoned_microseconds = _naive_microseconds.replace(tzinfo=UTC) +_numeric = decimal.Decimal("123456789.123456789") + + +# Examples of most data types to test with query() and DB-API. 
+STANDARD_SQL_EXAMPLES = [ + ("SELECT 1", 1), + ("SELECT 1.3", 1.3), + ("SELECT TRUE", True), + ('SELECT "ABC"', "ABC"), + ('SELECT CAST("foo" AS BYTES)', b"foo"), + ('SELECT TIMESTAMP "%s"' % (_stamp,), _zoned), + ('SELECT TIMESTAMP "%s"' % (_stamp_microseconds,), _zoned_microseconds,), + ('SELECT DATETIME(TIMESTAMP "%s")' % (_stamp,), _naive), + ('SELECT DATETIME(TIMESTAMP "%s")' % (_stamp_microseconds,), _naive_microseconds,), + ('SELECT DATE(TIMESTAMP "%s")' % (_stamp,), _naive.date()), + ('SELECT TIME(TIMESTAMP "%s")' % (_stamp,), _naive.time()), + ('SELECT NUMERIC "%s"' % (_numeric,), _numeric), + ("SELECT (1, 2)", {"_field_1": 1, "_field_2": 2}), + ( + "SELECT ((1, 2), (3, 4), 5)", + { + "_field_1": {"_field_1": 1, "_field_2": 2}, + "_field_2": {"_field_1": 3, "_field_2": 4}, + "_field_3": 5, + }, + ), + ("SELECT [1, 2, 3]", [1, 2, 3]), + ( + "SELECT ([1, 2], 3, [4, 5])", + {"_field_1": [1, 2], "_field_2": 3, "_field_3": [4, 5]}, + ), + ( + "SELECT [(1, 2, 3), (4, 5, 6)]", + [ + {"_field_1": 1, "_field_2": 2, "_field_3": 3}, + {"_field_1": 4, "_field_2": 5, "_field_3": 6}, + ], + ), + ( + "SELECT [([1, 2, 3], 4), ([5, 6], 7)]", + [{"_field_1": [1, 2, 3], "_field_2": 4}, {"_field_1": [5, 6], "_field_2": 7}], + ), + ("SELECT ARRAY(SELECT STRUCT([1, 2]))", [{"_field_1": [1, 2]}]), + ("SELECT ST_GeogPoint(1, 2)", "POINT(1 2)"), +] + + +def temp_suffix(): + now = datetime.datetime.now() + return f"{now.strftime('%Y%m%d%H%M%S')}_{uuid.uuid4().hex[:8]}" + + +def _rate_limit_exceeded(forbidden): + """Predicate: pass only exceptions with 'rateLimitExceeded' as reason.""" + return any(error["reason"] == "rateLimitExceeded" for error in forbidden._errors) + + +# We need to wait to stay within the rate limits. +# The alternative outcome is a 403 Forbidden response from upstream, which +# they return instead of the more appropriate 429. 
+# See https://cloud.google.com/bigquery/quota-policy +retry_403 = test_utils.retry.RetryErrors( + google.api_core.exceptions.Forbidden, error_predicate=_rate_limit_exceeded, +) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py new file mode 100644 index 000000000..12f7af9cb --- /dev/null +++ b/tests/system/test_arrow.py @@ -0,0 +1,112 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for Arrow connector.""" + +from typing import Optional + +import pytest + +from google.cloud import bigquery +from google.cloud.bigquery import enums + + +pyarrow = pytest.importorskip( + "pyarrow", minversion="3.0.0" +) # Needs decimal256 for BIGNUMERIC columns. + + +@pytest.mark.parametrize( + ("max_results", "scalars_table_name"), + ( + (None, "scalars_table"), # Use BQ Storage API. + (10, "scalars_table"), # Use REST API. + (None, "scalars_extreme_table"), # Use BQ Storage API. + (10, "scalars_extreme_table"), # Use REST API. + ), +) +def test_list_rows_nullable_scalars_dtypes( + bigquery_client: bigquery.Client, + scalars_table: str, + scalars_extreme_table: str, + max_results: Optional[int], + scalars_table_name: str, +): + table_id = scalars_table + if scalars_table_name == "scalars_extreme_table": + table_id = scalars_extreme_table + + # TODO(GH#836): Avoid INTERVAL columns until they are supported by the + # BigQuery Storage API and pyarrow. 
+ schema = [ + bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), + bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), + bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), + bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), + bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), + bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), + bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), + bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), + bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), + bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), + bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), + bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), + ] + + arrow_table = bigquery_client.list_rows( + table_id, max_results=max_results, selected_fields=schema, + ).to_arrow() + + schema = arrow_table.schema + bignumeric_type = schema.field("bignumeric_col").type + # 77th digit is partial. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + assert bignumeric_type.precision in {76, 77} + assert bignumeric_type.scale == 38 + + bool_type = schema.field("bool_col").type + assert bool_type.equals(pyarrow.bool_()) + + bytes_type = schema.field("bytes_col").type + assert bytes_type.equals(pyarrow.binary()) + + date_type = schema.field("date_col").type + assert date_type.equals(pyarrow.date32()) + + datetime_type = schema.field("datetime_col").type + assert datetime_type.unit == "us" + assert datetime_type.tz is None + + float64_type = schema.field("float64_col").type + assert float64_type.equals(pyarrow.float64()) + + geography_type = schema.field("geography_col").type + assert geography_type.equals(pyarrow.string()) + + int64_type = schema.field("int64_col").type + assert int64_type.equals(pyarrow.int64()) + + numeric_type = schema.field("numeric_col").type + assert numeric_type.precision == 38 + assert numeric_type.scale == 9 + + string_type = schema.field("string_col").type + assert string_type.equals(pyarrow.string()) + + time_type = schema.field("time_col").type + assert time_type.equals(pyarrow.time64("us")) + + timestamp_type = schema.field("timestamp_col").type + assert timestamp_type.unit == "us" + assert timestamp_type.tz is not None diff --git a/tests/system.py b/tests/system/test_client.py similarity index 65% rename from tests/system.py rename to tests/system/test_client.py index 02cc8e139..9da45ee6e 100644 --- a/tests/system.py +++ b/tests/system/test_client.py @@ -13,60 +13,45 @@ # limitations under the License. import base64 -import collections import concurrent.futures import csv import datetime import decimal +import io import json import operator import os +import pathlib import time import unittest import uuid -import re +from typing import Optional -import requests -import six import psutil import pytest -import pytz -import pkg_resources + +from . 
import helpers try: - from google.cloud import bigquery_storage_v1 - from google.cloud import bigquery_storage_v1beta1 + from google.cloud import bigquery_storage except ImportError: # pragma: NO COVER - bigquery_storage_v1 = None - bigquery_storage_v1beta1 = None + bigquery_storage = None try: import fastavro # to parse BQ storage client results except ImportError: # pragma: NO COVER fastavro = None -try: - import pandas -except ImportError: # pragma: NO COVER - pandas = None try: import pyarrow import pyarrow.types except ImportError: # pragma: NO COVER pyarrow = None -try: - import IPython - from IPython.utils import io - from IPython.testing import tools - from IPython.terminal import interactiveshell -except ImportError: # pragma: NO COVER - IPython = None from google.api_core.exceptions import PreconditionFailed from google.api_core.exceptions import BadRequest from google.api_core.exceptions import ClientError from google.api_core.exceptions import Conflict -from google.api_core.exceptions import Forbidden from google.api_core.exceptions import GoogleAPICallError from google.api_core.exceptions import NotFound from google.api_core.exceptions import InternalServerError @@ -79,8 +64,10 @@ from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.table import Table from google.cloud._helpers import UTC -from google.cloud.bigquery import dbapi +from google.cloud.bigquery import dbapi, enums from google.cloud import storage +from google.cloud.datacatalog_v1 import types as datacatalog_types +from google.cloud.datacatalog_v1 import PolicyTagManagerClient from test_utils.retry import RetryErrors from test_utils.retry import RetryInstanceState @@ -89,7 +76,7 @@ JOB_TIMEOUT = 120 # 2 minutes -WHERE = os.path.abspath(os.path.dirname(__file__)) +DATA_PATH = pathlib.Path(__file__).parent.parent / "data" # Common table data used for many tests. 
ROWS = [ @@ -103,6 +90,12 @@ bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), ] +CLUSTERING_SCHEMA = [ + bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("body_height_cm", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("date_of_birth", "DATE", mode="REQUIRED"), +] TIME_PARTITIONING_CLUSTERING_FIELDS_SCHEMA = [ bigquery.SchemaField("transaction_time", "TIMESTAMP", mode="REQUIRED"), bigquery.SchemaField("transaction_id", "INTEGER", mode="REQUIRED"), @@ -130,18 +123,7 @@ (TooManyRequests, InternalServerError, ServiceUnavailable) ) -PANDAS_MINIMUM_VERSION = pkg_resources.parse_version("1.0.0") -PYARROW_MINIMUM_VERSION = pkg_resources.parse_version("0.17.0") - -if pandas: - PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version -else: - PANDAS_INSTALLED_VERSION = None - -if pyarrow: - PYARROW_INSTALLED_VERSION = pkg_resources.get_distribution("pyarrow").parsed_version -else: - PYARROW_INSTALLED_VERSION = None +MTLS_TESTING = os.getenv("GOOGLE_API_USE_CLIENT_CERTIFICATE") == "true" def _has_rows(result): @@ -149,30 +131,18 @@ def _has_rows(result): def _make_dataset_id(prefix): - return "%s%s" % (prefix, unique_resource_id()) + return f"python_bigquery_tests_system_{prefix}{unique_resource_id()}" -def _load_json_schema(filename="data/schema.json"): +def _load_json_schema(filename="schema.json"): from google.cloud.bigquery.table import _parse_schema_resource - json_filename = os.path.join(WHERE, filename) + json_filename = DATA_PATH / filename with open(json_filename, "r") as schema_file: return _parse_schema_resource(json.load(schema_file)) -def _rate_limit_exceeded(forbidden): - """Predicate: pass only exceptions with 'rateLimitExceeded' as reason.""" - return any(error["reason"] == "rateLimitExceeded" for error in forbidden._errors) - - -# We need to wait to stay within the 
rate limits. -# The alternative outcome is a 403 Forbidden response from upstream, which -# they return instead of the more appropriate 429. -# See https://cloud.google.com/bigquery/quota-policy -retry_403 = RetryErrors(Forbidden, error_predicate=_rate_limit_exceeded) - - class Config(object): """Run-time configuration to be modified at set-up. @@ -180,7 +150,7 @@ class Config(object): global state. """ - CLIENT = None + CLIENT: Optional[bigquery.Client] = None CURSOR = None @@ -194,6 +164,8 @@ def setUp(self): self.to_delete = [] def tearDown(self): + policy_tag_client = PolicyTagManagerClient() + def _still_in_use(bad_request): return any( error["reason"] == "resourceInUse" for error in bad_request._errors @@ -210,6 +182,8 @@ def _still_in_use(bad_request): retry_in_use(Config.CLIENT.delete_dataset)(doomed, delete_contents=True) elif isinstance(doomed, (Table, bigquery.TableReference)): retry_in_use(Config.CLIENT.delete_table)(doomed) + elif isinstance(doomed, datacatalog_types.Taxonomy): + policy_tag_client.delete_taxonomy(name=doomed.name) else: doomed.delete() @@ -218,13 +192,15 @@ def test_get_service_account_email(self): got = client.get_service_account_email() - self.assertIsInstance(got, six.text_type) + self.assertIsInstance(got, str) self.assertIn("@", got) def _create_bucket(self, bucket_name, location=None): storage_client = storage.Client() bucket = storage_client.bucket(bucket_name) - retry_storage_errors(bucket.create)(location=location) + retry_storage_errors(storage_client.create_bucket)( + bucket_name, location=location + ) self.to_delete.append(bucket) return bucket @@ -248,7 +224,7 @@ def test_close_releases_open_sockets(self): client.close() conn_count_end = len(current_process.connections()) - self.assertEqual(conn_count_end, conn_count_start) + self.assertLessEqual(conn_count_end, conn_count_start) def test_create_dataset(self): DATASET_ID = _make_dataset_id("create_dataset") @@ -266,7 +242,7 @@ def test_get_dataset(self): dataset_arg = 
Dataset(dataset_ref) dataset_arg.friendly_name = "Friendly" dataset_arg.description = "Description" - dataset = retry_403(client.create_dataset)(dataset_arg) + dataset = helpers.retry_403(client.create_dataset)(dataset_arg) self.to_delete.append(dataset) dataset_ref = bigquery.DatasetReference(project, dataset_id) @@ -349,7 +325,7 @@ def test_create_table(self): table_arg = Table(dataset.table(table_id), schema=SCHEMA) self.assertFalse(_table_exists(table_arg)) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) @@ -384,7 +360,7 @@ def test_create_table_with_policy(self): table_arg = Table(dataset.table(table_id), schema=schema) self.assertFalse(_table_exists(table_arg)) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) @@ -406,6 +382,68 @@ def test_create_table_with_policy(self): table2 = Config.CLIENT.update_table(table, ["schema"]) self.assertEqual(policy_2, table2.schema[1].policy_tags) + def test_create_table_with_real_custom_policy(self): + from google.cloud.bigquery.schema import PolicyTagList + + policy_tag_client = PolicyTagManagerClient() + taxonomy_parent = f"projects/{Config.CLIENT.project}/locations/us" + + new_taxonomy = datacatalog_types.Taxonomy( + display_name="Custom test taxonomy" + unique_resource_id(), + description="This taxonomy is ony used for a test.", + activated_policy_types=[ + datacatalog_types.Taxonomy.PolicyType.FINE_GRAINED_ACCESS_CONTROL + ], + ) + + taxonomy = policy_tag_client.create_taxonomy( + parent=taxonomy_parent, taxonomy=new_taxonomy + ) + self.to_delete.insert(0, taxonomy) + + parent_policy_tag = policy_tag_client.create_policy_tag( + parent=taxonomy.name, + policy_tag=datacatalog_types.PolicyTag( + display_name="Parent policy tag", 
parent_policy_tag=None + ), + ) + child_policy_tag = policy_tag_client.create_policy_tag( + parent=taxonomy.name, + policy_tag=datacatalog_types.PolicyTag( + display_name="Child policy tag", + parent_policy_tag=parent_policy_tag.name, + ), + ) + + dataset = self.temp_dataset( + _make_dataset_id("create_table_with_real_custom_policy") + ) + table_id = "test_table" + policy_1 = PolicyTagList(names=[parent_policy_tag.name]) + policy_2 = PolicyTagList(names=[child_policy_tag.name]) + + schema = [ + bigquery.SchemaField( + "first_name", "STRING", mode="REQUIRED", policy_tags=policy_1 + ), + bigquery.SchemaField( + "age", "INTEGER", mode="REQUIRED", policy_tags=policy_2 + ), + ] + table_arg = Table(dataset.table(table_id), schema=schema) + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + + self.assertTrue(_table_exists(table)) + self.assertCountEqual( + list(table.schema[0].policy_tags.names), [parent_policy_tag.name] + ) + self.assertCountEqual( + list(table.schema[1].policy_tags.names), [child_policy_tag.name] + ) + def test_create_table_w_time_partitioning_w_clustering_fields(self): from google.cloud.bigquery.table import TimePartitioning from google.cloud.bigquery.table import TimePartitioningType @@ -420,7 +458,7 @@ def test_create_table_w_time_partitioning_w_clustering_fields(self): table_arg.time_partitioning = TimePartitioning(field="transaction_time") table_arg.clustering_fields = ["user_email", "store_code"] - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) @@ -434,7 +472,7 @@ def test_delete_dataset_with_string(self): dataset_id = _make_dataset_id("delete_table_true_with_string") project = Config.CLIENT.project dataset_ref = bigquery.DatasetReference(project, dataset_id) - 
retry_403(Config.CLIENT.create_dataset)(Dataset(dataset_ref)) + helpers.retry_403(Config.CLIENT.create_dataset)(Dataset(dataset_ref)) self.assertTrue(_dataset_exists(dataset_ref)) Config.CLIENT.delete_dataset(dataset_id) self.assertFalse(_dataset_exists(dataset_ref)) @@ -443,11 +481,11 @@ def test_delete_dataset_delete_contents_true(self): dataset_id = _make_dataset_id("delete_table_true_with_content") project = Config.CLIENT.project dataset_ref = bigquery.DatasetReference(project, dataset_id) - dataset = retry_403(Config.CLIENT.create_dataset)(Dataset(dataset_ref)) + dataset = helpers.retry_403(Config.CLIENT.create_dataset)(Dataset(dataset_ref)) table_id = "test_table" table_arg = Table(dataset.table(table_id), schema=SCHEMA) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) Config.CLIENT.delete_dataset(dataset, delete_contents=True) self.assertFalse(_table_exists(table)) @@ -459,7 +497,7 @@ def test_delete_dataset_delete_contents_false(self): table_id = "test_table" table_arg = Table(dataset.table(table_id), schema=SCHEMA) - retry_403(Config.CLIENT.create_table)(table_arg) + helpers.retry_403(Config.CLIENT.create_table)(table_arg) with self.assertRaises(exceptions.BadRequest): Config.CLIENT.delete_dataset(dataset) @@ -508,7 +546,7 @@ def test_list_tables(self): ] for table_name in tables_to_create: table = Table(dataset.table(table_name), schema=SCHEMA) - created_table = retry_403(Config.CLIENT.create_table)(table) + created_table = helpers.retry_403(Config.CLIENT.create_table)(table) self.to_delete.insert(0, created_table) # Retrieve the tables. 
@@ -538,7 +576,7 @@ def test_update_table(self): TABLE_NAME = "test_table" table_arg = Table(dataset.table(TABLE_NAME), schema=SCHEMA) self.assertFalse(_table_exists(table_arg)) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) self.assertIsNone(table.friendly_name) @@ -578,7 +616,7 @@ def test_update_table_schema(self): TABLE_NAME = "test_table" table_arg = Table(dataset.table(TABLE_NAME), schema=SCHEMA) self.assertFalse(_table_exists(table_arg)) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) voter = bigquery.SchemaField("voter", "BOOLEAN", mode="NULLABLE") @@ -594,10 +632,79 @@ def test_update_table_schema(self): self.assertEqual(found.field_type, expected.field_type) self.assertEqual(found.mode, expected.mode) + def test_unset_table_schema_attributes(self): + from google.cloud.bigquery.schema import PolicyTagList + + dataset = self.temp_dataset(_make_dataset_id("unset_policy_tags")) + table_id = "test_table" + policy_tags = PolicyTagList( + names=[ + "projects/{}/locations/us/taxonomies/1/policyTags/2".format( + Config.CLIENT.project + ), + ] + ) + + schema = [ + bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField( + "secret_int", + "INTEGER", + mode="REQUIRED", + description="This field is numeric", + policy_tags=policy_tags, + ), + ] + table_arg = Table(dataset.table(table_id), schema=schema) + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + + self.assertTrue(_table_exists(table)) + self.assertEqual(policy_tags, table.schema[1].policy_tags) + + # Amend the schema to replace the policy tags + new_schema = table.schema[:] + old_field = 
table.schema[1] + new_schema[1] = bigquery.SchemaField( + name=old_field.name, + field_type=old_field.field_type, + mode=old_field.mode, + description=None, + fields=old_field.fields, + policy_tags=None, + ) + + table.schema = new_schema + updated_table = Config.CLIENT.update_table(table, ["schema"]) + + self.assertFalse(updated_table.schema[1].description) # Empty string or None. + self.assertEqual(updated_table.schema[1].policy_tags.names, ()) + + def test_update_table_clustering_configuration(self): + dataset = self.temp_dataset(_make_dataset_id("update_table")) + + TABLE_NAME = "test_table" + table_arg = Table(dataset.table(TABLE_NAME), schema=CLUSTERING_SCHEMA) + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + self.assertTrue(_table_exists(table)) + + table.clustering_fields = ["full_name", "date_of_birth"] + table2 = Config.CLIENT.update_table(table, ["clustering_fields"]) + self.assertEqual(table2.clustering_fields, ["full_name", "date_of_birth"]) + + table2.clustering_fields = None + table3 = Config.CLIENT.update_table(table2, ["clustering_fields"]) + self.assertIsNone(table3.clustering_fields, None) + @staticmethod def _fetch_single_page(table, selected_fields=None): iterator = Config.CLIENT.list_rows(table, selected_fields=selected_fields) - page = six.next(iterator.pages) + page = next(iterator.pages) return list(page) def _create_table_many_columns(self, rowcount): @@ -678,7 +785,7 @@ def test_insert_rows_then_dump_table(self): ] table_arg = Table(dataset.table(TABLE_ID), schema=schema) self.assertFalse(_table_exists(table_arg)) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) @@ -716,7 +823,7 @@ def test_load_table_from_local_avro_file_then_dump_table(self): table = Table(table_ref) self.to_delete.insert(0, 
table) - with open(os.path.join(WHERE, "data", "colors.avro"), "rb") as avrof: + with open(DATA_PATH / "colors.avro", "rb") as avrof: config = bigquery.LoadJobConfig() config.source_format = SourceFormat.AVRO config.write_disposition = WriteDisposition.WRITE_TRUNCATE @@ -736,397 +843,59 @@ def test_load_table_from_local_avro_file_then_dump_table(self): sorted(row_tuples, key=by_wavelength), sorted(ROWS, key=by_wavelength) ) - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_automatic_schema(self): - """Test that a DataFrame with dtypes that map well to BigQuery types - can be uploaded without specifying a schema. - - https://github.com/googleapis/google-cloud-python/issues/9044 - """ - df_data = collections.OrderedDict( - [ - ("bool_col", pandas.Series([True, False, True], dtype="bool")), - ( - "ts_col", - pandas.Series( - [ - datetime.datetime(2010, 1, 2, 3, 44, 50), - datetime.datetime(2011, 2, 3, 14, 50, 59), - datetime.datetime(2012, 3, 14, 15, 16), - ], - dtype="datetime64[ns]", - ).dt.tz_localize(pytz.utc), - ), - ( - "dt_col", - pandas.Series( - [ - datetime.datetime(2010, 1, 2, 3, 44, 50), - datetime.datetime(2011, 2, 3, 14, 50, 59), - datetime.datetime(2012, 3, 14, 15, 16), - ], - dtype="datetime64[ns]", - ), - ), - ("float32_col", pandas.Series([1.0, 2.0, 3.0], dtype="float32")), - ("float64_col", pandas.Series([4.0, 5.0, 6.0], dtype="float64")), - ("int8_col", pandas.Series([-12, -11, -10], dtype="int8")), - ("int16_col", pandas.Series([-9, -8, -7], dtype="int16")), - ("int32_col", pandas.Series([-6, -5, -4], dtype="int32")), - ("int64_col", pandas.Series([-3, -2, -1], dtype="int64")), - ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), - ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), - ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), - ] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - - dataset_id = 
_make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_automatic_schema".format( - Config.CLIENT.project, dataset_id - ) - - load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id) - load_job.result() - - table = Config.CLIENT.get_table(table_id) - self.assertEqual( - tuple(table.schema), - ( - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - # BigQuery does not support uploading DATETIME values from - # Parquet files. See: - # https://github.com/googleapis/google-cloud-python/issues/9996 - bigquery.SchemaField("dt_col", "TIMESTAMP"), - bigquery.SchemaField("float32_col", "FLOAT"), - bigquery.SchemaField("float64_col", "FLOAT"), - bigquery.SchemaField("int8_col", "INTEGER"), - bigquery.SchemaField("int16_col", "INTEGER"), - bigquery.SchemaField("int32_col", "INTEGER"), - bigquery.SchemaField("int64_col", "INTEGER"), - bigquery.SchemaField("uint8_col", "INTEGER"), - bigquery.SchemaField("uint16_col", "INTEGER"), - bigquery.SchemaField("uint32_col", "INTEGER"), - ), - ) - self.assertEqual(table.num_rows, 3) - - @unittest.skipIf( - pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION, - "Only `pandas version >=1.0.0` is supported", - ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_nullable_int64_datatype(self): - """Test that a DataFrame containing column with None-type values and int64 datatype - can be uploaded if a BigQuery schema is specified. 
- - https://github.com/googleapis/python-bigquery/issues/22 - """ - - dataset_id = _make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( - Config.CLIENT.project, dataset_id - ) - table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),) - table = retry_403(Config.CLIENT.create_table)( - Table(table_id, schema=table_schema) - ) - self.to_delete.insert(0, table) - - df_data = collections.OrderedDict( - [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id) - load_job.result() - table = Config.CLIENT.get_table(table_id) - self.assertEqual(tuple(table.schema), (bigquery.SchemaField("x", "INTEGER"),)) - self.assertEqual(table.num_rows, 4) - - @unittest.skipIf( - pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION, - "Only `pandas version >=1.0.0` is supported", - ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(self): - """Test that a DataFrame containing column with None-type values and int64 datatype - can be uploaded without specifying a schema. 
- - https://github.com/googleapis/python-bigquery/issues/22 - """ - - dataset_id = _make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( - Config.CLIENT.project, dataset_id - ) - df_data = collections.OrderedDict( - [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id) - load_job.result() - table = Config.CLIENT.get_table(table_id) - self.assertEqual(tuple(table.schema), (bigquery.SchemaField("x", "INTEGER"),)) - self.assertEqual(table.num_rows, 4) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_nulls(self): - """Test that a DataFrame with null columns can be uploaded if a - BigQuery schema is specified. + def test_load_table_from_local_parquet_file_decimal_types(self): + from google.cloud.bigquery.enums import DecimalTargetType + from google.cloud.bigquery.job import SourceFormat + from google.cloud.bigquery.job import WriteDisposition - See: https://github.com/googleapis/google-cloud-python/issues/7370 - """ - # Schema with all scalar types. - scalars_schema = ( - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("dt_col", "DATETIME"), - bigquery.SchemaField("float_col", "FLOAT"), - bigquery.SchemaField("geo_col", "GEOGRAPHY"), - bigquery.SchemaField("int_col", "INTEGER"), - bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("str_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - ) - table_schema = scalars_schema + ( - # TODO: Array columns can't be read due to NULLABLE versus REPEATED - # mode mismatch. 
See: - # https://issuetracker.google.com/133415569#comment3 - # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"), - # TODO: Support writing StructArrays to Parquet. See: - # https://jira.apache.org/jira/browse/ARROW-2587 - # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), - ) - num_rows = 100 - nulls = [None] * num_rows - df_data = collections.OrderedDict( - [ - ("bool_col", nulls), - ("bytes_col", nulls), - ("date_col", nulls), - ("dt_col", nulls), - ("float_col", nulls), - ("geo_col", nulls), - ("int_col", nulls), - ("num_col", nulls), - ("str_col", nulls), - ("time_col", nulls), - ("ts_col", nulls), - ] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + TABLE_NAME = "test_table_parquet" - dataset_id = _make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( - Config.CLIENT.project, dataset_id - ) + expected_rows = [ + (decimal.Decimal("123.999999999999"),), + (decimal.Decimal("99999999999999999999999999.999999999999"),), + ] - # Create the table before loading so that schema mismatch errors are - # identified. 
- table = retry_403(Config.CLIENT.create_table)( - Table(table_id, schema=table_schema) - ) + dataset = self.temp_dataset(_make_dataset_id("load_local_parquet_then_dump")) + table_ref = dataset.table(TABLE_NAME) + table = Table(table_ref) self.to_delete.insert(0, table) - job_config = bigquery.LoadJobConfig(schema=table_schema) - load_job = Config.CLIENT.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = Config.CLIENT.get_table(table) - self.assertEqual(tuple(table.schema), table_schema) - self.assertEqual(table.num_rows, num_rows) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_required(self): - """Test that a DataFrame with required columns can be uploaded if a - BigQuery schema is specified. - - See: https://github.com/googleapis/google-cloud-python/issues/8093 - """ - table_schema = ( - bigquery.SchemaField("name", "STRING", mode="REQUIRED"), - bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), - ) + job_config = bigquery.LoadJobConfig() + job_config.source_format = SourceFormat.PARQUET + job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE + job_config.decimal_target_types = [ + DecimalTargetType.NUMERIC, + DecimalTargetType.BIGNUMERIC, + DecimalTargetType.STRING, + ] - records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}] - dataframe = pandas.DataFrame(records, columns=["name", "age"]) - job_config = bigquery.LoadJobConfig(schema=table_schema) - dataset_id = _make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_required".format( - Config.CLIENT.project, dataset_id - ) + with open(DATA_PATH / "numeric_38_12.parquet", "rb") as parquet_file: + job = Config.CLIENT.load_table_from_file( + parquet_file, table_ref, job_config=job_config + ) - # Create the table before loading so that schema mismatch errors are - # 
identified. - table = retry_403(Config.CLIENT.create_table)( - Table(table_id, schema=table_schema) - ) - self.to_delete.insert(0, table) + job.result(timeout=JOB_TIMEOUT) # Retry until done. - job_config = bigquery.LoadJobConfig(schema=table_schema) - load_job = Config.CLIENT.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() + self.assertEqual(job.output_rows, len(expected_rows)) table = Config.CLIENT.get_table(table) - self.assertEqual(tuple(table.schema), table_schema) - self.assertEqual(table.num_rows, 2) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_explicit_schema(self): - # Schema with all scalar types. - # TODO: Uploading DATETIME columns currently fails, thus that field type - # is temporarily removed from the test. - # See: - # https://github.com/googleapis/python-bigquery/issues/61 - # https://issuetracker.google.com/issues/151765076 - scalars_schema = ( - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - # bigquery.SchemaField("dt_col", "DATETIME"), - bigquery.SchemaField("float_col", "FLOAT"), - bigquery.SchemaField("geo_col", "GEOGRAPHY"), - bigquery.SchemaField("int_col", "INTEGER"), - bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("str_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - ) - table_schema = scalars_schema + ( - # TODO: Array columns can't be read due to NULLABLE versus REPEATED - # mode mismatch. See: - # https://issuetracker.google.com/133415569#comment3 - # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"), - # TODO: Support writing StructArrays to Parquet. 
See: - # https://jira.apache.org/jira/browse/ARROW-2587 - # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), - ) - df_data = collections.OrderedDict( - [ - ("bool_col", [True, None, False]), - ("bytes_col", [b"abc", None, b"def"]), - ( - "date_col", - [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], - ), - # ( - # "dt_col", - # [ - # datetime.datetime(1, 1, 1, 0, 0, 0), - # None, - # datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - # ], - # ), - ("float_col", [float("-inf"), float("nan"), float("inf")]), - ( - "geo_col", - [ - "POINT(30 10)", - None, - "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", - ], - ), - ("int_col", [-9223372036854775808, None, 9223372036854775807]), - ( - "num_col", - [ - decimal.Decimal("-99999999999999999999999999999.999999999"), - None, - decimal.Decimal("99999999999999999999999999999.999999999"), - ], - ), - ("str_col", [u"abc", None, u"def"]), - ( - "time_col", - [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], - ), - ( - "ts_col", - [ - datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc), - None, - datetime.datetime( - 9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc - ), - ], - ), - ] - ) - dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - - dataset_id = _make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( - Config.CLIENT.project, dataset_id - ) - - job_config = bigquery.LoadJobConfig(schema=table_schema) - load_job = Config.CLIENT.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = Config.CLIENT.get_table(table_id) - self.assertEqual(tuple(table.schema), table_schema) - self.assertEqual(table.num_rows, 3) - - @unittest.skipIf( - pyarrow is None or PYARROW_INSTALLED_VERSION < PYARROW_MINIMUM_VERSION, - "Only `pyarrow version >=0.17.0` is supported", - ) - @unittest.skipIf(pandas is None, "Requires `pandas`") 
- def test_load_table_from_dataframe_w_struct_datatype(self): - """Test that a DataFrame with struct datatype can be uploaded if a - BigQuery schema is specified. + rows = self._fetch_single_page(table) + row_tuples = [r.values() for r in rows] + self.assertEqual(sorted(row_tuples), sorted(expected_rows)) - https://github.com/googleapis/python-bigquery/issues/21 - """ - dataset_id = _make_dataset_id("bq_load_test") - self.temp_dataset(dataset_id) - table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format( - Config.CLIENT.project, dataset_id - ) - table_schema = [ - bigquery.SchemaField( - "bar", - "RECORD", - fields=[ - bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), - bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), - ], - mode="REQUIRED", - ), - ] - table = retry_403(Config.CLIENT.create_table)( - Table(table_id, schema=table_schema) - ) - self.to_delete.insert(0, table) + # Forcing the NUMERIC type, however, should result in an error. + job_config.decimal_target_types = [DecimalTargetType.NUMERIC] - df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}] - dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"]) + with open(DATA_PATH / "numeric_38_12.parquet", "rb") as parquet_file: + job = Config.CLIENT.load_table_from_file( + parquet_file, table_ref, job_config=job_config + ) - load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id) - load_job.result() + with self.assertRaises(BadRequest) as exc_info: + job.result(timeout=JOB_TIMEOUT) - table = Config.CLIENT.get_table(table_id) - self.assertEqual(table.schema, table_schema) - self.assertEqual(table.num_rows, 3) + exc_msg = str(exc_info.exception) + self.assertIn("out of valid NUMERIC range", exc_msg) def test_load_table_from_json_basic_use(self): table_schema = ( @@ -1149,7 +918,7 @@ def test_load_table_from_json_basic_use(self): # Create the table before loading so that schema mismatch errors are # identified. 
- table = retry_403(Config.CLIENT.create_table)( + table = helpers.retry_403(Config.CLIENT.create_table)( Table(table_id, schema=table_schema) ) self.to_delete.insert(0, table) @@ -1185,7 +954,7 @@ def test_load_table_from_json_schema_autodetect(self): bigquery.SchemaField("is_awesome", "BOOLEAN", mode="NULLABLE"), ) # create the table before loading so that the column order is predictable - table = retry_403(Config.CLIENT.create_table)( + table = helpers.retry_403(Config.CLIENT.create_table)( Table(table_id, schema=table_schema) ) self.to_delete.insert(0, table) @@ -1213,14 +982,14 @@ def test_load_avro_from_uri_then_dump_table(self): ("orange", 590), ("red", 650), ] - with open(os.path.join(WHERE, "data", "colors.avro"), "rb") as f: + with open(DATA_PATH / "colors.avro", "rb") as f: GS_URL = self._write_avro_to_storage( "bq_load_test" + unique_resource_id(), "colors.avro", f ) dataset = self.temp_dataset(_make_dataset_id("bq_load_test")) table_arg = dataset.table(table_name) - table = retry_403(Config.CLIENT.create_table)(Table(table_arg)) + table = helpers.retry_403(Config.CLIENT.create_table)(Table(table_arg)) self.to_delete.insert(0, table) config = bigquery.LoadJobConfig() @@ -1251,7 +1020,7 @@ def test_load_table_from_uri_then_dump_table(self): dataset = self.temp_dataset(_make_dataset_id("load_gcs_then_dump")) table_arg = Table(dataset.table(TABLE_ID), schema=SCHEMA) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) config = bigquery.LoadJobConfig() @@ -1280,7 +1049,7 @@ def test_load_table_from_file_w_explicit_location(self): self._create_bucket(bucket_name, location="eu") # Create a temporary dataset & table in the EU. 
- table_bytes = six.BytesIO(b"a,3\nb,2\nc,1\n") + table_bytes = io.BytesIO(b"a,3\nb,2\nc,1\n") client = Config.CLIENT dataset = self.temp_dataset(_make_dataset_id("eu_load_file"), location="EU") table_ref = dataset.table("letters") @@ -1299,7 +1068,7 @@ def test_load_table_from_file_w_explicit_location(self): job_id = load_job.job_id # Can get the job from the EU. - load_job = client.get_job(job_id, location="EU") + load_job = client.get_job(load_job) self.assertEqual(job_id, load_job.job_id) self.assertEqual("EU", load_job.location) self.assertTrue(load_job.exists()) @@ -1316,7 +1085,7 @@ def test_load_table_from_file_w_explicit_location(self): # Can cancel the job from the EU. self.assertTrue(load_job.cancel()) - load_job = client.cancel_job(job_id, location="EU") + load_job = client.cancel_job(load_job) self.assertEqual(job_id, load_job.job_id) self.assertEqual("EU", load_job.location) @@ -1437,7 +1206,7 @@ def test_extract_table(self): job.result(timeout=100) self.to_delete.insert(0, destination) - got_bytes = retry_storage_errors(destination.download_as_string)() + got_bytes = retry_storage_errors(destination.download_as_bytes)() got = got_bytes.decode("utf-8") self.assertIn("Bharney Rhubble", got) @@ -1469,7 +1238,7 @@ def test_get_set_iam_policy(self): table_ref = Table(dataset.table(table_id)) self.assertFalse(_table_exists(table_ref)) - table = retry_403(Config.CLIENT.create_table)(table_ref) + table = helpers.retry_403(Config.CLIENT.create_table)(table_ref) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) @@ -1494,7 +1263,7 @@ def test_test_iam_permissions(self): table_ref = Table(dataset.table(table_id)) self.assertFalse(_table_exists(table_ref)) - table = retry_403(Config.CLIENT.create_table)(table_ref) + table = helpers.retry_403(Config.CLIENT.create_table)(table_ref) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) @@ -1518,7 +1287,7 @@ def test_job_cancel(self): dataset = self.temp_dataset(DATASET_ID) 
table_arg = Table(dataset.table(TABLE_NAME), schema=SCHEMA) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) job = Config.CLIENT.query(QUERY, job_id_prefix=JOB_ID_PREFIX) @@ -1533,6 +1302,23 @@ def test_job_cancel(self): # raise an error, and that the job completed (in the `retry()` # above). + def test_job_labels(self): + DATASET_ID = _make_dataset_id("job_cancel") + JOB_ID_PREFIX = "fetch_" + DATASET_ID + QUERY = "SELECT 1 as one" + + self.temp_dataset(DATASET_ID) + + job_config = bigquery.QueryJobConfig( + labels={"custom_label": "label_value", "another_label": "foo123"} + ) + job = Config.CLIENT.query( + QUERY, job_id_prefix=JOB_ID_PREFIX, job_config=job_config + ) + + expected_labels = {"custom_label": "label_value", "another_label": "foo123"} + self.assertEqual(job.labels, expected_labels) + def test_get_failed_job(self): # issue 4246 from google.api_core.exceptions import BadRequest @@ -1572,75 +1358,12 @@ def test_query_w_legacy_sql_types(self): self.assertEqual(len(rows[0]), 1) self.assertEqual(rows[0][0], example["expected"]) - def _generate_standard_sql_types_examples(self): - naive = datetime.datetime(2016, 12, 5, 12, 41, 9) - naive_microseconds = datetime.datetime(2016, 12, 5, 12, 41, 9, 250000) - stamp = "%s %s" % (naive.date().isoformat(), naive.time().isoformat()) - stamp_microseconds = stamp + ".250000" - zoned = naive.replace(tzinfo=UTC) - zoned_microseconds = naive_microseconds.replace(tzinfo=UTC) - numeric = decimal.Decimal("123456789.123456789") - return [ - {"sql": "SELECT 1", "expected": 1}, - {"sql": "SELECT 1.3", "expected": 1.3}, - {"sql": "SELECT TRUE", "expected": True}, - {"sql": 'SELECT "ABC"', "expected": "ABC"}, - {"sql": 'SELECT CAST("foo" AS BYTES)', "expected": b"foo"}, - {"sql": 'SELECT TIMESTAMP "%s"' % (stamp,), "expected": zoned}, - { - "sql": 'SELECT TIMESTAMP "%s"' % (stamp_microseconds,), - "expected": 
zoned_microseconds, - }, - {"sql": 'SELECT DATETIME(TIMESTAMP "%s")' % (stamp,), "expected": naive}, - { - "sql": 'SELECT DATETIME(TIMESTAMP "%s")' % (stamp_microseconds,), - "expected": naive_microseconds, - }, - {"sql": 'SELECT DATE(TIMESTAMP "%s")' % (stamp,), "expected": naive.date()}, - {"sql": 'SELECT TIME(TIMESTAMP "%s")' % (stamp,), "expected": naive.time()}, - {"sql": 'SELECT NUMERIC "%s"' % (numeric,), "expected": numeric}, - {"sql": "SELECT (1, 2)", "expected": {"_field_1": 1, "_field_2": 2}}, - { - "sql": "SELECT ((1, 2), (3, 4), 5)", - "expected": { - "_field_1": {"_field_1": 1, "_field_2": 2}, - "_field_2": {"_field_1": 3, "_field_2": 4}, - "_field_3": 5, - }, - }, - {"sql": "SELECT [1, 2, 3]", "expected": [1, 2, 3]}, - { - "sql": "SELECT ([1, 2], 3, [4, 5])", - "expected": {"_field_1": [1, 2], "_field_2": 3, "_field_3": [4, 5]}, - }, - { - "sql": "SELECT [(1, 2, 3), (4, 5, 6)]", - "expected": [ - {"_field_1": 1, "_field_2": 2, "_field_3": 3}, - {"_field_1": 4, "_field_2": 5, "_field_3": 6}, - ], - }, - { - "sql": "SELECT [([1, 2, 3], 4), ([5, 6], 7)]", - "expected": [ - {u"_field_1": [1, 2, 3], u"_field_2": 4}, - {u"_field_1": [5, 6], u"_field_2": 7}, - ], - }, - { - "sql": "SELECT ARRAY(SELECT STRUCT([1, 2]))", - "expected": [{u"_field_1": [1, 2]}], - }, - {"sql": "SELECT ST_GeogPoint(1, 2)", "expected": "POINT(1 2)"}, - ] - def test_query_w_standard_sql_types(self): - examples = self._generate_standard_sql_types_examples() - for example in examples: - rows = list(Config.CLIENT.query(example["sql"])) + for sql, expected in helpers.STANDARD_SQL_EXAMPLES: + rows = list(Config.CLIENT.query(sql)) self.assertEqual(len(rows), 1) self.assertEqual(len(rows[0]), 1) - self.assertEqual(rows[0][0], example["expected"]) + self.assertEqual(rows[0][0], expected) def test_query_w_failed_query(self): from google.api_core.exceptions import BadRequest @@ -1655,23 +1378,30 @@ def test_query_w_wrong_config(self): rows = list(Config.CLIENT.query("SELECT 1;").result()) 
assert rows[0][0] == 1 - project = Config.CLIENT.project - dataset_ref = bigquery.DatasetReference(project, "dset") bad_config = LoadJobConfig() - bad_config.destination = dataset_ref.table("tbl") + bad_config.source_format = enums.SourceFormat.CSV with self.assertRaises(Exception): Config.CLIENT.query(good_query, job_config=bad_config).result() def test_query_w_timeout(self): + job_config = bigquery.QueryJobConfig() + job_config.use_query_cache = False + query_job = Config.CLIENT.query( "SELECT * FROM `bigquery-public-data.github_repos.commits`;", job_id_prefix="test_query_w_timeout_", + location="US", + job_config=job_config, ) with self.assertRaises(concurrent.futures.TimeoutError): - # 1 second is much too short for this query. query_job.result(timeout=1) + # Even though the query takes >1 second, the call to getQueryResults + # should succeed. + self.assertFalse(query_job.done(timeout=1)) + self.assertIsNotNone(Config.CLIENT.cancel_job(query_job)) + def test_query_w_page_size(self): page_size = 45 query_job = Config.CLIENT.query( @@ -1770,14 +1500,103 @@ def test_query_statistics(self): self.assertGreater(stages_with_inputs, 0) self.assertGreater(len(plan), stages_with_inputs) + def test_dml_statistics(self): + table_schema = ( + bigquery.SchemaField("foo", "STRING"), + bigquery.SchemaField("bar", "INTEGER"), + ) + + dataset_id = _make_dataset_id("bq_system_test") + self.temp_dataset(dataset_id) + table_id = "{}.{}.test_dml_statistics".format(Config.CLIENT.project, dataset_id) + + # Create the table before loading so that the column order is deterministic. + table = helpers.retry_403(Config.CLIENT.create_table)( + Table(table_id, schema=table_schema) + ) + self.to_delete.insert(0, table) + + # Insert a few rows and check the stats. 
+ sql = f""" + INSERT INTO `{table_id}` + VALUES ("one", 1), ("two", 2), ("three", 3), ("four", 4); + """ + query_job = Config.CLIENT.query(sql) + query_job.result() + + assert query_job.dml_stats is not None + assert query_job.dml_stats.inserted_row_count == 4 + assert query_job.dml_stats.updated_row_count == 0 + assert query_job.dml_stats.deleted_row_count == 0 + + # Update some of the rows. + sql = f""" + UPDATE `{table_id}` + SET bar = bar + 1 + WHERE bar > 2; + """ + query_job = Config.CLIENT.query(sql) + query_job.result() + + assert query_job.dml_stats is not None + assert query_job.dml_stats.inserted_row_count == 0 + assert query_job.dml_stats.updated_row_count == 2 + assert query_job.dml_stats.deleted_row_count == 0 + + # Now delete a few rows and check the stats. + sql = f""" + DELETE FROM `{table_id}` + WHERE foo != "two"; + """ + query_job = Config.CLIENT.query(sql) + query_job.result() + + assert query_job.dml_stats is not None + assert query_job.dml_stats.inserted_row_count == 0 + assert query_job.dml_stats.updated_row_count == 0 + assert query_job.dml_stats.deleted_row_count == 3 + + def test_transaction_info(self): + table_schema = ( + bigquery.SchemaField("foo", "STRING"), + bigquery.SchemaField("bar", "INTEGER"), + ) + + dataset_id = _make_dataset_id("bq_system_test") + self.temp_dataset(dataset_id) + table_id = f"{Config.CLIENT.project}.{dataset_id}.test_dml_statistics" + + # Create the table before loading so that the column order is deterministic. + table = helpers.retry_403(Config.CLIENT.create_table)( + Table(table_id, schema=table_schema) + ) + self.to_delete.insert(0, table) + + # Insert a few rows and check the stats. 
+ sql = f""" + BEGIN TRANSACTION; + INSERT INTO `{table_id}` + VALUES ("one", 1), ("two", 2), ("three", 3), ("four", 4); + + UPDATE `{table_id}` + SET bar = bar + 1 + WHERE bar > 2; + COMMIT TRANSACTION; + """ + query_job = Config.CLIENT.query(sql) + query_job.result() + + # Transaction ID set by the server should be accessible + assert query_job.transaction_info is not None + assert query_job.transaction_info.transaction_id != "" + def test_dbapi_w_standard_sql_types(self): - examples = self._generate_standard_sql_types_examples() - for example in examples: - Config.CURSOR.execute(example["sql"]) + for sql, expected in helpers.STANDARD_SQL_EXAMPLES: + Config.CURSOR.execute(sql) self.assertEqual(Config.CURSOR.rowcount, 1) row = Config.CURSOR.fetchone() self.assertEqual(len(row), 1) - self.assertEqual(row[0], example["expected"]) + self.assertEqual(row[0], expected) row = Config.CURSOR.fetchone() self.assertIsNone(row) @@ -1792,58 +1611,36 @@ def test_dbapi_fetchall(self): row_tuples = [r.values() for r in rows] self.assertEqual(row_tuples, [(1, 2), (3, 4), (5, 6)]) - @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" - ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_dbapi_fetch_w_bqstorage_client_large_result_set(self): - bqstorage_client = bigquery_storage_v1.BigQueryReadClient( - credentials=Config.CLIENT._credentials - ) - cursor = dbapi.connect(Config.CLIENT, bqstorage_client).cursor() - - cursor.execute( - """ - SELECT id, `by`, time_ts - FROM `bigquery-public-data.hacker_news.comments` - ORDER BY `id` ASC - LIMIT 100000 + def test_dbapi_fetchall_from_script(self): + query = """ + CREATE TEMP TABLE Example + ( + x INT64, + y STRING + ); + + INSERT INTO Example + VALUES (5, 'foo'), + (6, 'bar'), + (7, 'baz'); + + SELECT * + FROM Example + ORDER BY x ASC; """ - ) - result_rows = [cursor.fetchone(), cursor.fetchone(), cursor.fetchone()] - - field_name = operator.itemgetter(0) - fetched_data = 
[sorted(row.items(), key=field_name) for row in result_rows] - - # Since DB API is not thread safe, only a single result stream should be - # requested by the BQ storage client, meaning that results should arrive - # in the sorted order. - expected_data = [ - [ - ("by", "sama"), - ("id", 15), - ("time_ts", datetime.datetime(2006, 10, 9, 19, 51, 1, tzinfo=UTC)), - ], - [ - ("by", "pg"), - ("id", 17), - ("time_ts", datetime.datetime(2006, 10, 9, 19, 52, 45, tzinfo=UTC)), - ], - [ - ("by", "pg"), - ("id", 22), - ("time_ts", datetime.datetime(2006, 10, 10, 2, 18, 22, tzinfo=UTC)), - ], - ] - self.assertEqual(fetched_data, expected_data) + Config.CURSOR.execute(query) + self.assertEqual(Config.CURSOR.rowcount, 3, "expected 3 rows") + rows = Config.CURSOR.fetchall() + row_tuples = [r.values() for r in rows] + self.assertEqual(row_tuples, [(5, "foo"), (6, "bar"), (7, "baz")]) @unittest.skipIf( - bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_dbapi_fetch_w_bqstorage_client_v1beta1_large_result_set(self): - bqstorage_client = bigquery_storage_v1beta1.BigQueryStorageClient( + def test_dbapi_fetch_w_bqstorage_client_large_result_set(self): + bqstorage_client = bigquery_storage.BigQueryReadClient( credentials=Config.CLIENT._credentials ) cursor = dbapi.connect(Config.CLIENT, bqstorage_client).cursor() @@ -1901,7 +1698,7 @@ def test_dbapi_dry_run_query(self): self.assertEqual(list(rows), []) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_dbapi_connection_does_not_leak_sockets(self): current_process = psutil.Process() @@ -1936,7 +1733,7 @@ def _load_table_for_dml(self, rows, dataset_id, table_id): greeting = bigquery.SchemaField("greeting", "STRING", mode="NULLABLE") table_ref = 
dataset.table(table_id) table_arg = Table(table_ref, schema=[greeting]) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) with _NamedTemporaryFile() as temp: @@ -1979,7 +1776,9 @@ def test_query_w_dml(self): def test_dbapi_w_dml(self): dataset_name = _make_dataset_id("dml_dbapi") table_name = "test_table" - self._load_table_for_dml([("Hello World",)], dataset_name, table_name) + self._load_table_for_dml( + [("こんにちは",), ("Hello World",), ("Howdy!",)], dataset_name, table_name + ) query_template = """UPDATE {}.{} SET greeting = 'Guten Tag' WHERE greeting = 'Hello World' @@ -1990,13 +1789,14 @@ def test_dbapi_w_dml(self): job_id="test_dbapi_w_dml_{}".format(str(uuid.uuid4())), ) self.assertEqual(Config.CURSOR.rowcount, 1) - self.assertIsNone(Config.CURSOR.fetchone()) def test_query_w_query_params(self): from google.cloud.bigquery.job import QueryJobConfig from google.cloud.bigquery.query import ArrayQueryParameter from google.cloud.bigquery.query import ScalarQueryParameter + from google.cloud.bigquery.query import ScalarQueryParameterType from google.cloud.bigquery.query import StructQueryParameter + from google.cloud.bigquery.query import StructQueryParameterType question = "What is the answer to life, the universe, and everything?" 
question_param = ScalarQueryParameter( @@ -2010,6 +1810,10 @@ def test_query_w_query_params(self): pi_numeric_param = ScalarQueryParameter( name="pi_numeric_param", type_="NUMERIC", value=pi_numeric ) + bignum = decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)) + bignum_param = ScalarQueryParameter( + name="bignum_param", type_="BIGNUMERIC", value=bignum + ) truthy = True truthy_param = ScalarQueryParameter(name="truthy", type_="BOOL", value=truthy) beef = b"DEADBEEF" @@ -2051,6 +1855,14 @@ def test_query_w_query_params(self): characters_param = ArrayQueryParameter( name=None, array_type="RECORD", values=[phred_param, bharney_param] ) + empty_struct_array_param = ArrayQueryParameter( + name="empty_array_param", + values=[], + array_type=StructQueryParameterType( + ScalarQueryParameterType(name="foo", type_="INT64"), + ScalarQueryParameterType(name="bar", type_="STRING"), + ), + ) hero_param = StructQueryParameter("hero", phred_name_param, phred_age_param) sidekick_param = StructQueryParameter( "sidekick", bharney_name_param, bharney_age_param @@ -2141,6 +1953,11 @@ def test_query_w_query_params(self): ], "query_parameters": [characters_param], }, + { + "sql": "SELECT @empty_array_param", + "expected": [], + "query_parameters": [empty_struct_array_param], + }, { "sql": "SELECT @roles", "expected": { @@ -2154,7 +1971,13 @@ def test_query_w_query_params(self): "expected": {"friends": [phred_name, bharney_name]}, "query_parameters": [with_friends_param], }, + { + "sql": "SELECT @bignum_param", + "expected": bignum, + "query_parameters": [bignum_param], + }, ] + for example in examples: jconfig = QueryJobConfig() jconfig.query_parameters = example["query_parameters"] @@ -2280,210 +2103,6 @@ def test_query_iter(self): row_tuples = [r.values() for r in query_job] self.assertEqual(row_tuples, [(1,)]) - def test_querying_data_w_timeout(self): - job_config = bigquery.QueryJobConfig() - job_config.use_query_cache = False - - query_job = Config.CLIENT.query( - """ - SELECT 
name, SUM(number) AS total_people - FROM `bigquery-public-data.usa_names.usa_1910_current` - GROUP BY name - """, - location="US", - job_config=job_config, - ) - - # Specify a very tight deadline to demonstrate that the timeout - # actually has effect. - with self.assertRaises(requests.exceptions.Timeout): - query_job.done(timeout=0.1) - - # Now wait for the result using a more realistic deadline. - query_job.result(timeout=30) - self.assertTrue(query_job.done(timeout=30)) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_query_results_to_dataframe(self): - QUERY = """ - SELECT id, author, time_ts, dead - FROM `bigquery-public-data.hacker_news.comments` - LIMIT 10 - """ - - df = Config.CLIENT.query(QUERY).result().to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 10) # verify the number of rows - column_names = ["id", "author", "time_ts", "dead"] - self.assertEqual(list(df), column_names) # verify the column names - exp_datatypes = { - "id": int, - "author": six.text_type, - "time_ts": pandas.Timestamp, - "dead": bool, - } - for index, row in df.iterrows(): - for col in column_names: - # all the schema fields are nullable, so None is acceptable - if not row[col] is None: - self.assertIsInstance(row[col], exp_datatypes[col]) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" - ) - def test_query_results_to_dataframe_w_bqstorage(self): - query = """ - SELECT id, author, time_ts, dead - FROM `bigquery-public-data.hacker_news.comments` - LIMIT 10 - """ - - bqstorage_client = bigquery_storage_v1.BigQueryReadClient( - credentials=Config.CLIENT._credentials - ) - - df = Config.CLIENT.query(query).result().to_dataframe(bqstorage_client) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 10) # verify the number of rows - column_names = ["id", "author", "time_ts", "dead"] - 
self.assertEqual(list(df), column_names) - exp_datatypes = { - "id": int, - "author": six.text_type, - "time_ts": pandas.Timestamp, - "dead": bool, - } - for index, row in df.iterrows(): - for col in column_names: - # all the schema fields are nullable, so None is acceptable - if not row[col] is None: - self.assertIsInstance(row[col], exp_datatypes[col]) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf( - bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`" - ) - def test_query_results_to_dataframe_w_bqstorage_v1beta1(self): - query = """ - SELECT id, author, time_ts, dead - FROM `bigquery-public-data.hacker_news.comments` - LIMIT 10 - """ - - bqstorage_client = bigquery_storage_v1beta1.BigQueryStorageClient( - credentials=Config.CLIENT._credentials - ) - - df = Config.CLIENT.query(query).result().to_dataframe(bqstorage_client) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 10) # verify the number of rows - column_names = ["id", "author", "time_ts", "dead"] - self.assertEqual(list(df), column_names) - exp_datatypes = { - "id": int, - "author": six.text_type, - "time_ts": pandas.Timestamp, - "dead": bool, - } - for index, row in df.iterrows(): - for col in column_names: - # all the schema fields are nullable, so None is acceptable - if not row[col] is None: - self.assertIsInstance(row[col], exp_datatypes[col]) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_insert_rows_from_dataframe(self): - SF = bigquery.SchemaField - schema = [ - SF("float_col", "FLOAT", mode="REQUIRED"), - SF("int_col", "INTEGER", mode="REQUIRED"), - SF("bool_col", "BOOLEAN", mode="REQUIRED"), - SF("string_col", "STRING", mode="NULLABLE"), - ] - - dataframe = pandas.DataFrame( - [ - { - "float_col": 1.11, - "bool_col": True, - "string_col": "my string", - "int_col": 10, - }, - { - "float_col": 2.22, - "bool_col": False, - "string_col": "another string", - "int_col": 20, - }, - { - "float_col": 
3.33, - "bool_col": False, - "string_col": "another string", - "int_col": 30, - }, - { - "float_col": 4.44, - "bool_col": True, - "string_col": "another string", - "int_col": 40, - }, - { - "float_col": 5.55, - "bool_col": False, - "string_col": "another string", - "int_col": 50, - }, - { - "float_col": 6.66, - "bool_col": True, - # Include a NaN value, because pandas often uses NaN as a - # NULL value indicator. - "string_col": float("NaN"), - "int_col": 60, - }, - ] - ) - - table_id = "test_table" - dataset = self.temp_dataset(_make_dataset_id("issue_7553")) - table_arg = Table(dataset.table(table_id), schema=schema) - table = retry_403(Config.CLIENT.create_table)(table_arg) - self.to_delete.insert(0, table) - - chunk_errors = Config.CLIENT.insert_rows_from_dataframe( - table, dataframe, chunk_size=3 - ) - for errors in chunk_errors: - assert not errors - - # Use query to fetch rows instead of listing directly from the table so - # that we get values from the streaming buffer. - rows = list( - Config.CLIENT.query( - "SELECT * FROM `{}.{}.{}`".format( - table.project, table.dataset_id, table.table_id - ) - ) - ) - - sorted_rows = sorted(rows, key=operator.attrgetter("int_col")) - row_tuples = [r.values() for r in sorted_rows] - expected = [ - tuple(None if col != col else col for col in data_row) - for data_row in dataframe.itertuples(index=False) - ] - - assert len(row_tuples) == len(expected) - - for row, expected_row in zip(row_tuples, expected): - six.assertCountEqual( - self, row, expected_row - ) # column order does not matter - def test_insert_rows_nested_nested(self): # See #2951 SF = bigquery.SchemaField @@ -2514,7 +2133,7 @@ def test_insert_rows_nested_nested(self): table_id = "test_table" dataset = self.temp_dataset(_make_dataset_id("issue_2951")) table_arg = Table(dataset.table(table_id), schema=schema) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, 
table) Config.CLIENT.insert_rows(table, to_insert) @@ -2554,7 +2173,7 @@ def test_insert_rows_nested_nested_dictionary(self): table_id = "test_table" dataset = self.temp_dataset(_make_dataset_id("issue_2951")) table_arg = Table(dataset.table(table_id), schema=schema) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) Config.CLIENT.insert_rows(table, to_insert) @@ -2565,11 +2184,14 @@ def test_insert_rows_nested_nested_dictionary(self): expected_rows = [("Some value", record)] self.assertEqual(row_tuples, expected_rows) + @pytest.mark.skipif( + MTLS_TESTING, reason="mTLS testing has no permission to the max-value.js file" + ) def test_create_routine(self): routine_name = "test_routine" dataset = self.temp_dataset(_make_dataset_id("create_routine")) float64_type = bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.FLOAT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.FLOAT64 ) routine = bigquery.Routine( dataset.routine(routine_name), @@ -2584,36 +2206,116 @@ def test_create_routine(self): bigquery.RoutineArgument( name="arr", data_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.ARRAY, + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.ARRAY, array_element_type=float64_type, ), ) ] routine.body = "return maxValue(arr)" + routine.determinism_level = bigquery.DeterminismLevel.DETERMINISTIC query_string = "SELECT `{}`([-100.0, 3.14, 100.0, 42.0]) as max_value;".format( str(routine.reference) ) - routine = retry_403(Config.CLIENT.create_routine)(routine) - query_job = retry_403(Config.CLIENT.query)(query_string) + routine = helpers.retry_403(Config.CLIENT.create_routine)(routine) + query_job = helpers.retry_403(Config.CLIENT.query)(query_string) rows = list(query_job.result()) assert len(rows) == 1 assert rows[0].max_value == 100.0 + def 
test_create_tvf_routine(self): + from google.cloud.bigquery import Routine, RoutineArgument, RoutineType + + StandardSqlDataType = bigquery_v2.types.StandardSqlDataType + StandardSqlField = bigquery_v2.types.StandardSqlField + StandardSqlTableType = bigquery_v2.types.StandardSqlTableType + + INT64 = StandardSqlDataType.TypeKind.INT64 + STRING = StandardSqlDataType.TypeKind.STRING + + client = Config.CLIENT + + dataset = self.temp_dataset(_make_dataset_id("create_tvf_routine")) + routine_ref = dataset.routine("test_tvf_routine") + + routine_body = """ + SELECT int_col, str_col + FROM ( + UNNEST([1, 2, 3]) int_col + JOIN + (SELECT str_col FROM UNNEST(["one", "two", "three"]) str_col) + ON TRUE + ) + WHERE int_col > threshold + """ + + return_table_type = StandardSqlTableType( + columns=[ + StandardSqlField( + name="int_col", type=StandardSqlDataType(type_kind=INT64), + ), + StandardSqlField( + name="str_col", type=StandardSqlDataType(type_kind=STRING), + ), + ] + ) + + routine_args = [ + RoutineArgument( + name="threshold", data_type=StandardSqlDataType(type_kind=INT64), + ) + ] + + routine_def = Routine( + routine_ref, + type_=RoutineType.TABLE_VALUED_FUNCTION, + arguments=routine_args, + return_table_type=return_table_type, + body=routine_body, + ) + + # Create TVF routine. + client.delete_routine(routine_ref, not_found_ok=True) + routine = client.create_routine(routine_def) + + assert routine.body == routine_body + assert routine.return_table_type == return_table_type + assert routine.arguments == routine_args + + # Execute the routine to see if it's working as expected. 
+ query_job = client.query( + f""" + SELECT int_col, str_col + FROM `{routine.reference}`(1) + ORDER BY int_col, str_col ASC + """ + ) + + result_rows = [tuple(row) for row in query_job.result()] + expected = [ + (2, "one"), + (2, "three"), + (2, "two"), + (3, "one"), + (3, "three"), + (3, "two"), + ] + assert result_rows == expected + def test_create_table_rows_fetch_nested_schema(self): table_name = "test_table" dataset = self.temp_dataset(_make_dataset_id("create_table_nested_schema")) schema = _load_json_schema() table_arg = Table(dataset.table(table_name), schema=schema) - table = retry_403(Config.CLIENT.create_table)(table_arg) + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) self.to_delete.insert(0, table) self.assertTrue(_table_exists(table)) self.assertEqual(table.table_id, table_name) to_insert = [] # Data is in "JSON Lines" format, see http://jsonlines.org/ - json_filename = os.path.join(WHERE, "data", "characters.jsonl") + json_filename = DATA_PATH / "characters.jsonl" with open(json_filename) as rows_file: for line in rows_file: to_insert.append(json.loads(line)) @@ -2658,12 +2360,9 @@ def test_create_table_rows_fetch_nested_schema(self): self.assertEqual(found[7], e_favtime) self.assertEqual(found[8], decimal.Decimal(expected["FavoriteNumber"])) - def _fetch_dataframe(self, query): - return Config.CLIENT.query(query).result().to_dataframe() - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_nested_table_to_arrow(self): from google.cloud.bigquery.job import SourceFormat @@ -2688,7 +2387,7 @@ def test_nested_table_to_arrow(self): {"string_col": "Some value", "record_col": record, "float_col": 3.14} ] rows = [json.dumps(row) for row in to_insert] - body = six.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) + body = 
io.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) table_id = "test_table" dataset = self.temp_dataset(_make_dataset_id("nested_df")) table = dataset.table(table_id) @@ -2699,7 +2398,7 @@ def test_nested_table_to_arrow(self): job_config.schema = schema # Load a table using a local JSON file from memory. Config.CLIENT.load_table_from_file(body, table, job_config=job_config).result() - bqstorage_client = bigquery_storage_v1.BigQueryReadClient( + bqstorage_client = bigquery_storage.BigQueryReadClient( credentials=Config.CLIENT._credentials ) @@ -2711,221 +2410,28 @@ def test_nested_table_to_arrow(self): self.assertEqual(tbl.num_rows, 1) self.assertEqual(tbl.num_columns, 3) # Columns may not appear in the requested order. - self.assertTrue( - pyarrow.types.is_float64(tbl.schema.field_by_name("float_col").type) - ) - self.assertTrue( - pyarrow.types.is_string(tbl.schema.field_by_name("string_col").type) - ) - record_col = tbl.schema.field_by_name("record_col").type + self.assertTrue(pyarrow.types.is_float64(tbl.schema.field("float_col").type)) + self.assertTrue(pyarrow.types.is_string(tbl.schema.field("string_col").type)) + record_col = tbl.schema.field("record_col").type self.assertTrue(pyarrow.types.is_struct(record_col)) - self.assertEqual(record_col.num_children, 2) + self.assertEqual(record_col.num_fields, 2) self.assertEqual(record_col[0].name, "nested_string") self.assertTrue(pyarrow.types.is_string(record_col[0].type)) self.assertEqual(record_col[1].name, "nested_repeated") self.assertTrue(pyarrow.types.is_list(record_col[1].type)) self.assertTrue(pyarrow.types.is_int64(record_col[1].type.value_type)) - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_nested_table_to_dataframe(self): - from google.cloud.bigquery.job import SourceFormat - from google.cloud.bigquery.job import WriteDisposition - - SF = bigquery.SchemaField - schema = [ - SF("string_col", "STRING", mode="NULLABLE"), - SF( - "record_col", - "RECORD", - mode="NULLABLE", - 
fields=[ - SF("nested_string", "STRING", mode="NULLABLE"), - SF("nested_repeated", "INTEGER", mode="REPEATED"), - SF( - "nested_record", - "RECORD", - mode="NULLABLE", - fields=[SF("nested_nested_string", "STRING", mode="NULLABLE")], - ), - ], - ), - SF("bigfloat_col", "FLOAT", mode="NULLABLE"), - SF("smallfloat_col", "FLOAT", mode="NULLABLE"), - ] - record = { - "nested_string": "another string value", - "nested_repeated": [0, 1, 2], - "nested_record": {"nested_nested_string": "some deep insight"}, - } - to_insert = [ - { - "string_col": "Some value", - "record_col": record, - "bigfloat_col": 3.14, - "smallfloat_col": 2.72, - } - ] - rows = [json.dumps(row) for row in to_insert] - body = six.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) - table_id = "test_table" - dataset = self.temp_dataset(_make_dataset_id("nested_df")) - table = dataset.table(table_id) - self.to_delete.insert(0, table) - job_config = bigquery.LoadJobConfig() - job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE - job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON - job_config.schema = schema - # Load a table using a local JSON file from memory. - Config.CLIENT.load_table_from_file(body, table, job_config=job_config).result() - - df = Config.CLIENT.list_rows(table, selected_fields=schema).to_dataframe( - dtypes={"smallfloat_col": "float16"} - ) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 1) # verify the number of rows - exp_columns = ["string_col", "record_col", "bigfloat_col", "smallfloat_col"] - self.assertEqual(list(df), exp_columns) # verify the column names - row = df.iloc[0] - # verify the row content - self.assertEqual(row["string_col"], "Some value") - expected_keys = tuple(sorted(record.keys())) - row_keys = tuple(sorted(row["record_col"].keys())) - self.assertEqual(row_keys, expected_keys) - # Can't compare numpy arrays, which pyarrow encodes the embedded - # repeated column to, so convert to list. 
- self.assertEqual(list(row["record_col"]["nested_repeated"]), [0, 1, 2]) - # verify that nested data can be accessed with indices/keys - self.assertEqual(row["record_col"]["nested_repeated"][0], 0) - self.assertEqual( - row["record_col"]["nested_record"]["nested_nested_string"], - "some deep insight", - ) - # verify dtypes - self.assertEqual(df.dtypes["bigfloat_col"].name, "float64") - self.assertEqual(df.dtypes["smallfloat_col"].name, "float16") - - def test_list_rows_empty_table(self): - from google.cloud.bigquery.table import RowIterator - - dataset_id = _make_dataset_id("empty_table") - dataset = self.temp_dataset(dataset_id) - table_ref = dataset.table("empty_table") - table = Config.CLIENT.create_table(bigquery.Table(table_ref)) - - # It's a bit silly to list rows for an empty table, but this does - # happen as the result of a DDL query from an IPython magic command. - rows = Config.CLIENT.list_rows(table) - self.assertIsInstance(rows, RowIterator) - self.assertEqual(tuple(rows), ()) - - def test_list_rows_page_size(self): - from google.cloud.bigquery.job import SourceFormat - from google.cloud.bigquery.job import WriteDisposition - - num_items = 7 - page_size = 3 - num_pages, num_last_page = divmod(num_items, page_size) - - SF = bigquery.SchemaField - schema = [SF("string_col", "STRING", mode="NULLABLE")] - to_insert = [{"string_col": "item%d" % i} for i in range(num_items)] - rows = [json.dumps(row) for row in to_insert] - body = six.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) - - table_id = "test_table" - dataset = self.temp_dataset(_make_dataset_id("nested_df")) - table = dataset.table(table_id) - self.to_delete.insert(0, table) - job_config = bigquery.LoadJobConfig() - job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE - job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON - job_config.schema = schema - # Load a table using a local JSON file from memory. 
- Config.CLIENT.load_table_from_file(body, table, job_config=job_config).result() - - df = Config.CLIENT.list_rows(table, selected_fields=schema, page_size=page_size) - pages = df.pages - - for i in range(num_pages): - page = next(pages) - self.assertEqual(page.num_items, page_size) - page = next(pages) - self.assertEqual(page.num_items, num_last_page) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" - ) - def test_list_rows_max_results_w_bqstorage(self): - table_ref = DatasetReference("bigquery-public-data", "utility_us").table( - "country_code_iso" - ) - bqstorage_client = bigquery_storage_v1.BigQueryReadClient( - credentials=Config.CLIENT._credentials - ) - - row_iterator = Config.CLIENT.list_rows( - table_ref, - selected_fields=[bigquery.SchemaField("country_name", "STRING")], - max_results=100, - ) - dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) - - self.assertEqual(len(dataframe.index), 100) - def temp_dataset(self, dataset_id, location=None): project = Config.CLIENT.project dataset_ref = bigquery.DatasetReference(project, dataset_id) dataset = Dataset(dataset_ref) if location: dataset.location = location - dataset = retry_403(Config.CLIENT.create_dataset)(dataset) + dataset = helpers.retry_403(Config.CLIENT.create_dataset)(dataset) self.to_delete.append(dataset) return dataset -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -@pytest.mark.skipif(IPython is None, reason="Requires `ipython`") -@pytest.mark.usefixtures("ipython_interactive") -def test_bigquery_magic(): - ip = IPython.get_ipython() - current_process = psutil.Process() - conn_count_start = len(current_process.connections()) - - ip.extension_manager.load_extension("google.cloud.bigquery") - sql = """ - SELECT - CONCAT( - 'https://stackoverflow.com/questions/', - CAST(id as STRING)) as url, - view_count - FROM 
`bigquery-public-data.stackoverflow.posts_questions` - WHERE tags like '%google-bigquery%' - ORDER BY view_count DESC - LIMIT 10 - """ - with io.capture_output() as captured: - result = ip.run_cell_magic("bigquery", "--use_rest_api", sql) - - conn_count_end = len(current_process.connections()) - - lines = re.split("\n|\r", captured.stdout) - # Removes blanks & terminal code (result of display clearing) - updates = list(filter(lambda x: bool(x) and x != "\x1b[2K", lines)) - assert re.match("Executing query with job ID: .*", updates[0]) - assert all(re.match("Query executing: .*s", line) for line in updates[1:-1]) - assert re.match("Query complete after .*s", updates[-1]) - assert isinstance(result, pandas.DataFrame) - assert len(result) == 10 # verify row count - assert list(result) == ["url", "view_count"] # verify column names - - # NOTE: For some reason, the number of open sockets is sometimes one *less* - # than expected when running system tests on Kokoro, thus using the <= assertion. - # That's still fine, however, since the sockets are apparently not leaked. 
- assert conn_count_end <= conn_count_start # system resources are released - - def _job_done(instance): return instance.state.lower() == "done" @@ -2947,19 +2453,106 @@ def _table_exists(t): return False -@pytest.fixture(scope="session") -def ipython(): - config = tools.default_config() - config.TerminalInteractiveShell.simple_prompt = True - shell = interactiveshell.TerminalInteractiveShell.instance(config=config) - return shell +def test_dbapi_create_view(dataset_id): + query = f""" + CREATE VIEW {dataset_id}.dbapi_create_view + AS SELECT name, SUM(number) AS total + FROM `bigquery-public-data.usa_names.usa_1910_2013` + GROUP BY name; + """ -@pytest.fixture() -def ipython_interactive(request, ipython): - """Activate IPython's builtin hooks + Config.CURSOR.execute(query) + assert Config.CURSOR.rowcount == 0, "expected 0 rows" + + +def test_parameterized_types_round_trip(dataset_id): + client = Config.CLIENT + table_id = f"{dataset_id}.test_parameterized_types_round_trip" + fields = ( + ("n", "NUMERIC"), + ("n9", "NUMERIC(9)"), + ("n92", "NUMERIC(9, 2)"), + ("bn", "BIGNUMERIC"), + ("bn9", "BIGNUMERIC(38)"), + ("bn92", "BIGNUMERIC(38, 22)"), + ("s", "STRING"), + ("s9", "STRING(9)"), + ("b", "BYTES"), + ("b9", "BYTES(9)"), + ) + client.query( + "create table {} ({})".format(table_id, ", ".join(" ".join(f) for f in fields)) + ).result() + table = client.get_table(table_id) + table_id2 = table_id + "2" + client.create_table(Table(f"{client.project}.{table_id2}", table.schema)) + table2 = client.get_table(table_id2) - for the duration of the test scope. 
- """ - with ipython.builtin_trap: - yield ipython + assert tuple(s._key()[:2] for s in table2.schema) == fields + + +def test_table_snapshots(dataset_id): + from google.cloud.bigquery import CopyJobConfig + from google.cloud.bigquery import OperationType + + client = Config.CLIENT + + source_table_path = f"{client.project}.{dataset_id}.test_table" + snapshot_table_path = f"{source_table_path}_snapshot" + + # Create the table before loading so that the column order is predictable. + schema = [ + bigquery.SchemaField("foo", "INTEGER"), + bigquery.SchemaField("bar", "STRING"), + ] + source_table = helpers.retry_403(Config.CLIENT.create_table)( + Table(source_table_path, schema=schema) + ) + + # Populate the table with initial data. + rows = [{"foo": 1, "bar": "one"}, {"foo": 2, "bar": "two"}] + load_job = Config.CLIENT.load_table_from_json(rows, source_table) + load_job.result() + + # Now create a snapshot before modifying the original table data. + copy_config = CopyJobConfig() + copy_config.operation_type = OperationType.SNAPSHOT + + copy_job = client.copy_table( + sources=source_table_path, + destination=snapshot_table_path, + job_config=copy_config, + ) + copy_job.result() + + # Modify data in original table. + sql = f'INSERT INTO `{source_table_path}`(foo, bar) VALUES (3, "three")' + query_job = client.query(sql) + query_job.result() + + # List rows from the source table and compare them to rows from the snapshot. + rows_iter = client.list_rows(source_table_path) + rows = sorted(row.values() for row in rows_iter) + assert rows == [(1, "one"), (2, "two"), (3, "three")] + + rows_iter = client.list_rows(snapshot_table_path) + rows = sorted(row.values() for row in rows_iter) + assert rows == [(1, "one"), (2, "two")] + + # Now restore the table from the snapshot and it should again contain the old + # set of rows. 
+ copy_config = CopyJobConfig() + copy_config.operation_type = OperationType.RESTORE + copy_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE + + copy_job = client.copy_table( + sources=snapshot_table_path, + destination=source_table_path, + job_config=copy_config, + ) + copy_job.result() + + rows_iter = client.list_rows(source_table_path) + rows = sorted(row.values() for row in rows_iter) + assert rows == [(1, "one"), (2, "two")] diff --git a/tests/system/test_job_retry.py b/tests/system/test_job_retry.py new file mode 100644 index 000000000..520545493 --- /dev/null +++ b/tests/system/test_job_retry.py @@ -0,0 +1,72 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import contextlib +import threading +import time + +import google.api_core.exceptions +import google.cloud.bigquery +import pytest + + +def thread(func): + thread = threading.Thread(target=func, daemon=True) + thread.start() + return thread + + +@pytest.mark.parametrize("job_retry_on_query", [True, False]) +def test_query_retry_539(bigquery_client, dataset_id, job_retry_on_query): + """ + Test job_retry + + See: https://github.com/googleapis/python-bigquery/issues/539 + """ + from google.api_core import exceptions + from google.api_core.retry import if_exception_type, Retry + + table_name = f"{dataset_id}.t539" + + # Without a custom retry, we fail: + with pytest.raises(google.api_core.exceptions.NotFound): + bigquery_client.query(f"select count(*) from {table_name}").result() + + retry_notfound = Retry(predicate=if_exception_type(exceptions.NotFound)) + + job_retry = dict(job_retry=retry_notfound) if job_retry_on_query else {} + job = bigquery_client.query(f"select count(*) from {table_name}", **job_retry) + job_id = job.job_id + + # We can already know that the job failed, but we're not supposed + # to find out until we call result, which is where retry happend + assert job.done() + assert job.exception() is not None + + @thread + def create_table(): + time.sleep(1) # Give the first retry attempt time to fail. 
+ with contextlib.closing(google.cloud.bigquery.Client()) as client: + client.query(f"create table {table_name} (id int64)").result() + + job_retry = {} if job_retry_on_query else dict(job_retry=retry_notfound) + [[count]] = list(job.result(**job_retry)) + assert count == 0 + + # The job was retried, and thus got a new job id + assert job.job_id != job_id + + # Make sure we don't leave a thread behind: + create_table.join() + bigquery_client.query(f"drop table {table_name}").result() diff --git a/tests/system/test_list_rows.py b/tests/system/test_list_rows.py new file mode 100644 index 000000000..70388059e --- /dev/null +++ b/tests/system/test_list_rows.py @@ -0,0 +1,112 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import decimal + +from google.cloud import bigquery +from google.cloud.bigquery import enums + + +def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str): + from google.cloud.bigquery.table import RowIterator + + table = bigquery_client.create_table(table_id) + + # It's a bit silly to list rows for an empty table, but this does + # happen as the result of a DDL query from an IPython magic command. 
+ rows = bigquery_client.list_rows(table) + assert isinstance(rows, RowIterator) + assert tuple(rows) == () + + +def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str): + num_items = 7 + page_size = 3 + num_pages, num_last_page = divmod(num_items, page_size) + + to_insert = [{"string_col": "item%d" % i, "rowindex": i} for i in range(num_items)] + bigquery_client.load_table_from_json(to_insert, table_id).result() + + df = bigquery_client.list_rows( + table_id, + selected_fields=[bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)], + page_size=page_size, + ) + pages = df.pages + + for i in range(num_pages): + page = next(pages) + assert page.num_items == page_size + page = next(pages) + assert page.num_items == num_last_page + + +def test_list_rows_scalars(bigquery_client: bigquery.Client, scalars_table: str): + rows = sorted( + bigquery_client.list_rows(scalars_table), key=lambda row: row["rowindex"] + ) + row = rows[0] + assert row["bool_col"] # True + assert row["bytes_col"] == b"Hello, World!" + assert row["date_col"] == datetime.date(2021, 7, 21) + assert row["datetime_col"] == datetime.datetime(2021, 7, 21, 11, 39, 45) + assert row["geography_col"] == "POINT(-122.0838511 37.3860517)" + assert row["int64_col"] == 123456789 + assert row["numeric_col"] == decimal.Decimal("1.23456789") + assert row["bignumeric_col"] == decimal.Decimal("10.111213141516171819") + assert row["float64_col"] == 1.25 + assert row["string_col"] == "Hello, World!" 
+ assert row["time_col"] == datetime.time(11, 41, 43, 76160) + assert row["timestamp_col"] == datetime.datetime( + 2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc + ) + + nullrow = rows[1] + for column, value in nullrow.items(): + if column == "rowindex": + assert value == 1 + else: + assert value is None + + +def test_list_rows_scalars_extreme( + bigquery_client: bigquery.Client, scalars_extreme_table: str +): + rows = sorted( + bigquery_client.list_rows(scalars_extreme_table), + key=lambda row: row["rowindex"], + ) + row = rows[0] + assert row["bool_col"] # True + assert row["bytes_col"] == b"\r\n" + assert row["date_col"] == datetime.date(9999, 12, 31) + assert row["datetime_col"] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) + assert row["geography_col"] == "POINT(-135 90)" + assert row["int64_col"] == 9223372036854775807 + assert row["numeric_col"] == decimal.Decimal(f"9.{'9' * 37}E+28") + assert row["bignumeric_col"] == decimal.Decimal(f"9.{'9' * 75}E+37") + assert row["float64_col"] == float("Inf") + assert row["string_col"] == "Hello, World" + assert row["time_col"] == datetime.time(23, 59, 59, 999999) + assert row["timestamp_col"] == datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ) + + nullrow = rows[4] + for column, value in nullrow.items(): + if column == "rowindex": + assert value == 4 + else: + assert value is None diff --git a/tests/system/test_magics.py b/tests/system/test_magics.py new file mode 100644 index 000000000..78c15cb50 --- /dev/null +++ b/tests/system/test_magics.py @@ -0,0 +1,83 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for Jupyter/IPython connector.""" + +import re + +import pytest +import psutil + + +IPython = pytest.importorskip("IPython") +io = pytest.importorskip("IPython.utils.io") +pandas = pytest.importorskip("pandas") +tools = pytest.importorskip("IPython.testing.tools") +interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") + + +@pytest.fixture(scope="session") +def ipython(): + config = tools.default_config() + config.TerminalInteractiveShell.simple_prompt = True + shell = interactiveshell.TerminalInteractiveShell.instance(config=config) + return shell + + +@pytest.fixture() +def ipython_interactive(ipython): + """Activate IPython's builtin hooks + + for the duration of the test scope. 
+ """ + with ipython.builtin_trap: + yield ipython + + +def test_bigquery_magic(ipython_interactive): + ip = IPython.get_ipython() + current_process = psutil.Process() + conn_count_start = len(current_process.connections()) + + ip.extension_manager.load_extension("google.cloud.bigquery") + sql = """ + SELECT + CONCAT( + 'https://stackoverflow.com/questions/', + CAST(id as STRING)) as url, + view_count + FROM `bigquery-public-data.stackoverflow.posts_questions` + WHERE tags like '%google-bigquery%' + ORDER BY view_count DESC + LIMIT 10 + """ + with io.capture_output() as captured: + result = ip.run_cell_magic("bigquery", "--use_rest_api", sql) + + conn_count_end = len(current_process.connections()) + + lines = re.split("\n|\r", captured.stdout) + # Removes blanks & terminal code (result of display clearing) + updates = list(filter(lambda x: bool(x) and x != "\x1b[2K", lines)) + assert re.match("Executing query with job ID: .*", updates[0]) + assert all(re.match("Query executing: .*s", line) for line in updates[1:-1]) + assert re.match("Query complete after .*s", updates[-1]) + assert isinstance(result, pandas.DataFrame) + assert len(result) == 10 # verify row count + assert list(result) == ["url", "view_count"] # verify column names + + # NOTE: For some reason, the number of open sockets is sometimes one *less* + # than expected when running system tests on Kokoro, thus using the <= assertion. + # That's still fine, however, since the sockets are apparently not leaked. + assert conn_count_end <= conn_count_start # system resources are released diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py new file mode 100644 index 000000000..93ce23481 --- /dev/null +++ b/tests/system/test_pandas.py @@ -0,0 +1,985 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for pandas connector.""" + +import collections +import datetime +import decimal +import json +import io +import operator + +import google.api_core.retry +import pkg_resources +import pytest + +from google.cloud import bigquery +from . import helpers + + +bigquery_storage = pytest.importorskip( + "google.cloud.bigquery_storage", minversion="2.0.0" +) +pandas = pytest.importorskip("pandas", minversion="0.23.0") +pyarrow = pytest.importorskip("pyarrow", minversion="1.0.0") + + +PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version +PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0") + + +class MissingDataError(Exception): + pass + + +def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_id): + """Test that a DataFrame with dtypes that map well to BigQuery types + can be uploaded without specifying a schema. 
+ + https://github.com/googleapis/google-cloud-python/issues/9044 + """ + df_data = collections.OrderedDict( + [ + ("bool_col", pandas.Series([True, False, True], dtype="bool")), + ( + "ts_col", + pandas.Series( + [ + datetime.datetime(2010, 1, 2, 3, 44, 50), + datetime.datetime(2011, 2, 3, 14, 50, 59), + datetime.datetime(2012, 3, 14, 15, 16), + ], + dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + ), + ( + "dt_col", + pandas.Series( + [ + datetime.datetime(2010, 1, 2, 3, 44, 50), + datetime.datetime(2011, 2, 3, 14, 50, 59), + datetime.datetime(2012, 3, 14, 15, 16), + ], + dtype="datetime64[ns]", + ), + ), + ("float32_col", pandas.Series([1.0, 2.0, 3.0], dtype="float32")), + ("float64_col", pandas.Series([4.0, 5.0, 6.0], dtype="float64")), + ("int8_col", pandas.Series([-12, -11, -10], dtype="int8")), + ("int16_col", pandas.Series([-9, -8, -7], dtype="int16")), + ("int32_col", pandas.Series([-6, -5, -4], dtype="int32")), + ("int64_col", pandas.Series([-3, -2, -1], dtype="int64")), + ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), + ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), + ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), + ] + ) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_automatic_schema".format( + bigquery_client.project, dataset_id + ) + + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + # BigQuery does not support uploading DATETIME values from + # Parquet files. 
See: + # https://github.com/googleapis/google-cloud-python/issues/9996 + bigquery.SchemaField("dt_col", "TIMESTAMP"), + bigquery.SchemaField("float32_col", "FLOAT"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("int8_col", "INTEGER"), + bigquery.SchemaField("int16_col", "INTEGER"), + bigquery.SchemaField("int32_col", "INTEGER"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("uint8_col", "INTEGER"), + bigquery.SchemaField("uint16_col", "INTEGER"), + bigquery.SchemaField("uint32_col", "INTEGER"), + ) + assert table.num_rows == 3 + + +@pytest.mark.skipif( + PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, + reason="Only `pandas version >=1.0.0` is supported", +) +def test_load_table_from_dataframe_w_nullable_int64_datatype( + bigquery_client, dataset_id +): + """Test that a DataFrame containing column with None-type values and int64 datatype + can be uploaded if a BigQuery schema is specified. + + https://github.com/googleapis/python-bigquery/issues/22 + """ + table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( + bigquery_client.project, dataset_id + ) + table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),) + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + df_data = collections.OrderedDict( + [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] + ) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) + assert table.num_rows == 4 + + +@pytest.mark.skipif( + PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, + reason="Only `pandas version >=1.0.0` is supported", +) +def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema( + bigquery_client, dataset_id, table_id +): + 
"""Test that a DataFrame containing column with None-type values and int64 datatype + can be uploaded without specifying a schema. + + https://github.com/googleapis/python-bigquery/issues/22 + """ + + df_data = collections.OrderedDict( + [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] + ) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) + assert table.num_rows == 4 + + +def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): + """Test that a DataFrame with null columns can be uploaded if a + BigQuery schema is specified. + + See: https://github.com/googleapis/google-cloud-python/issues/7370 + """ + # Schema with all scalar types. + scalars_schema = ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("dt_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), + bigquery.SchemaField("str_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + ) + + table_schema = scalars_schema + ( + # TODO: Array columns can't be read due to NULLABLE versus REPEATED + # mode mismatch. See: + # https://issuetracker.google.com/133415569#comment3 + # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"), + # TODO: Support writing StructArrays to Parquet. 
See: + # https://jira.apache.org/jira/browse/ARROW-2587 + # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), + ) + num_rows = 100 + nulls = [None] * num_rows + df_data = [ + ("bool_col", nulls), + ("bytes_col", nulls), + ("date_col", nulls), + ("dt_col", nulls), + ("float_col", nulls), + ("geo_col", nulls), + ("int_col", nulls), + ("num_col", nulls), + ("bignum_col", nulls), + ("str_col", nulls), + ("time_col", nulls), + ("ts_col", nulls), + ] + df_data = collections.OrderedDict(df_data) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( + bigquery_client.project, dataset_id + ) + + # Create the table before loading so that schema mismatch errors are + # identified. + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + job_config = bigquery.LoadJobConfig(schema=table_schema) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table) + assert tuple(table.schema) == table_schema + assert table.num_rows == num_rows + + +def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id): + """Test that a DataFrame with required columns can be uploaded if a + BigQuery schema is specified. + + See: https://github.com/googleapis/google-cloud-python/issues/8093 + """ + table_schema = ( + bigquery.SchemaField("name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + ) + + records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}] + dataframe = pandas.DataFrame(records, columns=["name", "age"]) + job_config = bigquery.LoadJobConfig(schema=table_schema) + table_id = "{}.{}.load_table_from_dataframe_w_required".format( + bigquery_client.project, dataset_id + ) + + # Create the table before loading so that schema mismatch errors are + # identified. 
+ table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + job_config = bigquery.LoadJobConfig(schema=table_schema) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table) + assert tuple(table.schema) == table_schema + assert table.num_rows == 2 + + +def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id): + # Schema with all scalar types. + # See: + # https://github.com/googleapis/python-bigquery/issues/61 + # https://issuetracker.google.com/issues/151765076 + scalars_schema = ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("dt_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), + bigquery.SchemaField("str_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + ) + + table_schema = scalars_schema + ( + # TODO: Array columns can't be read due to NULLABLE versus REPEATED + # mode mismatch. See: + # https://issuetracker.google.com/133415569#comment3 + # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"), + # TODO: Support writing StructArrays to Parquet. 
See: + # https://jira.apache.org/jira/browse/ARROW-2587 + # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), + ) + + df_data = [ + ("bool_col", [True, None, False]), + ("bytes_col", [b"abc", None, b"def"]), + ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)]), + ( + "dt_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0), + None, + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ), + ("float_col", [float("-inf"), float("nan"), float("inf")]), + ( + "geo_col", + ["POINT(30 10)", None, "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], + ), + ("int_col", [-9223372036854775808, None, 9223372036854775807]), + ( + "num_col", + [ + decimal.Decimal("-99999999999999999999999999999.999999999"), + None, + decimal.Decimal("99999999999999999999999999999.999999999"), + ], + ), + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + ), + ("str_col", ["abc", None, "def"]), + ( + "time_col", + [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], + ), + ( + "ts_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), + None, + datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ), + ] + df_data = collections.OrderedDict(df_data) + dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( + bigquery_client.project, dataset_id + ) + + job_config = bigquery.LoadJobConfig(schema=table_schema) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == table_schema + assert table.num_rows == 3 + + +def test_load_table_from_dataframe_w_struct_datatype(bigquery_client, dataset_id): + """Test that a DataFrame with struct datatype can 
be uploaded if a + BigQuery schema is specified. + + https://github.com/googleapis/python-bigquery/issues/21 + """ + table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format( + bigquery_client.project, dataset_id + ) + table_schema = [ + bigquery.SchemaField( + "bar", + "RECORD", + fields=[ + bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + ], + mode="REQUIRED", + ), + ] + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}] + dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"]) + + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert table.schema == table_schema + assert table.num_rows == 3 + + +def test_load_table_from_dataframe_w_explicit_schema_source_format_csv( + bigquery_client, dataset_id +): + from google.cloud.bigquery.job import SourceFormat + + table_schema = ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("dt_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), + bigquery.SchemaField("str_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + ) + df_data = collections.OrderedDict( + [ + ("bool_col", [True, None, False]), + ("bytes_col", ["abc", None, "def"]), + ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)],), + ( + "dt_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0), + None, + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], 
+ ), + ("float_col", [float("-inf"), float("nan"), float("inf")]), + ( + "geo_col", + [ + "POINT(30 10)", + None, + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", + ], + ), + ("int_col", [-9223372036854775808, None, 9223372036854775807]), + ( + "num_col", + [ + decimal.Decimal("-99999999999999999999999999999.999999999"), + None, + decimal.Decimal("99999999999999999999999999999.999999999"), + ], + ), + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + ), + ("str_col", ["abc", None, "def"]), + ( + "time_col", + [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], + ), + ( + "ts_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), + None, + datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ), + ] + ) + dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format( + bigquery_client.project, dataset_id + ) + + job_config = bigquery.LoadJobConfig( + schema=table_schema, source_format=SourceFormat.CSV + ) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == table_schema + assert table.num_rows == 3 + + +def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats( + bigquery_client, dataset_id, table_id +): + from google.cloud.bigquery.job import SourceFormat + + table_schema = (bigquery.SchemaField("float_col", "FLOAT"),) + df_data = collections.OrderedDict( + [ + ( + "float_col", + [ + 0.14285714285714285, + 0.51428571485748, + 0.87128748, + 1.807960649, + 2.0679610649, + 2.4406779661016949, + 3.7148514257, + 3.8571428571428572, + 1.51251252e40, + ], + ), + ] + ) + dataframe = pandas.DataFrame(df_data, dtype="object", 
columns=df_data.keys()) + + job_config = bigquery.LoadJobConfig( + schema=table_schema, source_format=SourceFormat.CSV + ) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table_id) + rows = bigquery_client.list_rows(table_id) + floats = [r.values()[0] for r in rows] + assert tuple(table.schema) == table_schema + assert table.num_rows == 9 + assert floats == df_data["float_col"] + + +def test_query_results_to_dataframe(bigquery_client): + QUERY = """ + SELECT id, author, time_ts, dead + FROM `bigquery-public-data.hacker_news.comments` + LIMIT 10 + """ + + df = bigquery_client.query(QUERY).result().to_dataframe() + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 10 # verify the number of rows + column_names = ["id", "author", "time_ts", "dead"] + assert list(df) == column_names # verify the column names + exp_datatypes = { + "id": int, + "author": str, + "time_ts": pandas.Timestamp, + "dead": bool, + } + for _, row in df.iterrows(): + for col in column_names: + # all the schema fields are nullable, so None is acceptable + if not row[col] is None: + assert isinstance(row[col], exp_datatypes[col]) + + +def test_query_results_to_dataframe_w_bqstorage(bigquery_client): + query = """ + SELECT id, author, time_ts, dead + FROM `bigquery-public-data.hacker_news.comments` + LIMIT 10 + """ + + bqstorage_client = bigquery_storage.BigQueryReadClient( + credentials=bigquery_client._credentials + ) + + df = bigquery_client.query(query).result().to_dataframe(bqstorage_client) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 10 # verify the number of rows + column_names = ["id", "author", "time_ts", "dead"] + assert list(df) == column_names + exp_datatypes = { + "id": int, + "author": str, + "time_ts": pandas.Timestamp, + "dead": bool, + } + for index, row in df.iterrows(): + for col in column_names: + # all the schema fields are nullable, 
so None is acceptable + if not row[col] is None: + assert isinstance(row[col], exp_datatypes[col]) + + +def test_insert_rows_from_dataframe(bigquery_client, dataset_id): + SF = bigquery.SchemaField + schema = [ + SF("float_col", "FLOAT", mode="REQUIRED"), + SF("int_col", "INTEGER", mode="REQUIRED"), + SF("bool_col", "BOOLEAN", mode="REQUIRED"), + SF("string_col", "STRING", mode="NULLABLE"), + ] + + dataframe = pandas.DataFrame( + [ + { + "float_col": 1.11, + "bool_col": True, + "string_col": "my string", + "int_col": 10, + }, + { + "float_col": 2.22, + "bool_col": False, + "string_col": "another string", + "int_col": 20, + }, + { + "float_col": 3.33, + "bool_col": False, + "string_col": "another string", + "int_col": 30, + }, + { + "float_col": 4.44, + "bool_col": True, + "string_col": "another string", + "int_col": 40, + }, + { + "float_col": 5.55, + "bool_col": False, + "string_col": "another string", + "int_col": 50, + }, + { + "float_col": 6.66, + "bool_col": True, + # Include a NaN value, because pandas often uses NaN as a + # NULL value indicator. + "string_col": float("NaN"), + "int_col": 60, + }, + ] + ) + + table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe" + table_arg = bigquery.Table(table_id, schema=schema) + table = helpers.retry_403(bigquery_client.create_table)(table_arg) + + chunk_errors = bigquery_client.insert_rows_from_dataframe( + table, dataframe, chunk_size=3 + ) + for errors in chunk_errors: + assert not errors + expected = [ + # Pandas often represents NULL values as NaN. Convert to None for + # easier comparison. + tuple(None if col != col else col for col in data_row) + for data_row in dataframe.itertuples(index=False) + ] + + # Use query to fetch rows instead of listing directly from the table so + # that we get values from the streaming buffer "within a few seconds". 
+ # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability + @google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type(MissingDataError) + ) + def get_rows(): + rows = list( + bigquery_client.query( + "SELECT * FROM `{}.{}.{}`".format( + table.project, table.dataset_id, table.table_id + ) + ) + ) + if len(rows) != len(expected): + raise MissingDataError() + return rows + + rows = get_rows() + sorted_rows = sorted(rows, key=operator.attrgetter("int_col")) + row_tuples = [r.values() for r in sorted_rows] + + for row, expected_row in zip(row_tuples, expected): + assert ( + # Use Counter to verify the same number of values in each, because + # column order does not matter. + collections.Counter(row) + == collections.Counter(expected_row) + ) + + +def test_nested_table_to_dataframe(bigquery_client, dataset_id): + from google.cloud.bigquery.job import SourceFormat + from google.cloud.bigquery.job import WriteDisposition + + SF = bigquery.SchemaField + schema = [ + SF("string_col", "STRING", mode="NULLABLE"), + SF( + "record_col", + "RECORD", + mode="NULLABLE", + fields=[ + SF("nested_string", "STRING", mode="NULLABLE"), + SF("nested_repeated", "INTEGER", mode="REPEATED"), + SF( + "nested_record", + "RECORD", + mode="NULLABLE", + fields=[SF("nested_nested_string", "STRING", mode="NULLABLE")], + ), + ], + ), + SF("bigfloat_col", "FLOAT", mode="NULLABLE"), + SF("smallfloat_col", "FLOAT", mode="NULLABLE"), + ] + record = { + "nested_string": "another string value", + "nested_repeated": [0, 1, 2], + "nested_record": {"nested_nested_string": "some deep insight"}, + } + to_insert = [ + { + "string_col": "Some value", + "record_col": record, + "bigfloat_col": 3.14, + "smallfloat_col": 2.72, + } + ] + rows = [json.dumps(row) for row in to_insert] + body = io.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) + table_id = f"{bigquery_client.project}.{dataset_id}.test_nested_table_to_dataframe" + job_config = 
bigquery.LoadJobConfig() + job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE + job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON + job_config.schema = schema + # Load a table using a local JSON file from memory. + bigquery_client.load_table_from_file(body, table_id, job_config=job_config).result() + + df = bigquery_client.list_rows(table_id, selected_fields=schema).to_dataframe( + dtypes={"smallfloat_col": "float16"} + ) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 1 # verify the number of rows + exp_columns = ["string_col", "record_col", "bigfloat_col", "smallfloat_col"] + assert list(df) == exp_columns # verify the column names + row = df.iloc[0] + # verify the row content + assert row["string_col"] == "Some value" + expected_keys = tuple(sorted(record.keys())) + row_keys = tuple(sorted(row["record_col"].keys())) + assert row_keys == expected_keys + # Can't compare numpy arrays, which pyarrow encodes the embedded + # repeated column to, so convert to list. 
+ assert list(row["record_col"]["nested_repeated"]) == [0, 1, 2] + # verify that nested data can be accessed with indices/keys + assert row["record_col"]["nested_repeated"][0] == 0 + assert ( + row["record_col"]["nested_record"]["nested_nested_string"] + == "some deep insight" + ) + # verify dtypes + assert df.dtypes["bigfloat_col"].name == "float64" + assert df.dtypes["smallfloat_col"].name == "float16" + + +def test_list_rows_max_results_w_bqstorage(bigquery_client): + table_ref = bigquery.DatasetReference("bigquery-public-data", "utility_us").table( + "country_code_iso" + ) + bqstorage_client = bigquery_storage.BigQueryReadClient( + credentials=bigquery_client._credentials + ) + + row_iterator = bigquery_client.list_rows( + table_ref, + selected_fields=[bigquery.SchemaField("country_name", "STRING")], + max_results=100, + ) + with pytest.warns( + UserWarning, match="Cannot use bqstorage_client if max_results is set" + ): + dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) + + assert len(dataframe.index) == 100 + + +def test_upload_time_and_datetime_56(bigquery_client, dataset_id): + df = pandas.DataFrame( + dict( + dt=[ + datetime.datetime(2020, 1, 8, 8, 0, 0), + datetime.datetime( + 2020, + 1, + 8, + 8, + 0, + 0, + tzinfo=datetime.timezone(datetime.timedelta(hours=-7)), + ), + ], + t=[datetime.time(0, 0, 10, 100001), None], + ) + ) + table = f"{dataset_id}.test_upload_time_and_datetime" + bigquery_client.load_table_from_dataframe(df, table).result() + data = list(map(list, bigquery_client.list_rows(table))) + assert data == [ + [ + datetime.datetime(2020, 1, 8, 8, 0, tzinfo=datetime.timezone.utc), + datetime.time(0, 0, 10, 100001), + ], + [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=datetime.timezone.utc), None], + ] + + from google.cloud.bigquery import job, schema + + table = f"{dataset_id}.test_upload_time_and_datetime_dt" + config = job.LoadJobConfig( + schema=[schema.SchemaField("dt", "DATETIME"), schema.SchemaField("t", "TIME")] + ) 
+ + bigquery_client.load_table_from_dataframe(df, table, job_config=config).result() + data = list(map(list, bigquery_client.list_rows(table))) + assert data == [ + [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)], + [datetime.datetime(2020, 1, 8, 15, 0), None], + ] + + +def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): + wkt = pytest.importorskip("shapely.wkt") + bigquery_client.query( + f"create table {dataset_id}.lake (name string, geog geography)" + ).result() + bigquery_client.query( + f""" + insert into {dataset_id}.lake (name, geog) values + ('foo', st_geogfromtext('point(0 0)')), + ('bar', st_geogfromtext('point(0 1)')), + ('baz', null) + """ + ).result() + df = bigquery_client.query( + f"select * from {dataset_id}.lake order by name" + ).to_dataframe(geography_as_object=True) + assert list(df["name"]) == ["bar", "baz", "foo"] + assert df["geog"][0] == wkt.loads("point(0 1)") + assert pandas.isna(df["geog"][1]) + assert df["geog"][2] == wkt.loads("point(0 0)") + + +def test_to_geodataframe(bigquery_client, dataset_id): + geopandas = pytest.importorskip("geopandas") + from shapely import wkt + + bigquery_client.query( + f"create table {dataset_id}.geolake (name string, geog geography)" + ).result() + bigquery_client.query( + f""" + insert into {dataset_id}.geolake (name, geog) values + ('foo', st_geogfromtext('point(0 0)')), + ('bar', st_geogfromtext('polygon((0 0, 1 0, 1 1, 0 0))')), + ('baz', null) + """ + ).result() + df = bigquery_client.query( + f"select * from {dataset_id}.geolake order by name" + ).to_geodataframe() + assert df["geog"][0] == wkt.loads("polygon((0 0, 1 0, 1 1, 0 0))") + assert pandas.isna(df["geog"][1]) + assert df["geog"][2] == wkt.loads("point(0 0)") + assert isinstance(df, geopandas.GeoDataFrame) + assert isinstance(df["geog"], geopandas.GeoSeries) + assert df.area[0] == 0.5 + assert pandas.isna(df.area[1]) + assert df.area[2] == 0.0 + assert df.crs.srs == "EPSG:4326" + assert df.crs.name 
== "WGS 84" + assert df.geog.crs.srs == "EPSG:4326" + assert df.geog.crs.name == "WGS 84" + + +def test_load_geodataframe(bigquery_client, dataset_id): + geopandas = pytest.importorskip("geopandas") + import pandas + from shapely import wkt + from google.cloud.bigquery.schema import SchemaField + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) + + table_id = f"{dataset_id}.lake_from_gp" + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo1", "GEOGRAPHY", "NULLABLE"), + SchemaField("geo2", "GEOGRAPHY", "NULLABLE"), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", None, "POINT(1 1)"], + ["foo", None, None], + ] + + +def test_load_dataframe_w_shapely(bigquery_client, dataset_id): + wkt = pytest.importorskip("shapely.wkt") + from google.cloud.bigquery.schema import SchemaField + + df = pandas.DataFrame( + dict(name=["foo", "bar"], geo=[None, wkt.loads("Point(1 1)")]) + ) + + table_id = f"{dataset_id}.lake_from_shapes" + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo", "GEOGRAPHY", "NULLABLE"), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", "POINT(1 1)"], + ["foo", None], + ] + + bigquery_client.load_table_from_dataframe(df, table_id).result() + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", "POINT(1 1)"], + ["bar", "POINT(1 1)"], + ["foo", None], + ["foo", None], + ] + + +def test_load_dataframe_w_wkb(bigquery_client, dataset_id): + wkt = pytest.importorskip("shapely.wkt") + from shapely import wkb + from google.cloud.bigquery.schema import 
SchemaField + + df = pandas.DataFrame( + dict(name=["foo", "bar"], geo=[None, wkb.dumps(wkt.loads("Point(1 1)"))]) + ) + + table_id = f"{dataset_id}.lake_from_wkb" + # We create the table first, to inform the interpretation of the wkb data + bigquery_client.query( + f"create table {table_id} (name string, geo GEOGRAPHY)" + ).result() + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo", "GEOGRAPHY", "NULLABLE"), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", "POINT(1 1)"], + ["foo", None], + ] diff --git a/tests/system/test_structs.py b/tests/system/test_structs.py new file mode 100644 index 000000000..20740f614 --- /dev/null +++ b/tests/system/test_structs.py @@ -0,0 +1,31 @@ +import datetime + +import pytest + +from google.cloud.bigquery.dbapi import connect + +person_type = "struct<name string," " children array<struct<name string, bdate date>>>" +person_type_sized = ( + "struct<name string(22)," " children array<struct<name string(22), bdate date>>>" +) + + +@pytest.mark.parametrize("person_type_decl", [person_type, person_type_sized]) +def test_structs(bigquery_client, dataset_id, person_type_decl, table_id): + conn = connect(bigquery_client) + cursor = conn.cursor() + cursor.execute(f"create table {table_id} (person {person_type_decl})") + data = dict( + name="par", + children=[ + dict(name="ch1", bdate=datetime.date(2021, 1, 1)), + dict(name="ch2", bdate=datetime.date(2021, 1, 2)), + ], + ) + cursor.execute( + f"insert into {table_id} (person) values (%(v:{person_type})s)", dict(v=data), + ) + + cursor.execute(f"select * from {table_id}") + [[result]] = list(cursor) + assert result == data diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index df379f1e9..4de65971c 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -1,4 +1,5 @@ -# Copyright 2016 Google LLC +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); 
# you may not use this file except in compliance with the License. @@ -11,3 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 000000000..7a67ea6b5 --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,37 @@ +# Copyright 2021 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from .helpers import make_client + + +@pytest.fixture +def client(): + yield make_client() + + +@pytest.fixture +def PROJECT(): + yield "PROJECT" + + +@pytest.fixture +def DS_ID(): + yield "DATASET_ID" + + +@pytest.fixture +def LOCATION(): + yield "us-central" diff --git a/tests/unit/enums/test_standard_sql_data_types.py b/tests/unit/enums/test_standard_sql_data_types.py index 6fa4f057f..7f62c46fd 100644 --- a/tests/unit/enums/test_standard_sql_data_types.py +++ b/tests/unit/enums/test_standard_sql_data_types.py @@ -32,7 +32,7 @@ def enum_under_test(): @pytest.fixture def gapic_enum(): """The referential autogenerated enum the enum under test is based on.""" - from google.cloud.bigquery_v2.gapic.enums import StandardSqlDataType + from google.cloud.bigquery_v2.types import StandardSqlDataType return StandardSqlDataType.TypeKind @@ -61,7 +61,10 @@ def test_standard_sql_types_enum_members(enum_under_test, gapic_enum): assert name not in enum_under_test.__members__ -def test_standard_sql_types_enum_docstring(enum_under_test, gapic_enum): +@pytest.mark.skip(reason="Code generator issue, the docstring is not generated.") +def test_standard_sql_types_enum_docstring( + enum_under_test, gapic_enum +): # pragma: NO COVER assert "STRUCT (int):" not in enum_under_test.__doc__ assert "BOOL (int):" in enum_under_test.__doc__ assert "TIME (int):" in enum_under_test.__doc__ diff --git a/tests/unit/gapic/__init__.py b/tests/unit/gapic/__init__.py new file mode 100644 index 000000000..4de65971c --- /dev/null +++ b/tests/unit/gapic/__init__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/tests/unit/helpers.py b/tests/unit/helpers.py index eea345e89..67aeaca35 100644 --- a/tests/unit/helpers.py +++ b/tests/unit/helpers.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import google.cloud.bigquery.client +import google.cloud.bigquery.dataset +import mock +import pytest + def make_connection(*responses): import google.cloud.bigquery._http @@ -21,6 +26,8 @@ def make_connection(*responses): mock_conn = mock.create_autospec(google.cloud.bigquery._http.Connection) mock_conn.user_agent = "testing 1.2.3" mock_conn.api_request.side_effect = list(responses) + [NotFound("miss")] + mock_conn.API_BASE_URL = "https://bigquery.googleapis.com" + mock_conn.get_api_base_url_for_mtls = mock.Mock(return_value=mock_conn.API_BASE_URL) return mock_conn @@ -29,3 +36,47 @@ def _to_pyarrow(value): import pyarrow return pyarrow.array([value])[0] + + +def make_client(project="PROJECT", **kw): + credentials = mock.Mock(spec=google.auth.credentials.Credentials) + return google.cloud.bigquery.client.Client(project, credentials, **kw) + + +def make_dataset_reference_string(project, ds_id): + return f"{project}.{ds_id}" + + +def make_dataset(project, ds_id): + return google.cloud.bigquery.dataset.Dataset( + google.cloud.bigquery.dataset.DatasetReference(project, ds_id) + ) + + +def make_dataset_list_item(project, ds_id): + return google.cloud.bigquery.dataset.DatasetListItem( + dict(datasetReference=dict(projectId=project, datasetId=ds_id)) + ) + + +def identity(x): + return x + + 
+def get_reference(x): + return x.reference + + +dataset_like = [ + (google.cloud.bigquery.dataset.DatasetReference, identity), + (make_dataset, identity), + (make_dataset_list_item, get_reference), + ( + make_dataset_reference_string, + google.cloud.bigquery.dataset.DatasetReference.from_string, + ), +] + +dataset_polymorphic = pytest.mark.parametrize( + "make_dataset,get_reference", dataset_like +) diff --git a/tests/unit/job/__init__.py b/tests/unit/job/__init__.py new file mode 100644 index 000000000..c6334245a --- /dev/null +++ b/tests/unit/job/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/job/helpers.py b/tests/unit/job/helpers.py new file mode 100644 index 000000000..c792214e7 --- /dev/null +++ b/tests/unit/job/helpers.py @@ -0,0 +1,205 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import mock +from google.api_core import exceptions + + +def _make_credentials(): + import google.auth.credentials + + return mock.Mock(spec=google.auth.credentials.Credentials) + + +def _make_client(project="test-project", connection=None): + from google.cloud.bigquery.client import Client + + if connection is None: + connection = _make_connection() + + client = Client(project=project, credentials=_make_credentials(), _http=object()) + client._connection = connection + return client + + +def _make_connection(*responses): + import google.cloud.bigquery._http + from google.cloud.exceptions import NotFound + + mock_conn = mock.create_autospec(google.cloud.bigquery._http.Connection) + mock_conn.api_request.side_effect = list(responses) + [NotFound("miss")] + return mock_conn + + +def _make_retriable_exception(): + return exceptions.TooManyRequests( + "retriable exception", errors=[{"reason": "rateLimitExceeded"}] + ) + + +def _make_job_resource( + creation_time_ms=1437767599006, + started_time_ms=1437767600007, + ended_time_ms=1437767601008, + started=False, + ended=False, + etag="abc-def-hjk", + endpoint="https://bigquery.googleapis.com", + job_type="load", + job_id="a-random-id", + location="US", + project_id="some-project", + user_email="bq-user@example.com", +): + resource = { + "status": {"state": "PENDING"}, + "configuration": {job_type: {}}, + "statistics": {"creationTime": creation_time_ms, job_type: {}}, + "etag": etag, + "id": "{}:{}".format(project_id, job_id), + "jobReference": { + "projectId": project_id, + "jobId": job_id, + "location": location, + }, + "selfLink": "{}/bigquery/v2/projects/{}/jobs/{}".format( + endpoint, project_id, job_id + ), + "user_email": user_email, + } + + if started or ended: + resource["statistics"]["startTime"] = started_time_ms + resource["status"]["state"] = "RUNNING" + + if ended: + resource["statistics"]["endTime"] = ended_time_ms + resource["status"]["state"] = "DONE" + + if job_type == "query": + 
resource["configuration"]["query"]["destinationTable"] = { + "projectId": project_id, + "datasetId": "_temp_dataset", + "tableId": "_temp_table", + } + + return resource + + +class _Base(unittest.TestCase): + from google.cloud.bigquery.dataset import DatasetReference + from google.cloud.bigquery.table import TableReference + + ENDPOINT = "https://bigquery.googleapis.com" + PROJECT = "project" + SOURCE1 = "http://example.com/source1.csv" + DS_ID = "dataset_id" + DS_REF = DatasetReference(PROJECT, DS_ID) + TABLE_ID = "table_id" + TABLE_REF = TableReference(DS_REF, TABLE_ID) + JOB_ID = "JOB_ID" + JOB_TYPE = "unknown" + KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1" + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def _setUpConstants(self): + import datetime + from google.cloud._helpers import UTC + + self.WHEN_TS = 1437767599.006 + self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.ETAG = "ETAG" + self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID) + self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format( + self.ENDPOINT, self.PROJECT, self.JOB_ID + ) + self.USER_EMAIL = "phred@example.com" + + def _table_ref(self, table_id): + from google.cloud.bigquery.table import TableReference + + return TableReference(self.DS_REF, table_id) + + def _make_resource(self, started=False, ended=False, location="US"): + self._setUpConstants() + return _make_job_resource( + creation_time_ms=int(self.WHEN_TS * 1000), + started_time_ms=int(self.WHEN_TS * 1000), + ended_time_ms=int(self.WHEN_TS * 1000) + 1000000, + started=started, + ended=ended, + etag=self.ETAG, + endpoint=self.ENDPOINT, + job_type=self.JOB_TYPE, + job_id=self.JOB_ID, + project_id=self.PROJECT, + user_email=self.USER_EMAIL, + location=location, + ) + + def _verifyInitialReadonlyProperties(self, job): + # root elements of resource + self.assertIsNone(job.etag) + self.assertIsNone(job.self_link) + 
self.assertIsNone(job.user_email) + + # derived from resource['statistics'] + self.assertIsNone(job.created) + self.assertIsNone(job.started) + self.assertIsNone(job.ended) + self.assertIsNone(job.transaction_info) + + # derived from resource['status'] + self.assertIsNone(job.error_result) + self.assertIsNone(job.errors) + self.assertIsNone(job.state) + + def _verifyReadonlyResourceProperties(self, job, resource): + from datetime import timedelta + + statistics = resource.get("statistics", {}) + + if "creationTime" in statistics: + self.assertEqual(job.created, self.WHEN) + else: + self.assertIsNone(job.created) + + if "startTime" in statistics: + self.assertEqual(job.started, self.WHEN) + else: + self.assertIsNone(job.started) + + if "endTime" in statistics: + self.assertEqual(job.ended, self.WHEN + timedelta(seconds=1000)) + else: + self.assertIsNone(job.ended) + + if "etag" in resource: + self.assertEqual(job.etag, self.ETAG) + else: + self.assertIsNone(job.etag) + + if "selfLink" in resource: + self.assertEqual(job.self_link, self.RESOURCE_URL) + else: + self.assertIsNone(job.self_link) + + if "user_email" in resource: + self.assertEqual(job.user_email, self.USER_EMAIL) + else: + self.assertIsNone(job.user_email) diff --git a/tests/unit/job/test_base.py b/tests/unit/job/test_base.py new file mode 100644 index 000000000..c3f7854e3 --- /dev/null +++ b/tests/unit/job/test_base.py @@ -0,0 +1,1164 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import http +import unittest + +from google.api_core import exceptions +import google.api_core.retry +import mock +import pytest + +from .helpers import _make_client +from .helpers import _make_connection +from .helpers import _make_retriable_exception +from .helpers import _make_job_resource + + +class Test__error_result_to_exception(unittest.TestCase): + def _call_fut(self, *args, **kwargs): + from google.cloud.bigquery import job + + return job._error_result_to_exception(*args, **kwargs) + + def test_simple(self): + error_result = {"reason": "invalid", "message": "bad request"} + exception = self._call_fut(error_result) + self.assertEqual(exception.code, http.client.BAD_REQUEST) + self.assertTrue(exception.message.startswith("bad request")) + self.assertIn(error_result, exception.errors) + + def test_missing_reason(self): + error_result = {} + exception = self._call_fut(error_result) + self.assertEqual(exception.code, http.client.INTERNAL_SERVER_ERROR) + + +class Test_JobReference(unittest.TestCase): + JOB_ID = "job-id" + PROJECT = "test-project-123" + LOCATION = "us-central" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery import job + + return job._JobReference + + def _make_one(self, job_id, project, location): + return self._get_target_class()(job_id, project, location) + + def test_ctor(self): + job_ref = self._make_one(self.JOB_ID, self.PROJECT, self.LOCATION) + + self.assertEqual(job_ref.job_id, self.JOB_ID) + self.assertEqual(job_ref.project, self.PROJECT) + self.assertEqual(job_ref.location, self.LOCATION) + + def test__to_api_repr(self): + job_ref = self._make_one(self.JOB_ID, self.PROJECT, self.LOCATION) + + self.assertEqual( + job_ref._to_api_repr(), + { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": self.LOCATION, + }, + ) + + def test_from_api_repr(self): + api_repr = { + "jobId": 
self.JOB_ID, + "projectId": self.PROJECT, + "location": self.LOCATION, + } + + job_ref = self._get_target_class()._from_api_repr(api_repr) + + self.assertEqual(job_ref.job_id, self.JOB_ID) + self.assertEqual(job_ref.project, self.PROJECT) + self.assertEqual(job_ref.location, self.LOCATION) + + +class Test_AsyncJob(unittest.TestCase): + JOB_ID = "job-id" + PROJECT = "test-project-123" + LOCATION = "us-central" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery import job + + return job._AsyncJob + + def _make_one(self, job_id, client): + return self._get_target_class()(job_id, client) + + def _make_derived_class(self): + class Derived(self._get_target_class()): + _JOB_TYPE = "derived" + + return Derived + + def _make_derived(self, job_id, client): + return self._make_derived_class()(job_id, client) + + @staticmethod + def _job_reference(job_id, project, location): + from google.cloud.bigquery import job + + return job._JobReference(job_id, project, location) + + def test_ctor_w_bare_job_id(self): + import threading + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + + self.assertEqual(job.job_id, self.JOB_ID) + self.assertEqual(job.project, self.PROJECT) + self.assertIsNone(job.location) + self.assertIs(job._client, client) + self.assertEqual( + job._properties, + {"jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}}, + ) + self.assertIsInstance(job._completion_lock, type(threading.Lock())) + self.assertEqual( + job.path, "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + ) + + def test_ctor_w_job_ref(self): + import threading + + other_project = "other-project-234" + client = _make_client(project=other_project) + job_ref = self._job_reference(self.JOB_ID, self.PROJECT, self.LOCATION) + job = self._make_one(job_ref, client) + + self.assertEqual(job.job_id, self.JOB_ID) + self.assertEqual(job.project, self.PROJECT) + self.assertEqual(job.location, self.LOCATION) + 
self.assertIs(job._client, client) + self.assertEqual( + job._properties, + { + "jobReference": { + "projectId": self.PROJECT, + "location": self.LOCATION, + "jobId": self.JOB_ID, + } + }, + ) + self.assertFalse(job._result_set) + self.assertIsInstance(job._completion_lock, type(threading.Lock())) + self.assertEqual( + job.path, "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + ) + + def test__require_client_w_none(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + + self.assertIs(job._require_client(None), client) + + def test__require_client_w_other(self): + client = _make_client(project=self.PROJECT) + other = object() + job = self._make_one(self.JOB_ID, client) + + self.assertIs(job._require_client(other), other) + + def test_job_type(self): + client = _make_client(project=self.PROJECT) + derived = self._make_derived(self.JOB_ID, client) + + self.assertEqual(derived.job_type, "derived") + + def test_parent_job_id(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + + self.assertIsNone(job.parent_job_id) + job._properties["statistics"] = {"parentJobId": "parent-job-123"} + self.assertEqual(job.parent_job_id, "parent-job-123") + + def test_script_statistics(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + + self.assertIsNone(job.script_statistics) + job._properties["statistics"] = { + "scriptStatistics": { + "evaluationKind": "EXPRESSION", + "stackFrames": [ + { + "startLine": 5, + "startColumn": 29, + "endLine": 9, + "endColumn": 14, + "text": "QUERY TEXT", + } + ], + } + } + script_stats = job.script_statistics + self.assertEqual(script_stats.evaluation_kind, "EXPRESSION") + stack_frames = script_stats.stack_frames + self.assertEqual(len(stack_frames), 1) + stack_frame = stack_frames[0] + self.assertIsNone(stack_frame.procedure_id) + self.assertEqual(stack_frame.start_line, 5) + 
self.assertEqual(stack_frame.start_column, 29) + self.assertEqual(stack_frame.end_line, 9) + self.assertEqual(stack_frame.end_column, 14) + self.assertEqual(stack_frame.text, "QUERY TEXT") + + def test_transaction_info(self): + from google.cloud.bigquery.job.base import TransactionInfo + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + assert job.transaction_info is None + + statistics = job._properties["statistics"] = {} + assert job.transaction_info is None + + statistics["transactionInfo"] = {"transactionId": "123-abc-xyz"} + assert isinstance(job.transaction_info, TransactionInfo) + assert job.transaction_info.transaction_id == "123-abc-xyz" + + def test_num_child_jobs(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + + self.assertEqual(job.num_child_jobs, 0) + job._properties["statistics"] = {"numChildJobs": "17"} + self.assertEqual(job.num_child_jobs, 17) + + def test_labels_miss(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertEqual(job.labels, {}) + + def test_labels_update_in_place(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + labels = job.labels + labels["foo"] = "bar" # update in place + self.assertEqual(job.labels, {"foo": "bar"}) + + def test_labels_hit(self): + labels = {"foo": "bar"} + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties.setdefault("configuration", {})["labels"] = labels + self.assertEqual(job.labels, labels) + + def test_etag(self): + etag = "ETAG-123" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.etag) + job._properties["etag"] = etag + self.assertEqual(job.etag, etag) + + def test_self_link(self): + self_link = "https://api.example.com/123" + client = _make_client(project=self.PROJECT) + job = 
self._make_one(self.JOB_ID, client) + self.assertIsNone(job.self_link) + job._properties["selfLink"] = self_link + self.assertEqual(job.self_link, self_link) + + def test_user_email(self): + user_email = "user@example.com" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.user_email) + job._properties["user_email"] = user_email + self.assertEqual(job.user_email, user_email) + + @staticmethod + def _datetime_and_millis(): + import datetime + from google.cloud._helpers import _millis + + now = datetime.datetime.utcnow().replace( + microsecond=123000, + tzinfo=datetime.timezone.utc, # stats timestamps have ms precision + ) + return now, _millis(now) + + def test_created(self): + now, millis = self._datetime_and_millis() + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.created) + stats = job._properties["statistics"] = {} + self.assertIsNone(job.created) + stats["creationTime"] = millis + self.assertEqual(job.created, now) + + def test_started(self): + now, millis = self._datetime_and_millis() + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.started) + stats = job._properties["statistics"] = {} + self.assertIsNone(job.started) + stats["startTime"] = millis + self.assertEqual(job.started, now) + + def test_ended(self): + now, millis = self._datetime_and_millis() + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.ended) + stats = job._properties["statistics"] = {} + self.assertIsNone(job.ended) + stats["endTime"] = millis + self.assertEqual(job.ended, now) + + def test_reservation_usage_no_stats(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["statistics"] = {} + self.assertEqual(job.reservation_usage, []) + + def 
test_reservation_usage_stats_exist(self): + from google.cloud.bigquery.job import ReservationUsage + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["statistics"] = { + "reservationUsage": [ + {"name": "slot_foo", "slotMs": "42"}, + {"name": "slot_bar", "slotMs": "123"}, + ], + } + + expected = [ + ReservationUsage(name="slot_foo", slot_ms=42), + ReservationUsage(name="slot_bar", slot_ms=123), + ] + self.assertEqual(job.reservation_usage, expected) + + def test__job_statistics(self): + statistics = {"foo": "bar"} + client = _make_client(project=self.PROJECT) + derived = self._make_derived(self.JOB_ID, client) + self.assertEqual(derived._job_statistics(), {}) + stats = derived._properties["statistics"] = {} + self.assertEqual(derived._job_statistics(), {}) + stats["derived"] = statistics + self.assertEqual(derived._job_statistics(), statistics) + + def test_error_result(self): + error_result = { + "debugInfo": "DEBUG INFO", + "location": "LOCATION", + "message": "MESSAGE", + "reason": "REASON", + } + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.error_result) + status = job._properties["status"] = {} + self.assertIsNone(job.error_result) + status["errorResult"] = error_result + self.assertEqual(job.error_result, error_result) + + def test_errors(self): + errors = [ + { + "debugInfo": "DEBUG INFO", + "location": "LOCATION", + "message": "MESSAGE", + "reason": "REASON", + } + ] + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.errors) + status = job._properties["status"] = {} + self.assertIsNone(job.errors) + status["errors"] = errors + self.assertEqual(job.errors, errors) + + def test_state(self): + state = "STATE" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.state) + status = job._properties["status"] = {} + 
self.assertIsNone(job.state) + status["state"] = state + self.assertEqual(job.state, state) + + def _set_properties_job(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._set_future_result = mock.Mock() + job._properties = { + "jobReference": job._properties["jobReference"], + "foo": "bar", + } + return job + + def test__set_properties_no_stats(self): + config = {"test": True} + resource = {"configuration": config} + job = self._set_properties_job() + + job._set_properties(resource) + + self.assertEqual(job._properties, resource) + + def test__set_properties_w_creation_time(self): + now, millis = self._datetime_and_millis() + config = {"test": True} + stats = {"creationTime": str(millis)} + resource = {"configuration": config, "statistics": stats} + job = self._set_properties_job() + + job._set_properties(resource) + + cleaned = copy.deepcopy(resource) + cleaned["statistics"]["creationTime"] = float(millis) + self.assertEqual(job._properties, cleaned) + + def test__set_properties_w_start_time(self): + now, millis = self._datetime_and_millis() + config = {"test": True} + stats = {"startTime": str(millis)} + resource = {"configuration": config, "statistics": stats} + job = self._set_properties_job() + + job._set_properties(resource) + + cleaned = copy.deepcopy(resource) + cleaned["statistics"]["startTime"] = float(millis) + self.assertEqual(job._properties, cleaned) + + def test__set_properties_w_end_time(self): + now, millis = self._datetime_and_millis() + config = {"test": True} + stats = {"endTime": str(millis)} + resource = {"configuration": config, "statistics": stats} + job = self._set_properties_job() + + job._set_properties(resource) + + cleaned = copy.deepcopy(resource) + cleaned["statistics"]["endTime"] = float(millis) + self.assertEqual(job._properties, cleaned) + + def test__check_resource_config_missing_job_ref(self): + resource = {} + klass = self._make_derived_class() + + with 
self.assertRaises(KeyError): + klass._check_resource_config(resource) + + def test__check_resource_config_missing_job_id(self): + resource = {"jobReference": {}} + klass = self._make_derived_class() + + with self.assertRaises(KeyError): + klass._check_resource_config(resource) + + def test__check_resource_config_missing_configuration(self): + resource = {"jobReference": {"jobId": self.JOB_ID}} + klass = self._make_derived_class() + + with self.assertRaises(KeyError): + klass._check_resource_config(resource) + + def test__check_resource_config_missing_config_type(self): + resource = {"jobReference": {"jobId": self.JOB_ID}, "configuration": {}} + klass = self._make_derived_class() + + with self.assertRaises(KeyError): + klass._check_resource_config(resource) + + def test__check_resource_config_ok(self): + derived_config = {"foo": "bar"} + resource = { + "jobReference": {"jobId": self.JOB_ID}, + "configuration": {"derived": derived_config}, + } + klass = self._make_derived_class() + + # Should not throw. 
+ klass._check_resource_config(resource) + + def test__build_resource(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + resource = job._build_resource() + assert resource["jobReference"]["jobId"] == self.JOB_ID + + def test_to_api_repr(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + resource = job.to_api_repr() + assert resource["jobReference"]["jobId"] == self.JOB_ID + + def test__begin_already(self): + job = self._set_properties_job() + job._properties["status"] = {"state": "WHATEVER"} + + with self.assertRaises(ValueError): + job._begin() + + def test__begin_defaults(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + job = self._set_properties_job() + builder = job.to_api_repr = mock.Mock() + builder.return_value = resource + call_api = job._client._call_api = mock.Mock() + call_api.return_value = resource + path = "/projects/{}/jobs".format(self.PROJECT) + job._begin() + + call_api.assert_called_once_with( + DEFAULT_RETRY, + span_name="BigQuery.job.begin", + span_attributes={"path": path}, + job_ref=job, + method="POST", + path=path, + data=resource, + timeout=None, + ) + self.assertEqual(job._properties, resource) + + def test__begin_explicit(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + other_project = "other-project-234" + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + job = self._set_properties_job() + builder = job.to_api_repr = mock.Mock() + builder.return_value = resource + client = _make_client(project=other_project) + call_api = client._call_api = mock.Mock() + call_api.return_value = resource + retry = DEFAULT_RETRY.with_deadline(1) + path = 
"/projects/{}/jobs".format(self.PROJECT) + job._begin(client=client, retry=retry, timeout=7.5) + + call_api.assert_called_once_with( + retry, + span_name="BigQuery.job.begin", + span_attributes={"path": path}, + job_ref=job, + method="POST", + path=path, + data=resource, + timeout=7.5, + ) + self.assertEqual(job._properties, resource) + + def test_exists_defaults_miss(self): + from google.cloud.exceptions import NotFound + from google.cloud.bigquery.retry import DEFAULT_RETRY + + job = self._set_properties_job() + job._properties["jobReference"]["location"] = self.LOCATION + call_api = job._client._call_api = mock.Mock() + call_api.side_effect = NotFound("testing") + self.assertFalse(job.exists()) + + call_api.assert_called_once_with( + DEFAULT_RETRY, + span_name="BigQuery.job.exists", + span_attributes={ + "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + }, + job_ref=job, + method="GET", + path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), + query_params={"fields": "id", "location": self.LOCATION}, + timeout=None, + ) + + def test_exists_explicit_hit(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + other_project = "other-project-234" + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + job = self._set_properties_job() + client = _make_client(project=other_project) + call_api = client._call_api = mock.Mock() + call_api.return_value = resource + retry = DEFAULT_RETRY.with_deadline(1) + self.assertTrue(job.exists(client=client, retry=retry)) + + call_api.assert_called_once_with( + retry, + span_name="BigQuery.job.exists", + span_attributes={ + "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + }, + job_ref=job, + method="GET", + path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), + query_params={"fields": "id"}, + timeout=None, + ) + + def test_exists_w_timeout(self): + from 
google.cloud.bigquery.retry import DEFAULT_RETRY + + PATH = "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + job = self._set_properties_job() + call_api = job._client._call_api = mock.Mock() + job.exists(timeout=7.5) + + call_api.assert_called_once_with( + DEFAULT_RETRY, + span_name="BigQuery.job.exists", + span_attributes={"path": PATH}, + job_ref=job, + method="GET", + path=PATH, + query_params={"fields": "id"}, + timeout=7.5, + ) + + def test_reload_defaults(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + job = self._set_properties_job() + job._properties["jobReference"]["location"] = self.LOCATION + call_api = job._client._call_api = mock.Mock() + call_api.return_value = resource + job.reload() + + call_api.assert_called_once_with( + DEFAULT_RETRY, + span_name="BigQuery.job.reload", + span_attributes={ + "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + }, + job_ref=job, + method="GET", + path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), + query_params={"location": self.LOCATION}, + timeout=None, + ) + self.assertEqual(job._properties, resource) + + def test_reload_explicit(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + other_project = "other-project-234" + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + job = self._set_properties_job() + client = _make_client(project=other_project) + call_api = client._call_api = mock.Mock() + call_api.return_value = resource + retry = DEFAULT_RETRY.with_deadline(1) + job.reload(client=client, retry=retry, timeout=4.2) + + call_api.assert_called_once_with( + retry, + span_name="BigQuery.job.reload", + span_attributes={ + "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) + }, + 
job_ref=job, + method="GET", + path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), + query_params={}, + timeout=4.2, + ) + self.assertEqual(job._properties, resource) + + def test_cancel_defaults(self): + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + response = {"job": resource} + job = self._set_properties_job() + job._properties["jobReference"]["location"] = self.LOCATION + connection = job._client._connection = _make_connection(response) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertTrue(job.cancel()) + + final_attributes.assert_called() + + connection.api_request.assert_called_once_with( + method="POST", + path="/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID), + query_params={"location": self.LOCATION}, + timeout=None, + ) + self.assertEqual(job._properties, resource) + + def test_cancel_explicit(self): + other_project = "other-project-234" + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + response = {"job": resource} + job = self._set_properties_job() + client = _make_client(project=other_project) + connection = client._connection = _make_connection(response) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertTrue(job.cancel(client=client, timeout=7.5)) + + final_attributes.assert_called_with( + {"path": "/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID)}, + client, + job, + ) + + connection.api_request.assert_called_once_with( + method="POST", + path="/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID), + query_params={}, + timeout=7.5, + ) + self.assertEqual(job._properties, resource) + + def test_cancel_w_custom_retry(self): + from 
google.cloud.bigquery.retry import DEFAULT_RETRY + + api_path = "/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID) + resource = { + "jobReference": { + "jobId": self.JOB_ID, + "projectId": self.PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + response = {"job": resource} + job = self._set_properties_job() + + api_request_patcher = mock.patch.object( + job._client._connection, "api_request", side_effect=[ValueError, response] + ) + retry = DEFAULT_RETRY.with_deadline(1).with_predicate( + lambda exc: isinstance(exc, ValueError) + ) + + with api_request_patcher as fake_api_request: + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + result = job.cancel(retry=retry, timeout=7.5) + + final_attributes.assert_called() + + self.assertTrue(result) + self.assertEqual(job._properties, resource) + self.assertEqual( + fake_api_request.call_args_list, + [ + mock.call(method="POST", path=api_path, query_params={}, timeout=7.5), + mock.call( + method="POST", path=api_path, query_params={}, timeout=7.5 + ), # was retried once + ], + ) + + def test__set_future_result_wo_done(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + set_exception = job.set_exception = mock.Mock() + set_result = job.set_result = mock.Mock() + + job._set_future_result() + + set_exception.assert_not_called() + set_result.assert_not_called() + + def test__set_future_result_w_result_set(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["status"] = {"state": "DONE"} + job._result_set = True + set_exception = job.set_exception = mock.Mock() + set_result = job.set_result = mock.Mock() + + job._set_future_result() + + set_exception.assert_not_called() + set_result.assert_not_called() + + def test__set_future_result_w_done_wo_result_set_w_error(self): + from google.cloud.exceptions import NotFound + 
+ client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["status"] = { + "state": "DONE", + "errorResult": {"reason": "notFound", "message": "testing"}, + } + set_exception = job.set_exception = mock.Mock() + set_result = job.set_result = mock.Mock() + + job._set_future_result() + + set_exception.assert_called_once() + args, kw = set_exception.call_args + (exception,) = args + self.assertIsInstance(exception, NotFound) + self.assertEqual(exception.message, "testing") + self.assertEqual(kw, {}) + set_result.assert_not_called() + + def test__set_future_result_w_done_wo_result_set_wo_error(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["status"] = {"state": "DONE"} + set_exception = job.set_exception = mock.Mock() + set_result = job.set_result = mock.Mock() + + job._set_future_result() + + set_exception.assert_not_called() + set_result.assert_called_once_with(job) + + def test_done_defaults_wo_state(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + reload_ = job.reload = mock.Mock() + + self.assertFalse(job.done()) + + reload_.assert_called_once_with(retry=DEFAULT_RETRY, timeout=None) + + def test_done_explicit_wo_state(self): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + reload_ = job.reload = mock.Mock() + retry = DEFAULT_RETRY.with_deadline(1) + + self.assertFalse(job.done(retry=retry, timeout=7.5)) + + reload_.assert_called_once_with(retry=retry, timeout=7.5) + + def test_done_already(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["status"] = {"state": "DONE"} + + self.assertTrue(job.done()) + + def test_result_default_wo_state(self): + begun_job_resource = 
_make_job_resource( + job_id=self.JOB_ID, project_id=self.PROJECT, location="US", started=True + ) + done_job_resource = _make_job_resource( + job_id=self.JOB_ID, + project_id=self.PROJECT, + location="US", + started=True, + ended=True, + ) + conn = _make_connection( + _make_retriable_exception(), + begun_job_resource, + _make_retriable_exception(), + done_job_resource, + ) + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, client) + + self.assertIs(job.result(), job) + + begin_call = mock.call( + method="POST", + path=f"/projects/{self.PROJECT}/jobs", + data={"jobReference": {"jobId": self.JOB_ID, "projectId": self.PROJECT}}, + timeout=None, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", + query_params={"location": "US"}, + timeout=None, + ) + conn.api_request.assert_has_calls( + [begin_call, begin_call, reload_call, reload_call] + ) + + def test_result_w_retry_wo_state(self): + begun_job_resource = _make_job_resource( + job_id=self.JOB_ID, project_id=self.PROJECT, location="EU", started=True + ) + done_job_resource = _make_job_resource( + job_id=self.JOB_ID, + project_id=self.PROJECT, + location="EU", + started=True, + ended=True, + ) + conn = _make_connection( + exceptions.NotFound("not normally retriable"), + begun_job_resource, + exceptions.NotFound("not normally retriable"), + done_job_resource, + ) + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one( + self._job_reference(self.JOB_ID, self.PROJECT, "EU"), client + ) + custom_predicate = mock.Mock() + custom_predicate.return_value = True + custom_retry = google.api_core.retry.Retry( + predicate=custom_predicate, initial=0.001, maximum=0.001, deadline=0.1, + ) + self.assertIs(job.result(retry=custom_retry), job) + + begin_call = mock.call( + method="POST", + path=f"/projects/{self.PROJECT}/jobs", + data={ + "jobReference": { + "jobId": self.JOB_ID, + "projectId": 
self.PROJECT, + "location": "EU", + } + }, + timeout=None, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", + query_params={"location": "EU"}, + timeout=None, + ) + conn.api_request.assert_has_calls( + [begin_call, begin_call, reload_call, reload_call] + ) + + def test_result_explicit_w_state(self): + conn = _make_connection() + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, client) + # Use _set_properties() instead of directly modifying _properties so + # that the result state is set properly. + job_resource = job._properties + job_resource["status"] = {"state": "DONE"} + job._set_properties(job_resource) + timeout = 1 + + self.assertIs(job.result(timeout=timeout), job) + + conn.api_request.assert_not_called() + + def test_cancelled_wo_error_result(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + + self.assertFalse(job.cancelled()) + + def test_cancelled_w_error_result_not_stopped(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["status"] = {"errorResult": {"reason": "other"}} + + self.assertFalse(job.cancelled()) + + def test_cancelled_w_error_result_w_stopped(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + job._properties["status"] = {"errorResult": {"reason": "stopped"}} + + self.assertTrue(job.cancelled()) + + +class Test_JobConfig(unittest.TestCase): + JOB_TYPE = "testing" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery import job + + return job._JobConfig + + def _make_one(self, job_type=JOB_TYPE): + return self._get_target_class()(job_type) + + def test_ctor(self): + job_config = self._make_one() + self.assertEqual(job_config._job_type, self.JOB_TYPE) + self.assertEqual(job_config._properties, {self.JOB_TYPE: {}}) + + def 
test_ctor_with_unknown_property_raises_error(self): + error_text = "Property wrong_name is unknown for" + with pytest.raises(AttributeError, match=error_text): + config = self._make_one() + config.wrong_name = None + + def test_fill_from_default(self): + from google.cloud.bigquery import QueryJobConfig + + job_config = QueryJobConfig() + job_config.dry_run = True + job_config.maximum_bytes_billed = 1000 + + default_job_config = QueryJobConfig() + default_job_config.use_query_cache = True + default_job_config.maximum_bytes_billed = 2000 + + final_job_config = job_config._fill_from_default(default_job_config) + self.assertTrue(final_job_config.dry_run) + self.assertTrue(final_job_config.use_query_cache) + self.assertEqual(final_job_config.maximum_bytes_billed, 1000) + + def test_fill_from_default_conflict(self): + from google.cloud.bigquery import QueryJobConfig + + basic_job_config = QueryJobConfig() + conflicting_job_config = self._make_one("conflicting_job_type") + self.assertNotEqual( + basic_job_config._job_type, conflicting_job_config._job_type + ) + + with self.assertRaises(TypeError): + basic_job_config._fill_from_default(conflicting_job_config) + + @mock.patch("google.cloud.bigquery._helpers._get_sub_prop") + def test__get_sub_prop_wo_default(self, _get_sub_prop): + job_config = self._make_one() + key = "key" + self.assertIs(job_config._get_sub_prop(key), _get_sub_prop.return_value) + _get_sub_prop.assert_called_once_with( + job_config._properties, [self.JOB_TYPE, key], default=None + ) + + @mock.patch("google.cloud.bigquery._helpers._get_sub_prop") + def test__get_sub_prop_w_default(self, _get_sub_prop): + job_config = self._make_one() + key = "key" + default = "default" + self.assertIs( + job_config._get_sub_prop(key, default=default), _get_sub_prop.return_value + ) + _get_sub_prop.assert_called_once_with( + job_config._properties, [self.JOB_TYPE, key], default=default + ) + + @mock.patch("google.cloud.bigquery._helpers._set_sub_prop") + def 
test__set_sub_prop(self, _set_sub_prop): + job_config = self._make_one() + key = "key" + value = "value" + job_config._set_sub_prop(key, value) + _set_sub_prop.assert_called_once_with( + job_config._properties, [self.JOB_TYPE, key], value + ) + + def test_to_api_repr(self): + job_config = self._make_one() + expected = job_config._properties = {self.JOB_TYPE: {"foo": "bar"}} + found = job_config.to_api_repr() + self.assertEqual(found, expected) + self.assertIsNot(found, expected) # copied + + # 'from_api_repr' cannot be tested on '_JobConfig', because it presumes + # the ctor can be called w/o arguments + + def test_labels_miss(self): + job_config = self._make_one() + self.assertEqual(job_config.labels, {}) + + def test_labels_update_in_place(self): + job_config = self._make_one() + labels = job_config.labels + labels["foo"] = "bar" # update in place + self.assertEqual(job_config.labels, {"foo": "bar"}) + + def test_labels_hit(self): + labels = {"foo": "bar"} + job_config = self._make_one() + job_config._properties["labels"] = labels + self.assertEqual(job_config.labels, labels) + + def test_labels_setter_invalid(self): + labels = object() + job_config = self._make_one() + with self.assertRaises(ValueError): + job_config.labels = labels + + def test_labels_setter(self): + labels = {"foo": "bar"} + job_config = self._make_one() + job_config.labels = labels + self.assertEqual(job_config._properties["labels"], labels) diff --git a/tests/unit/job/test_copy.py b/tests/unit/job/test_copy.py new file mode 100644 index 000000000..992efcf6b --- /dev/null +++ b/tests/unit/job/test_copy.py @@ -0,0 +1,509 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mock + +from .helpers import _Base +from .helpers import _make_client +from .helpers import _make_connection + + +class TestCopyJobConfig(_Base): + JOB_TYPE = "copy" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import CopyJobConfig + + return CopyJobConfig + + def test_ctor_defaults(self): + from google.cloud.bigquery.job import OperationType + + config = self._make_one() + + assert config.create_disposition is None + assert config.write_disposition is None + assert config.destination_encryption_configuration is None + assert config.operation_type == OperationType.OPERATION_TYPE_UNSPECIFIED + + def test_ctor_w_properties(self): + from google.cloud.bigquery.job import CreateDisposition + from google.cloud.bigquery.job import OperationType + from google.cloud.bigquery.job import WriteDisposition + + create_disposition = CreateDisposition.CREATE_NEVER + write_disposition = WriteDisposition.WRITE_TRUNCATE + snapshot_operation = OperationType.SNAPSHOT + + config = self._get_target_class()( + create_disposition=create_disposition, + write_disposition=write_disposition, + operation_type=snapshot_operation, + ) + + self.assertEqual(config.create_disposition, create_disposition) + self.assertEqual(config.write_disposition, write_disposition) + self.assertEqual(config.operation_type, snapshot_operation) + + def test_to_api_repr_with_encryption(self): + from google.cloud.bigquery.encryption_configuration import ( + EncryptionConfiguration, + ) + + config = self._make_one() + config.destination_encryption_configuration = 
EncryptionConfiguration( + kms_key_name=self.KMS_KEY_NAME + ) + resource = config.to_api_repr() + self.assertEqual( + resource, + { + "copy": { + "destinationEncryptionConfiguration": { + "kmsKeyName": self.KMS_KEY_NAME + } + } + }, + ) + + def test_to_api_repr_with_encryption_none(self): + config = self._make_one() + config.destination_encryption_configuration = None + resource = config.to_api_repr() + self.assertEqual( + resource, {"copy": {"destinationEncryptionConfiguration": None}} + ) + + def test_operation_type_setting_none(self): + from google.cloud.bigquery.job import OperationType + + config = self._make_one(operation_type=OperationType.SNAPSHOT) + + # Setting it to None is the same as setting it to OPERATION_TYPE_UNSPECIFIED. + config.operation_type = None + assert config.operation_type == OperationType.OPERATION_TYPE_UNSPECIFIED + + def test_operation_type_setting_non_none(self): + from google.cloud.bigquery.job import OperationType + + config = self._make_one(operation_type=None) + config.operation_type = OperationType.RESTORE + assert config.operation_type == OperationType.RESTORE + + +class TestCopyJob(_Base): + JOB_TYPE = "copy" + SOURCE_TABLE = "source_table" + DESTINATION_TABLE = "destination_table" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import CopyJob + + return CopyJob + + def _make_resource(self, started=False, ended=False): + resource = super(TestCopyJob, self)._make_resource(started, ended) + config = resource["configuration"]["copy"] + config["sourceTables"] = [ + { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + } + ] + config["destinationTable"] = { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + } + + return resource + + def _verifyResourceProperties(self, job, resource): + self._verifyReadonlyResourceProperties(job, resource) + + config = resource.get("configuration", {}).get("copy") + + table_ref = 
config["destinationTable"] + self.assertEqual(job.destination.project, table_ref["projectId"]) + self.assertEqual(job.destination.dataset_id, table_ref["datasetId"]) + self.assertEqual(job.destination.table_id, table_ref["tableId"]) + + sources = config.get("sourceTables") + if sources is None: + sources = [config["sourceTable"]] + self.assertEqual(len(sources), len(job.sources)) + for table_ref, table in zip(sources, job.sources): + self.assertEqual(table.project, table_ref["projectId"]) + self.assertEqual(table.dataset_id, table_ref["datasetId"]) + self.assertEqual(table.table_id, table_ref["tableId"]) + + if "createDisposition" in config: + self.assertEqual(job.create_disposition, config["createDisposition"]) + else: + self.assertIsNone(job.create_disposition) + + if "writeDisposition" in config: + self.assertEqual(job.write_disposition, config["writeDisposition"]) + else: + self.assertIsNone(job.write_disposition) + + if "destinationEncryptionConfiguration" in config: + self.assertIsNotNone(job.destination_encryption_configuration) + self.assertEqual( + job.destination_encryption_configuration.kms_key_name, + config["destinationEncryptionConfiguration"]["kmsKeyName"], + ) + else: + self.assertIsNone(job.destination_encryption_configuration) + + def test_ctor(self): + client = _make_client(project=self.PROJECT) + source = self._table_ref(self.SOURCE_TABLE) + destination = self._table_ref(self.DESTINATION_TABLE) + job = self._make_one(self.JOB_ID, [source], destination, client) + self.assertEqual(job.destination, destination) + self.assertEqual(job.sources, [source]) + self.assertIs(job._client, client) + self.assertEqual(job.job_type, self.JOB_TYPE) + self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) + + self._verifyInitialReadonlyProperties(job) + + # set/read from resource['configuration']['copy'] + self.assertIsNone(job.create_disposition) + self.assertIsNone(job.write_disposition) + 
self.assertIsNone(job.destination_encryption_configuration) + + def test_from_api_repr_missing_identity(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = {} + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_missing_config(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + } + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_bare(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "copy": { + "sourceTables": [ + { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + } + ], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + }, + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_with_encryption(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "copy": { + "sourceTables": [ + { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + } + ], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + }, + "destinationEncryptionConfiguration": { + "kmsKeyName": self.KMS_KEY_NAME + }, + } + }, + } + klass = self._get_target_class() + job = 
klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_w_sourcetable(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "copy": { + "sourceTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + }, + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + }, + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_wo_sources(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "copy": { + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + } + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + with self.assertRaises(KeyError): + _ = job.sources + + def test_from_api_repr_w_properties(self): + from google.cloud.bigquery.job import CreateDisposition + + client = _make_client(project=self.PROJECT) + RESOURCE = self._make_resource() + copy_config = RESOURCE["configuration"]["copy"] + copy_config["createDisposition"] = CreateDisposition.CREATE_IF_NEEDED + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_bound_client(self): + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource() + # Ensure None for missing server-set props + del 
RESOURCE["statistics"]["creationTime"] + del RESOURCE["etag"] + del RESOURCE["selfLink"] + del RESOURCE["user_email"] + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + source = self._table_ref(self.SOURCE_TABLE) + destination = self._table_ref(self.DESTINATION_TABLE) + job = self._make_one(self.JOB_ID, [source], destination, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin() + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="POST", + path=PATH, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "copy": { + "sourceTables": [ + { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + } + ], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + }, + } + }, + }, + timeout=None, + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_alternate_client(self): + from google.cloud.bigquery.job import CopyJobConfig + + from google.cloud.bigquery.job import CreateDisposition + from google.cloud.bigquery.job import WriteDisposition + + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource(ended=True) + COPY_CONFIGURATION = { + "sourceTables": [ + { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + } + ], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + }, + "createDisposition": CreateDisposition.CREATE_NEVER, + "writeDisposition": WriteDisposition.WRITE_TRUNCATE, + } + RESOURCE["configuration"]["copy"] = COPY_CONFIGURATION + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESOURCE) + client2 = 
_make_client(project=self.PROJECT, connection=conn2) + source = self._table_ref(self.SOURCE_TABLE) + destination = self._table_ref(self.DESTINATION_TABLE) + config = CopyJobConfig() + config.create_disposition = CreateDisposition.CREATE_NEVER + config.write_disposition = WriteDisposition.WRITE_TRUNCATE + job = self._make_one(self.JOB_ID, [source], destination, client1, config) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="POST", + path=PATH, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": {"copy": COPY_CONFIGURATION}, + }, + timeout=None, + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_exists_miss_w_bound_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + conn = _make_connection() + client = _make_client(project=self.PROJECT, connection=conn) + + source = self._table_ref(self.SOURCE_TABLE) + destination = self._table_ref(self.DESTINATION_TABLE) + job = self._make_one(self.JOB_ID, [source], destination, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertFalse(job.exists()) + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={"fields": "id"}, timeout=None + ) + + def test_exists_hit_w_alternate_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection({}) + client2 = _make_client(project=self.PROJECT, connection=conn2) + source = self._table_ref(self.SOURCE_TABLE) + destination = 
self._table_ref(self.DESTINATION_TABLE) + job = self._make_one(self.JOB_ID, [source], destination, client1) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertTrue(job.exists(client=client2)) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={"fields": "id"}, timeout=None + ) + + def test_reload_w_bound_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource() + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + source = self._table_ref(self.SOURCE_TABLE) + destination = self._table_ref(self.DESTINATION_TABLE) + job = self._make_one(self.JOB_ID, [source], destination, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.reload() + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={}, timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_reload_w_alternate_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource() + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESOURCE) + client2 = _make_client(project=self.PROJECT, connection=conn2) + source = self._table_ref(self.SOURCE_TABLE) + destination = self._table_ref(self.DESTINATION_TABLE) + job = self._make_one(self.JOB_ID, [source], destination, client1) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.reload(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + 
+ conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={}, timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) diff --git a/tests/unit/job/test_extract.py b/tests/unit/job/test_extract.py new file mode 100644 index 000000000..4c9411d0d --- /dev/null +++ b/tests/unit/job/test_extract.py @@ -0,0 +1,437 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mock + +from .helpers import _Base +from .helpers import _make_client +from .helpers import _make_connection + + +class TestExtractJobConfig(_Base): + JOB_TYPE = "extract" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import ExtractJobConfig + + return ExtractJobConfig + + def test_ctor_w_properties(self): + config = self._get_target_class()(field_delimiter="\t", print_header=True) + + self.assertEqual(config.field_delimiter, "\t") + self.assertTrue(config.print_header) + + def test_to_api_repr(self): + from google.cloud.bigquery import job + + config = self._make_one() + config.compression = job.Compression.SNAPPY + config.destination_format = job.DestinationFormat.AVRO + config.field_delimiter = "ignored for avro" + config.print_header = False + config._properties["extract"]["someNewField"] = "some-value" + config.use_avro_logical_types = True + resource = config.to_api_repr() + self.assertEqual( + resource, + { + "extract": { + "compression": "SNAPPY", + 
"destinationFormat": "AVRO", + "fieldDelimiter": "ignored for avro", + "printHeader": False, + "someNewField": "some-value", + "useAvroLogicalTypes": True, + } + }, + ) + + def test_from_api_repr(self): + cls = self._get_target_class() + config = cls.from_api_repr( + { + "extract": { + "compression": "NONE", + "destinationFormat": "CSV", + "fieldDelimiter": "\t", + "printHeader": True, + "someNewField": "some-value", + "useAvroLogicalTypes": False, + } + } + ) + self.assertEqual(config.compression, "NONE") + self.assertEqual(config.destination_format, "CSV") + self.assertEqual(config.field_delimiter, "\t") + self.assertEqual(config.print_header, True) + self.assertEqual(config._properties["extract"]["someNewField"], "some-value") + self.assertEqual(config.use_avro_logical_types, False) + + +class TestExtractJob(_Base): + JOB_TYPE = "extract" + SOURCE_TABLE = "source_table" + DESTINATION_URI = "gs://bucket_name/object_name" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import ExtractJob + + return ExtractJob + + def _make_resource(self, started=False, ended=False): + resource = super(TestExtractJob, self)._make_resource(started, ended) + config = resource["configuration"]["extract"] + config["sourceTable"] = { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + } + config["destinationUris"] = [self.DESTINATION_URI] + return resource + + def _verifyResourceProperties(self, job, resource): + self._verifyReadonlyResourceProperties(job, resource) + + config = resource.get("configuration", {}).get("extract") + + self.assertEqual(job.destination_uris, config["destinationUris"]) + + if "sourceTable" in config: + table_ref = config["sourceTable"] + self.assertEqual(job.source.project, table_ref["projectId"]) + self.assertEqual(job.source.dataset_id, table_ref["datasetId"]) + self.assertEqual(job.source.table_id, table_ref["tableId"]) + else: + model_ref = config["sourceModel"] + 
self.assertEqual(job.source.project, model_ref["projectId"]) + self.assertEqual(job.source.dataset_id, model_ref["datasetId"]) + self.assertEqual(job.source.model_id, model_ref["modelId"]) + + if "compression" in config: + self.assertEqual(job.compression, config["compression"]) + else: + self.assertIsNone(job.compression) + + if "destinationFormat" in config: + self.assertEqual(job.destination_format, config["destinationFormat"]) + else: + self.assertIsNone(job.destination_format) + + if "fieldDelimiter" in config: + self.assertEqual(job.field_delimiter, config["fieldDelimiter"]) + else: + self.assertIsNone(job.field_delimiter) + + if "printHeader" in config: + self.assertEqual(job.print_header, config["printHeader"]) + else: + self.assertIsNone(job.print_header) + + def test_ctor(self): + from google.cloud.bigquery.table import Table + + client = _make_client(project=self.PROJECT) + source = Table(self.TABLE_REF) + job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client) + self.assertEqual(job.source.project, self.PROJECT) + self.assertEqual(job.source.dataset_id, self.DS_ID) + self.assertEqual(job.source.table_id, self.TABLE_ID) + self.assertEqual(job.destination_uris, [self.DESTINATION_URI]) + self.assertIs(job._client, client) + self.assertEqual(job.job_type, self.JOB_TYPE) + self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) + + self._verifyInitialReadonlyProperties(job) + + # set/read from resource['configuration']['extract'] + self.assertIsNone(job.compression) + self.assertIsNone(job.destination_format) + self.assertIsNone(job.field_delimiter) + self.assertIsNone(job.print_header) + + def test_destination_uri_file_counts(self): + file_counts = 23 + client = _make_client(project=self.PROJECT) + job = self._make_one( + self.JOB_ID, self.TABLE_REF, [self.DESTINATION_URI], client + ) + self.assertIsNone(job.destination_uri_file_counts) + + statistics = job._properties["statistics"] = {} + 
self.assertIsNone(job.destination_uri_file_counts) + + extract_stats = statistics["extract"] = {} + self.assertIsNone(job.destination_uri_file_counts) + + extract_stats["destinationUriFileCounts"] = [str(file_counts)] + self.assertEqual(job.destination_uri_file_counts, [file_counts]) + + def test_from_api_repr_missing_identity(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = {} + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_missing_config(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + } + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_bare(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "extract": { + "sourceTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + }, + "destinationUris": [self.DESTINATION_URI], + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_for_model(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "extract": { + "sourceModel": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "modelId": "model_id", + }, + "destinationUris": [self.DESTINATION_URI], + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, 
client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_w_properties(self): + from google.cloud.bigquery.job import Compression + + client = _make_client(project=self.PROJECT) + RESOURCE = self._make_resource() + extract_config = RESOURCE["configuration"]["extract"] + extract_config["compression"] = Compression.GZIP + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_bound_client(self): + from google.cloud.bigquery.dataset import DatasetReference + + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource() + # Ensure None for missing server-set props + del RESOURCE["statistics"]["creationTime"] + del RESOURCE["etag"] + del RESOURCE["selfLink"] + del RESOURCE["user_email"] + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + source_dataset = DatasetReference(self.PROJECT, self.DS_ID) + source = source_dataset.table(self.SOURCE_TABLE) + job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin() + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="POST", + path=PATH, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "extract": { + "sourceTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + }, + "destinationUris": [self.DESTINATION_URI], + } + }, + }, + timeout=None, + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_alternate_client(self): + from google.cloud.bigquery.dataset import DatasetReference + from google.cloud.bigquery.job import Compression + 
from google.cloud.bigquery.job import DestinationFormat + from google.cloud.bigquery.job import ExtractJobConfig + + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource(ended=True) + EXTRACT_CONFIGURATION = { + "sourceTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.SOURCE_TABLE, + }, + "destinationUris": [self.DESTINATION_URI], + "compression": Compression.GZIP, + "destinationFormat": DestinationFormat.NEWLINE_DELIMITED_JSON, + "fieldDelimiter": "|", + "printHeader": False, + } + RESOURCE["configuration"]["extract"] = EXTRACT_CONFIGURATION + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESOURCE) + client2 = _make_client(project=self.PROJECT, connection=conn2) + source_dataset = DatasetReference(self.PROJECT, self.DS_ID) + source = source_dataset.table(self.SOURCE_TABLE) + config = ExtractJobConfig() + config.compression = Compression.GZIP + config.destination_format = DestinationFormat.NEWLINE_DELIMITED_JSON + config.field_delimiter = "|" + config.print_header = False + job = self._make_one( + self.JOB_ID, source, [self.DESTINATION_URI], client1, config + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="POST", + path=PATH, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": {"extract": EXTRACT_CONFIGURATION}, + }, + timeout=None, + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_exists_miss_w_bound_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + conn = _make_connection() + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one( + self.JOB_ID, 
self.TABLE_REF, [self.DESTINATION_URI], client + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertFalse(job.exists()) + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={"fields": "id"}, timeout=None + ) + + def test_exists_hit_w_alternate_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection({}) + client2 = _make_client(project=self.PROJECT, connection=conn2) + job = self._make_one( + self.JOB_ID, self.TABLE_REF, [self.DESTINATION_URI], client1 + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertTrue(job.exists(client=client2)) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={"fields": "id"}, timeout=None + ) + + def test_reload_w_bound_client(self): + from google.cloud.bigquery.dataset import DatasetReference + + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource() + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + source_dataset = DatasetReference(self.PROJECT, self.DS_ID) + source = source_dataset.table(self.SOURCE_TABLE) + job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.reload() + + final_attributes.assert_called_with({"path": PATH}, client, job) + conn.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={}, timeout=None + ) + 
self._verifyResourceProperties(job, RESOURCE) + + def test_reload_w_alternate_client(self): + from google.cloud.bigquery.dataset import DatasetReference + + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource() + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESOURCE) + client2 = _make_client(project=self.PROJECT, connection=conn2) + source_dataset = DatasetReference(self.PROJECT, self.DS_ID) + source = source_dataset.table(self.SOURCE_TABLE) + job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client1) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.reload(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={}, timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) diff --git a/tests/unit/job/test_load.py b/tests/unit/job/test_load.py new file mode 100644 index 000000000..70e7860a7 --- /dev/null +++ b/tests/unit/job/test_load.py @@ -0,0 +1,838 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy + +import mock + +from .helpers import _Base +from .helpers import _make_client +from .helpers import _make_connection + + +class TestLoadJob(_Base): + JOB_TYPE = "load" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import LoadJob + + return LoadJob + + def _setUpConstants(self): + super(TestLoadJob, self)._setUpConstants() + self.INPUT_FILES = 2 + self.INPUT_BYTES = 12345 + self.OUTPUT_BYTES = 23456 + self.OUTPUT_ROWS = 345 + + def _make_resource(self, started=False, ended=False): + resource = super(TestLoadJob, self)._make_resource(started, ended) + config = resource["configuration"]["load"] + config["sourceUris"] = [self.SOURCE1] + config["destinationTable"] = { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + } + + if ended: + resource["status"] = {"state": "DONE"} + resource["statistics"]["load"]["inputFiles"] = self.INPUT_FILES + resource["statistics"]["load"]["inputFileBytes"] = self.INPUT_BYTES + resource["statistics"]["load"]["outputBytes"] = self.OUTPUT_BYTES + resource["statistics"]["load"]["outputRows"] = self.OUTPUT_ROWS + + return resource + + def _verifyBooleanConfigProperties(self, job, config): + if "allowJaggedRows" in config: + self.assertEqual(job.allow_jagged_rows, config["allowJaggedRows"]) + else: + self.assertIsNone(job.allow_jagged_rows) + if "allowQuotedNewlines" in config: + self.assertEqual(job.allow_quoted_newlines, config["allowQuotedNewlines"]) + else: + self.assertIsNone(job.allow_quoted_newlines) + if "autodetect" in config: + self.assertEqual(job.autodetect, config["autodetect"]) + else: + self.assertIsNone(job.autodetect) + if "ignoreUnknownValues" in config: + self.assertEqual(job.ignore_unknown_values, config["ignoreUnknownValues"]) + else: + self.assertIsNone(job.ignore_unknown_values) + if "useAvroLogicalTypes" in config: + self.assertEqual(job.use_avro_logical_types, config["useAvroLogicalTypes"]) + else: + 
self.assertIsNone(job.use_avro_logical_types) + + def _verifyEnumConfigProperties(self, job, config): + if "createDisposition" in config: + self.assertEqual(job.create_disposition, config["createDisposition"]) + else: + self.assertIsNone(job.create_disposition) + if "encoding" in config: + self.assertEqual(job.encoding, config["encoding"]) + else: + self.assertIsNone(job.encoding) + if "sourceFormat" in config: + self.assertEqual(job.source_format, config["sourceFormat"]) + else: + self.assertIsNone(job.source_format) + if "writeDisposition" in config: + self.assertEqual(job.write_disposition, config["writeDisposition"]) + else: + self.assertIsNone(job.write_disposition) + if "schemaUpdateOptions" in config: + self.assertEqual(job.schema_update_options, config["schemaUpdateOptions"]) + else: + self.assertIsNone(job.schema_update_options) + + def _verifyResourceProperties(self, job, resource): + self._verifyReadonlyResourceProperties(job, resource) + + config = resource.get("configuration", {}).get("load") + + self._verifyBooleanConfigProperties(job, config) + self._verifyEnumConfigProperties(job, config) + + self.assertEqual(job.source_uris, config["sourceUris"]) + + table_ref = config["destinationTable"] + self.assertEqual(job.destination.project, table_ref["projectId"]) + self.assertEqual(job.destination.dataset_id, table_ref["datasetId"]) + self.assertEqual(job.destination.table_id, table_ref["tableId"]) + + if "fieldDelimiter" in config: + self.assertEqual(job.field_delimiter, config["fieldDelimiter"]) + else: + self.assertIsNone(job.field_delimiter) + if "maxBadRecords" in config: + self.assertEqual(job.max_bad_records, config["maxBadRecords"]) + else: + self.assertIsNone(job.max_bad_records) + if "nullMarker" in config: + self.assertEqual(job.null_marker, config["nullMarker"]) + else: + self.assertIsNone(job.null_marker) + if "quote" in config: + self.assertEqual(job.quote_character, config["quote"]) + else: + self.assertIsNone(job.quote_character) + if 
"skipLeadingRows" in config: + self.assertEqual(str(job.skip_leading_rows), config["skipLeadingRows"]) + else: + self.assertIsNone(job.skip_leading_rows) + + if "destinationEncryptionConfiguration" in config: + self.assertIsNotNone(job.destination_encryption_configuration) + self.assertEqual( + job.destination_encryption_configuration.kms_key_name, + config["destinationEncryptionConfiguration"]["kmsKeyName"], + ) + else: + self.assertIsNone(job.destination_encryption_configuration) + + def test_ctor(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + self.assertEqual(job.destination, self.TABLE_REF) + self.assertEqual(list(job.source_uris), [self.SOURCE1]) + self.assertIs(job._client, client) + self.assertEqual(job.job_type, self.JOB_TYPE) + self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) + + self._verifyInitialReadonlyProperties(job) + + # derived from resource['statistics']['load'] + self.assertIsNone(job.input_file_bytes) + self.assertIsNone(job.input_files) + self.assertIsNone(job.output_bytes) + self.assertIsNone(job.output_rows) + + # set/read from resource['configuration']['load'] + self.assertIsNone(job.schema) + self.assertIsNone(job.allow_jagged_rows) + self.assertIsNone(job.allow_quoted_newlines) + self.assertIsNone(job.autodetect) + self.assertIsNone(job.create_disposition) + self.assertIsNone(job.encoding) + self.assertIsNone(job.field_delimiter) + self.assertIsNone(job.ignore_unknown_values) + self.assertIsNone(job.max_bad_records) + self.assertIsNone(job.null_marker) + self.assertIsNone(job.quote_character) + self.assertIsNone(job.skip_leading_rows) + self.assertIsNone(job.source_format) + self.assertIsNone(job.write_disposition) + self.assertIsNone(job.destination_encryption_configuration) + self.assertIsNone(job.destination_table_description) + self.assertIsNone(job.destination_table_friendly_name) + 
self.assertIsNone(job.range_partitioning) + self.assertIsNone(job.time_partitioning) + self.assertIsNone(job.use_avro_logical_types) + self.assertIsNone(job.clustering_fields) + self.assertIsNone(job.schema_update_options) + + def test_ctor_w_config(self): + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.job import LoadJobConfig + + client = _make_client(project=self.PROJECT) + full_name = SchemaField("full_name", "STRING", mode="REQUIRED") + age = SchemaField("age", "INTEGER", mode="REQUIRED") + config = LoadJobConfig() + config.schema = [full_name, age] + job = self._make_one( + self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client, config + ) + self.assertEqual(job.schema, [full_name, age]) + config.destination_table_description = "Description" + expected = {"description": "Description"} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + friendly_name = "Friendly Name" + config._properties["load"]["destinationTableProperties"] = { + "friendlyName": friendly_name + } + self.assertEqual(config.destination_table_friendly_name, friendly_name) + + def test_ctor_w_job_reference(self): + from google.cloud.bigquery import job + + client = _make_client(project=self.PROJECT) + job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") + load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) + self.assertEqual(load_job.project, "alternative-project") + self.assertEqual(load_job.location, "US") + + def test_done(self): + client = _make_client(project=self.PROJECT) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + self.assertTrue(job.done()) + + def test_result(self): + client = _make_client(project=self.PROJECT) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + + result = job.result() + + self.assertIs(result, job) + + def 
test_result_invokes_begin(self): + begun_resource = self._make_resource() + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection(begun_resource, done_resource) + client = _make_client(self.PROJECT) + client._connection = connection + + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + job.result() + + self.assertEqual(len(connection.api_request.call_args_list), 2) + begin_request, reload_request = connection.api_request.call_args_list + self.assertEqual(begin_request[1]["method"], "POST") + self.assertEqual(reload_request[1]["method"], "GET") + + def test_schema_setter_non_list(self): + from google.cloud.bigquery.job import LoadJobConfig + + config = LoadJobConfig() + with self.assertRaises(TypeError): + config.schema = object() + + def test_schema_setter_invalid_field(self): + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.schema import SchemaField + + config = LoadJobConfig() + full_name = SchemaField("full_name", "STRING", mode="REQUIRED") + with self.assertRaises(ValueError): + config.schema = [full_name, object()] + + def test_schema_setter(self): + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.schema import SchemaField + + config = LoadJobConfig() + full_name = SchemaField("full_name", "STRING", mode="REQUIRED") + age = SchemaField("age", "INTEGER", mode="REQUIRED") + config.schema = [full_name, age] + self.assertEqual(config.schema, [full_name, age]) + + def test_props_set_by_server(self): + import datetime + from google.cloud._helpers import UTC + from google.cloud._helpers import _millis + + CREATED = datetime.datetime(2015, 8, 11, 12, 13, 22, tzinfo=UTC) + STARTED = datetime.datetime(2015, 8, 11, 13, 47, 15, tzinfo=UTC) + ENDED = datetime.datetime(2015, 8, 11, 14, 47, 15, tzinfo=UTC) + FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID) + URL = "http://example.com/projects/%s/jobs/%s" % 
(self.PROJECT, self.JOB_ID) + EMAIL = "phred@example.com" + ERROR_RESULT = { + "debugInfo": "DEBUG", + "location": "LOCATION", + "message": "MESSAGE", + "reason": "REASON", + } + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + job._properties["etag"] = "ETAG" + job._properties["id"] = FULL_JOB_ID + job._properties["selfLink"] = URL + job._properties["user_email"] = EMAIL + + statistics = job._properties["statistics"] = {} + statistics["creationTime"] = _millis(CREATED) + statistics["startTime"] = _millis(STARTED) + statistics["endTime"] = _millis(ENDED) + + self.assertEqual(job.etag, "ETAG") + self.assertEqual(job.self_link, URL) + self.assertEqual(job.user_email, EMAIL) + + self.assertEqual(job.created, CREATED) + self.assertEqual(job.started, STARTED) + self.assertEqual(job.ended, ENDED) + + # running jobs have no load stats not yet set. + self.assertIsNone(job.output_bytes) + + load_stats = statistics["load"] = {} + load_stats["inputFileBytes"] = 12345 + load_stats["inputFiles"] = 1 + load_stats["outputBytes"] = 23456 + load_stats["outputRows"] = 345 + + self.assertEqual(job.input_file_bytes, 12345) + self.assertEqual(job.input_files, 1) + self.assertEqual(job.output_bytes, 23456) + self.assertEqual(job.output_rows, 345) + + status = job._properties["status"] = {} + + self.assertIsNone(job.error_result) + self.assertIsNone(job.errors) + self.assertIsNone(job.state) + + status["errorResult"] = ERROR_RESULT + status["errors"] = [ERROR_RESULT] + status["state"] = "STATE" + + self.assertEqual(job.error_result, ERROR_RESULT) + self.assertEqual(job.errors, [ERROR_RESULT]) + self.assertEqual(job.state, "STATE") + + def test_from_api_repr_missing_identity(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = {} + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def 
test_from_api_repr_missing_config(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": "%s:%s" % (self.PROJECT, self.JOB_ID), + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + } + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_bare(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.FULL_JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "load": { + "sourceUris": [self.SOURCE1], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + }, + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_with_encryption(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.FULL_JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "load": { + "sourceUris": [self.SOURCE1], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + }, + "destinationEncryptionConfiguration": { + "kmsKeyName": self.KMS_KEY_NAME + }, + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_w_properties(self): + from google.cloud.bigquery.job import CreateDisposition + + client = _make_client(project=self.PROJECT) + RESOURCE = self._make_resource() + load_config = RESOURCE["configuration"]["load"] + load_config["createDisposition"] = CreateDisposition.CREATE_IF_NEEDED + klass = self._get_target_class() + job = 
klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_already_running(self): + conn = _make_connection() + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + job._properties["status"] = {"state": "RUNNING"} + + with self.assertRaises(ValueError): + job._begin() + + def test_begin_w_bound_client(self): + RESOURCE = self._make_resource() + # Ensure None for missing server-set props + del RESOURCE["statistics"]["creationTime"] + del RESOURCE["etag"] + del RESOURCE["selfLink"] + del RESOURCE["user_email"] + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + path = "/projects/{}/jobs".format(self.PROJECT) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin() + + final_attributes.assert_called_with({"path": path}, client, job) + + conn.api_request.assert_called_once_with( + method="POST", + path=path, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "load": { + "sourceUris": [self.SOURCE1], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + }, + } + }, + }, + timeout=None, + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_autodetect(self): + from google.cloud.bigquery.job import LoadJobConfig + + path = "/projects/{}/jobs".format(self.PROJECT) + resource = self._make_resource() + resource["configuration"]["load"]["autodetect"] = True + # Ensure None for missing server-set props + del resource["statistics"]["creationTime"] + del resource["etag"] + del resource["selfLink"] + del resource["user_email"] + conn = _make_connection(resource) + client = 
_make_client(project=self.PROJECT, connection=conn) + config = LoadJobConfig() + config.autodetect = True + job = self._make_one( + self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client, config + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin() + + final_attributes.assert_called_with({"path": path}, client, job) + + sent = { + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "load": { + "sourceUris": [self.SOURCE1], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + }, + "autodetect": True, + } + }, + } + conn.api_request.assert_called_once_with( + method="POST", path=path, data=sent, timeout=None + ) + self._verifyResourceProperties(job, resource) + + def test_begin_w_alternate_client(self): + from google.cloud.bigquery.job import CreateDisposition + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.job import SchemaUpdateOption + from google.cloud.bigquery.job import WriteDisposition + from google.cloud.bigquery.schema import SchemaField + + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource(ended=True) + LOAD_CONFIGURATION = { + "sourceUris": [self.SOURCE1], + "destinationTable": { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + }, + "allowJaggedRows": True, + "allowQuotedNewlines": True, + "createDisposition": CreateDisposition.CREATE_NEVER, + "encoding": "ISO-8559-1", + "fieldDelimiter": "|", + "ignoreUnknownValues": True, + "maxBadRecords": 100, + "nullMarker": r"\N", + "quote": "'", + "skipLeadingRows": "1", + "sourceFormat": "CSV", + "useAvroLogicalTypes": True, + "writeDisposition": WriteDisposition.WRITE_TRUNCATE, + "schema": { + "fields": [ + { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + "description": None, + }, + { + "name": "age", + "type": "INTEGER", + 
"mode": "REQUIRED", + "description": None, + }, + ] + }, + "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION], + } + RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESOURCE) + client2 = _make_client(project=self.PROJECT, connection=conn2) + full_name = SchemaField("full_name", "STRING", mode="REQUIRED") + age = SchemaField("age", "INTEGER", mode="REQUIRED") + config = LoadJobConfig() + config.schema = [full_name, age] + job = self._make_one( + self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1, config + ) + config.allow_jagged_rows = True + config.allow_quoted_newlines = True + config.create_disposition = CreateDisposition.CREATE_NEVER + config.encoding = "ISO-8559-1" + config.field_delimiter = "|" + config.ignore_unknown_values = True + config.max_bad_records = 100 + config.null_marker = r"\N" + config.quote_character = "'" + config.skip_leading_rows = 1 + config.source_format = "CSV" + config.use_avro_logical_types = True + config.write_disposition = WriteDisposition.WRITE_TRUNCATE + config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION] + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + self.assertEqual(len(conn2.api_request.call_args_list), 1) + req = conn2.api_request.call_args_list[0] + self.assertEqual(req[1]["method"], "POST") + self.assertEqual(req[1]["path"], PATH) + SENT = { + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": {"load": LOAD_CONFIGURATION}, + } + self.maxDiff = None + self.assertEqual(req[1]["data"], SENT) + self._verifyResourceProperties(job, RESOURCE) + + def test_begin_w_job_reference(self): + from google.cloud.bigquery import job + + 
resource = self._make_resource() + resource["jobReference"]["projectId"] = "alternative-project" + resource["jobReference"]["location"] = "US" + job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") + conn = _make_connection(resource) + client = _make_client(project=self.PROJECT, connection=conn) + load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + load_job._begin() + final_attributes.assert_called_with( + {"path": "/projects/alternative-project/jobs"}, client, load_job + ) + + conn.api_request.assert_called_once() + _, request = conn.api_request.call_args + self.assertEqual(request["method"], "POST") + self.assertEqual(request["path"], "/projects/alternative-project/jobs") + self.assertEqual( + request["data"]["jobReference"]["projectId"], "alternative-project" + ) + self.assertEqual(request["data"]["jobReference"]["location"], "US") + self.assertEqual(request["data"]["jobReference"]["jobId"], self.JOB_ID) + + def test_exists_miss_w_bound_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + conn = _make_connection() + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertFalse(job.exists()) + + final_attributes.assert_called_with( + {"path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID)}, + client, + job, + ) + + conn.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={"fields": "id"}, timeout=None + ) + + def test_exists_hit_w_alternate_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection({}) 
+ client2 = _make_client(project=self.PROJECT, connection=conn2) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertTrue(job.exists(client=client2)) + + final_attributes.assert_called_with( + {"path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID)}, + client2, + job, + ) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={"fields": "id"}, timeout=None + ) + + def test_exists_miss_w_job_reference(self): + from google.cloud.bigquery import job + + job_ref = job._JobReference("my-job-id", "other-project", "US") + conn = _make_connection() + client = _make_client(project=self.PROJECT, connection=conn) + load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + self.assertFalse(load_job.exists()) + + final_attributes.assert_called_with( + {"path": "/projects/other-project/jobs/my-job-id"}, client, load_job + ) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/other-project/jobs/my-job-id", + query_params={"fields": "id", "location": "US"}, + timeout=None, + ) + + def test_reload_w_bound_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource() + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.reload() + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={}, 
timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_reload_w_alternate_client(self): + PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource() + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESOURCE) + client2 = _make_client(project=self.PROJECT, connection=conn2) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.reload(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="GET", path=PATH, query_params={}, timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_reload_w_job_reference(self): + from google.cloud.bigquery import job + + resource = self._make_resource(ended=True) + resource["jobReference"]["projectId"] = "alternative-project" + resource["jobReference"]["location"] = "US" + job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") + conn = _make_connection(resource) + client = _make_client(project=self.PROJECT, connection=conn) + load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + load_job.reload() + + final_attributes.assert_called_with( + {"path": "/projects/alternative-project/jobs/{}".format(self.JOB_ID)}, + client, + load_job, + ) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/alternative-project/jobs/{}".format(self.JOB_ID), + query_params={"location": "US"}, + timeout=None, + ) + + def test_cancel_w_bound_client(self): + PATH = "/projects/%s/jobs/%s/cancel" % (self.PROJECT, self.JOB_ID) + RESOURCE = 
self._make_resource(ended=True) + RESPONSE = {"job": RESOURCE} + conn = _make_connection(RESPONSE) + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.cancel() + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="POST", path=PATH, query_params={}, timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_cancel_w_alternate_client(self): + PATH = "/projects/%s/jobs/%s/cancel" % (self.PROJECT, self.JOB_ID) + RESOURCE = self._make_resource(ended=True) + RESPONSE = {"job": RESOURCE} + conn1 = _make_connection() + client1 = _make_client(project=self.PROJECT, connection=conn1) + conn2 = _make_connection(RESPONSE) + client2 = _make_client(project=self.PROJECT, connection=conn2) + job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job.cancel(client=client2) + + final_attributes.assert_called_with({"path": PATH}, client2, job) + + conn1.api_request.assert_not_called() + conn2.api_request.assert_called_once_with( + method="POST", path=PATH, query_params={}, timeout=None + ) + self._verifyResourceProperties(job, RESOURCE) + + def test_cancel_w_job_reference(self): + from google.cloud.bigquery import job + + resource = self._make_resource(ended=True) + resource["jobReference"]["projectId"] = "alternative-project" + resource["jobReference"]["location"] = "US" + job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") + conn = _make_connection({"job": resource}) + client = _make_client(project=self.PROJECT, connection=conn) + load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) + with mock.patch( + 
"google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + load_job.cancel() + + final_attributes.assert_called_with( + { + "path": "/projects/alternative-project/jobs/{}/cancel".format( + self.JOB_ID + ) + }, + client, + load_job, + ) + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/alternative-project/jobs/{}/cancel".format(self.JOB_ID), + query_params={"location": "US"}, + timeout=None, + ) diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py new file mode 100644 index 000000000..cbe087dac --- /dev/null +++ b/tests/unit/job/test_load_config.py @@ -0,0 +1,791 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings + +import pytest + +from .helpers import _Base + + +class TestLoadJobConfig(_Base): + JOB_TYPE = "load" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import LoadJobConfig + + return LoadJobConfig + + def test_ctor_w_properties(self): + config = self._get_target_class()( + allow_jagged_rows=True, allow_quoted_newlines=True + ) + + self.assertTrue(config.allow_jagged_rows) + self.assertTrue(config.allow_quoted_newlines) + + def test_allow_jagged_rows_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.allow_jagged_rows) + + def test_allow_jagged_rows_hit(self): + config = self._get_target_class()() + config._properties["load"]["allowJaggedRows"] = True + self.assertTrue(config.allow_jagged_rows) + + def test_allow_jagged_rows_setter(self): + config = self._get_target_class()() + config.allow_jagged_rows = True + self.assertTrue(config._properties["load"]["allowJaggedRows"]) + + def test_allow_quoted_newlines_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.allow_quoted_newlines) + + def test_allow_quoted_newlines_hit(self): + config = self._get_target_class()() + config._properties["load"]["allowQuotedNewlines"] = True + self.assertTrue(config.allow_quoted_newlines) + + def test_allow_quoted_newlines_setter(self): + config = self._get_target_class()() + config.allow_quoted_newlines = True + self.assertTrue(config._properties["load"]["allowQuotedNewlines"]) + + def test_autodetect_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.autodetect) + + def test_autodetect_hit(self): + config = self._get_target_class()() + config._properties["load"]["autodetect"] = True + self.assertTrue(config.autodetect) + + def test_autodetect_setter(self): + config = self._get_target_class()() + config.autodetect = True + self.assertTrue(config._properties["load"]["autodetect"]) + + def test_clustering_fields_miss(self): + config = 
self._get_target_class()() + self.assertIsNone(config.clustering_fields) + + def test_clustering_fields_hit(self): + config = self._get_target_class()() + fields = ["email", "postal_code"] + config._properties["load"]["clustering"] = {"fields": fields} + self.assertEqual(config.clustering_fields, fields) + + def test_clustering_fields_setter(self): + fields = ["email", "postal_code"] + config = self._get_target_class()() + config.clustering_fields = fields + self.assertEqual(config._properties["load"]["clustering"], {"fields": fields}) + + def test_clustering_fields_setter_w_none(self): + config = self._get_target_class()() + fields = ["email", "postal_code"] + config._properties["load"]["clustering"] = {"fields": fields} + config.clustering_fields = None + self.assertIsNone(config.clustering_fields) + self.assertNotIn("clustering", config._properties["load"]) + + def test_create_disposition_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.create_disposition) + + def test_create_disposition_hit(self): + from google.cloud.bigquery.job import CreateDisposition + + disposition = CreateDisposition.CREATE_IF_NEEDED + config = self._get_target_class()() + config._properties["load"]["createDisposition"] = disposition + self.assertEqual(config.create_disposition, disposition) + + def test_create_disposition_setter(self): + from google.cloud.bigquery.job import CreateDisposition + + disposition = CreateDisposition.CREATE_IF_NEEDED + config = self._get_target_class()() + config.create_disposition = disposition + self.assertEqual(config._properties["load"]["createDisposition"], disposition) + + def test_decimal_target_types_miss(self): + config = self._get_target_class()() + self.assertIsNone(config.decimal_target_types) + + def test_decimal_target_types_hit(self): + from google.cloud.bigquery.enums import DecimalTargetType + + config = self._get_target_class()() + decimal_target_types = [DecimalTargetType.NUMERIC, DecimalTargetType.STRING] + 
config._properties["load"]["decimalTargetTypes"] = decimal_target_types + + expected = frozenset(decimal_target_types) + self.assertEqual(config.decimal_target_types, expected) + + def test_decimal_target_types_setter(self): + from google.cloud.bigquery.enums import DecimalTargetType + + decimal_target_types = (DecimalTargetType.NUMERIC, DecimalTargetType.BIGNUMERIC) + config = self._get_target_class()() + config.decimal_target_types = decimal_target_types + self.assertEqual( + config._properties["load"]["decimalTargetTypes"], + list(decimal_target_types), + ) + + def test_decimal_target_types_setter_w_none(self): + from google.cloud.bigquery.enums import DecimalTargetType + + config = self._get_target_class()() + decimal_target_types = [DecimalTargetType.BIGNUMERIC] + config._properties["load"]["decimalTargetTypes"] = decimal_target_types + + config.decimal_target_types = None + + self.assertIsNone(config.decimal_target_types) + self.assertNotIn("decimalTargetTypes", config._properties["load"]) + + config.decimal_target_types = None # No error if unsetting an unset property. 
+ + def test_destination_encryption_configuration_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.destination_encryption_configuration) + + def test_destination_encryption_configuration_hit(self): + from google.cloud.bigquery.encryption_configuration import ( + EncryptionConfiguration, + ) + + kms_key_name = "kms-key-name" + encryption_configuration = EncryptionConfiguration(kms_key_name) + config = self._get_target_class()() + config._properties["load"]["destinationEncryptionConfiguration"] = { + "kmsKeyName": kms_key_name + } + self.assertEqual( + config.destination_encryption_configuration, encryption_configuration + ) + + def test_destination_encryption_configuration_setter(self): + from google.cloud.bigquery.encryption_configuration import ( + EncryptionConfiguration, + ) + + kms_key_name = "kms-key-name" + encryption_configuration = EncryptionConfiguration(kms_key_name) + config = self._get_target_class()() + config.destination_encryption_configuration = encryption_configuration + expected = {"kmsKeyName": kms_key_name} + self.assertEqual( + config._properties["load"]["destinationEncryptionConfiguration"], expected + ) + + def test_destination_encryption_configuration_setter_w_none(self): + kms_key_name = "kms-key-name" + config = self._get_target_class()() + config._properties["load"]["destinationEncryptionConfiguration"] = { + "kmsKeyName": kms_key_name + } + config.destination_encryption_configuration = None + self.assertIsNone(config.destination_encryption_configuration) + self.assertNotIn( + "destinationEncryptionConfiguration", config._properties["load"] + ) + + def test_destination_table_description_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.destination_table_description) + + def test_destination_table_description_hit(self): + description = "Description" + config = self._get_target_class()() + config._properties["load"]["destinationTableProperties"] = { + "description": description + 
} + self.assertEqual(config.destination_table_description, description) + + def test_destination_table_description_setter(self): + description = "Description" + config = self._get_target_class()() + config.destination_table_description = description + expected = {"description": description} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + + def test_destination_table_description_setter_w_fn_already(self): + description = "Description" + friendly_name = "Friendly Name" + config = self._get_target_class()() + config._properties["load"]["destinationTableProperties"] = { + "friendlyName": friendly_name + } + config.destination_table_description = description + expected = {"friendlyName": friendly_name, "description": description} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + + def test_destination_table_description_w_none(self): + description = "Description" + friendly_name = "Friendly Name" + config = self._get_target_class()() + config._properties["load"]["destinationTableProperties"] = { + "description": description, + "friendlyName": friendly_name, + } + config.destination_table_description = None + expected = {"friendlyName": friendly_name} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + + def test_destination_table_friendly_name_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.destination_table_friendly_name) + + def test_destination_table_friendly_name_hit(self): + friendly_name = "Friendly Name" + config = self._get_target_class()() + config._properties["load"]["destinationTableProperties"] = { + "friendlyName": friendly_name + } + self.assertEqual(config.destination_table_friendly_name, friendly_name) + + def test_destination_table_friendly_name_setter(self): + friendly_name = "Friendly Name" + config = self._get_target_class()() + config.destination_table_friendly_name = friendly_name + 
expected = {"friendlyName": friendly_name} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + + def test_destination_table_friendly_name_setter_w_descr_already(self): + friendly_name = "Friendly Name" + description = "Description" + config = self._get_target_class()() + config._properties["load"]["destinationTableProperties"] = { + "description": description + } + config.destination_table_friendly_name = friendly_name + expected = {"friendlyName": friendly_name, "description": description} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + + def test_destination_table_friendly_name_w_none(self): + friendly_name = "Friendly Name" + description = "Description" + config = self._get_target_class()() + config._properties["load"]["destinationTableProperties"] = { + "description": description, + "friendlyName": friendly_name, + } + config.destination_table_friendly_name = None + expected = {"description": description} + self.assertEqual( + config._properties["load"]["destinationTableProperties"], expected + ) + + def test_encoding_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.encoding) + + def test_encoding_hit(self): + from google.cloud.bigquery.job import Encoding + + encoding = Encoding.UTF_8 + config = self._get_target_class()() + config._properties["load"]["encoding"] = encoding + self.assertEqual(config.encoding, encoding) + + def test_encoding_setter(self): + from google.cloud.bigquery.job import Encoding + + encoding = Encoding.UTF_8 + config = self._get_target_class()() + config.encoding = encoding + self.assertEqual(config._properties["load"]["encoding"], encoding) + + def test_field_delimiter_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.field_delimiter) + + def test_field_delimiter_hit(self): + field_delimiter = "|" + config = self._get_target_class()() + config._properties["load"]["fieldDelimiter"] = 
field_delimiter + self.assertEqual(config.field_delimiter, field_delimiter) + + def test_field_delimiter_setter(self): + field_delimiter = "|" + config = self._get_target_class()() + config.field_delimiter = field_delimiter + self.assertEqual(config._properties["load"]["fieldDelimiter"], field_delimiter) + + def test_hive_partitioning_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.hive_partitioning) + + def test_hive_partitioning_hit(self): + from google.cloud.bigquery.external_config import HivePartitioningOptions + + config = self._get_target_class()() + config._properties["load"]["hivePartitioningOptions"] = { + "sourceUriPrefix": "http://foo/bar", + "mode": "STRINGS", + } + result = config.hive_partitioning + self.assertIsInstance(result, HivePartitioningOptions) + self.assertEqual(result.source_uri_prefix, "http://foo/bar") + self.assertEqual(result.mode, "STRINGS") + + def test_hive_partitioning_setter(self): + from google.cloud.bigquery.external_config import HivePartitioningOptions + + hive_partitioning = HivePartitioningOptions() + hive_partitioning.source_uri_prefix = "http://foo/bar" + hive_partitioning.mode = "AUTO" + + config = self._get_target_class()() + config.hive_partitioning = hive_partitioning + self.assertEqual( + config._properties["load"]["hivePartitioningOptions"], + {"sourceUriPrefix": "http://foo/bar", "mode": "AUTO"}, + ) + + config.hive_partitioning = None + self.assertIsNone(config._properties["load"]["hivePartitioningOptions"]) + + def test_hive_partitioning_invalid_type(self): + config = self._get_target_class()() + + with self.assertRaises(TypeError): + config.hive_partitioning = {"mode": "AUTO"} + + def test_ignore_unknown_values_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.ignore_unknown_values) + + def test_ignore_unknown_values_hit(self): + config = self._get_target_class()() + config._properties["load"]["ignoreUnknownValues"] = True + 
self.assertTrue(config.ignore_unknown_values) + + def test_ignore_unknown_values_setter(self): + config = self._get_target_class()() + config.ignore_unknown_values = True + self.assertTrue(config._properties["load"]["ignoreUnknownValues"]) + + def test_max_bad_records_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.max_bad_records) + + def test_max_bad_records_hit(self): + max_bad_records = 13 + config = self._get_target_class()() + config._properties["load"]["maxBadRecords"] = max_bad_records + self.assertEqual(config.max_bad_records, max_bad_records) + + def test_max_bad_records_setter(self): + max_bad_records = 13 + config = self._get_target_class()() + config.max_bad_records = max_bad_records + self.assertEqual(config._properties["load"]["maxBadRecords"], max_bad_records) + + def test_null_marker_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.null_marker) + + def test_null_marker_hit(self): + null_marker = "XXX" + config = self._get_target_class()() + config._properties["load"]["nullMarker"] = null_marker + self.assertEqual(config.null_marker, null_marker) + + def test_null_marker_setter(self): + null_marker = "XXX" + config = self._get_target_class()() + config.null_marker = null_marker + self.assertEqual(config._properties["load"]["nullMarker"], null_marker) + + def test_projection_fields_miss(self): + config = self._get_target_class()() + self.assertIsNone(config.projection_fields) + + def test_projection_fields_hit(self): + config = self._get_target_class()() + fields = ["email", "postal_code"] + config.projection_fields = fields + self.assertEqual(config._properties["load"]["projectionFields"], fields) + self.assertEqual(config.projection_fields, fields) + + def test_quote_character_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.quote_character) + + def test_quote_character_hit(self): + quote_character = "'" + config = self._get_target_class()() + 
config._properties["load"]["quote"] = quote_character + self.assertEqual(config.quote_character, quote_character) + + def test_quote_character_setter(self): + quote_character = "'" + config = self._get_target_class()() + config.quote_character = quote_character + self.assertEqual(config._properties["load"]["quote"], quote_character) + + def test_schema_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.schema) + + def test_schema_hit(self): + from google.cloud.bigquery.schema import SchemaField + + config = self._get_target_class()() + all_props_repr = { + "mode": "REQUIRED", + "name": "foo", + "type": "INTEGER", + "description": "Foo", + } + minimal_repr = {"name": "bar", "type": "STRING"} + config._properties["load"]["schema"] = { + "fields": [all_props_repr, minimal_repr] + } + all_props, minimal = config.schema + self.assertEqual(all_props, SchemaField.from_api_repr(all_props_repr)) + self.assertEqual(minimal, SchemaField.from_api_repr(minimal_repr)) + + def test_schema_setter_fields(self): + from google.cloud.bigquery.schema import SchemaField + + config = self._get_target_class()() + full_name = SchemaField("full_name", "STRING", mode="REQUIRED") + age = SchemaField("age", "INTEGER", mode="REQUIRED") + config.schema = [full_name, age] + full_name_repr = { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + "policyTags": {"names": []}, + } + age_repr = { + "name": "age", + "type": "INTEGER", + "mode": "REQUIRED", + "policyTags": {"names": []}, + } + self.assertEqual( + config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} + ) + + def test_schema_setter_valid_mappings_list(self): + config = self._get_target_class()() + + full_name_repr = { + "name": "full_name", + "type": "STRING", + "mode": "REQUIRED", + "policyTags": {"names": []}, + } + age_repr = { + "name": "age", + "type": "INTEGER", + "mode": "REQUIRED", + "policyTags": {"names": []}, + } + schema = [full_name_repr, age_repr] + 
config.schema = schema + self.assertEqual( + config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} + ) + + def test_schema_setter_invalid_mappings_list(self): + config = self._get_target_class()() + + schema = [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "typeoo": "INTEGER", "mode": "REQUIRED"}, + ] + + with self.assertRaises(Exception): + config.schema = schema + + def test_schema_setter_unsetting_schema(self): + from google.cloud.bigquery.schema import SchemaField + + config = self._get_target_class()() + config._properties["load"]["schema"] = [ + SchemaField("full_name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + + config.schema = None + self.assertNotIn("schema", config._properties["load"]) + config.schema = None # no error, idempotent operation + + def test_schema_update_options_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.schema_update_options) + + def test_schema_update_options_hit(self): + from google.cloud.bigquery.job import SchemaUpdateOption + + options = [ + SchemaUpdateOption.ALLOW_FIELD_ADDITION, + SchemaUpdateOption.ALLOW_FIELD_RELAXATION, + ] + config = self._get_target_class()() + config._properties["load"]["schemaUpdateOptions"] = options + self.assertEqual(config.schema_update_options, options) + + def test_schema_update_options_setter(self): + from google.cloud.bigquery.job import SchemaUpdateOption + + options = [ + SchemaUpdateOption.ALLOW_FIELD_ADDITION, + SchemaUpdateOption.ALLOW_FIELD_RELAXATION, + ] + config = self._get_target_class()() + config.schema_update_options = options + self.assertEqual(config._properties["load"]["schemaUpdateOptions"], options) + + def test_skip_leading_rows_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.skip_leading_rows) + + def test_skip_leading_rows_hit_w_str(self): + skip_leading_rows = 1 + config = self._get_target_class()() + 
config._properties["load"]["skipLeadingRows"] = str(skip_leading_rows) + self.assertEqual(config.skip_leading_rows, skip_leading_rows) + + def test_skip_leading_rows_hit_w_integer(self): + skip_leading_rows = 1 + config = self._get_target_class()() + config._properties["load"]["skipLeadingRows"] = skip_leading_rows + self.assertEqual(config.skip_leading_rows, skip_leading_rows) + + def test_skip_leading_rows_setter(self): + skip_leading_rows = 1 + config = self._get_target_class()() + config.skip_leading_rows = skip_leading_rows + self.assertEqual( + config._properties["load"]["skipLeadingRows"], str(skip_leading_rows) + ) + + def test_source_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.source_format) + + def test_source_format_hit(self): + from google.cloud.bigquery.job import SourceFormat + + source_format = SourceFormat.CSV + config = self._get_target_class()() + config._properties["load"]["sourceFormat"] = source_format + self.assertEqual(config.source_format, source_format) + + def test_source_format_setter(self): + from google.cloud.bigquery.job import SourceFormat + + source_format = SourceFormat.CSV + config = self._get_target_class()() + config.source_format = source_format + self.assertEqual(config._properties["load"]["sourceFormat"], source_format) + + def test_range_partitioning_w_none(self): + object_under_test = self._get_target_class()() + assert object_under_test.range_partitioning is None + + def test_range_partitioning_w_value(self): + object_under_test = self._get_target_class()() + object_under_test._properties["load"]["rangePartitioning"] = { + "field": "column_one", + "range": {"start": 1, "end": 1000, "interval": 10}, + } + # Assert every comparison: a bare "==" expression is evaluated and discarded. + assert object_under_test.range_partitioning.field == "column_one" + assert object_under_test.range_partitioning.range_.start == 1 + assert object_under_test.range_partitioning.range_.end == 1000 + assert object_under_test.range_partitioning.range_.interval == 10 + + def test_range_partitioning_setter(self): +
from google.cloud.bigquery.table import PartitionRange + from google.cloud.bigquery.table import RangePartitioning + + object_under_test = self._get_target_class()() + object_under_test.range_partitioning = RangePartitioning( + field="column_one", range_=PartitionRange(start=1, end=1000, interval=10) + ) + # Assert every comparison: a bare "==" expression is evaluated and discarded. + assert object_under_test.range_partitioning.field == "column_one" + assert object_under_test.range_partitioning.range_.start == 1 + assert object_under_test.range_partitioning.range_.end == 1000 + assert object_under_test.range_partitioning.range_.interval == 10 + + def test_range_partitioning_setter_w_none(self): + object_under_test = self._get_target_class()() + object_under_test.range_partitioning = None + assert object_under_test.range_partitioning is None + + def test_range_partitioning_setter_w_wrong_type(self): + object_under_test = self._get_target_class()() + with pytest.raises(ValueError, match="RangePartitioning"): + object_under_test.range_partitioning = object() + + def test_time_partitioning_miss(self): + config = self._get_target_class()() + self.assertIsNone(config.time_partitioning) + + def test_time_partitioning_hit(self): + from google.cloud.bigquery.table import TimePartitioning + from google.cloud.bigquery.table import TimePartitioningType + + field = "creation_date" + year_ms = 86400 * 1000 * 365 + config = self._get_target_class()() + config._properties["load"]["timePartitioning"] = { + "type": TimePartitioningType.DAY, + "field": field, + "expirationMs": str(year_ms), + "requirePartitionFilter": False, + } + with warnings.catch_warnings(record=True) as warned: + expected = TimePartitioning( + type_=TimePartitioningType.DAY, + field=field, + expiration_ms=year_ms, + require_partition_filter=False, + ) + self.assertEqual(config.time_partitioning, expected) + + assert len(warned) == 1 + warning = warned[0] + assert "TimePartitioning.require_partition_filter" in str(warning) + + def test_time_partitioning_setter(self): + from google.cloud.bigquery.table import
TimePartitioning + from google.cloud.bigquery.table import TimePartitioningType + + field = "creation_date" + year_ms = 86400 * 1000 * 365 + + with warnings.catch_warnings(record=True) as warned: + time_partitioning = TimePartitioning( + type_=TimePartitioningType.DAY, + field=field, + expiration_ms=year_ms, + require_partition_filter=False, + ) + + config = self._get_target_class()() + config.time_partitioning = time_partitioning + expected = { + "type": TimePartitioningType.DAY, + "field": field, + "expirationMs": str(year_ms), + "requirePartitionFilter": False, + } + self.assertEqual(config._properties["load"]["timePartitioning"], expected) + + assert len(warned) == 1 + warning = warned[0] + assert "TimePartitioning.require_partition_filter" in str(warning) + + def test_time_partitioning_setter_w_none(self): + from google.cloud.bigquery.table import TimePartitioningType + + field = "creation_date" + year_ms = 86400 * 1000 * 365 + config = self._get_target_class()() + config._properties["load"]["timePartitioning"] = { + "type": TimePartitioningType.DAY, + "field": field, + "expirationMs": str(year_ms), + "requirePartitionFilter": False, + } + config.time_partitioning = None + self.assertIsNone(config.time_partitioning) + self.assertNotIn("timePartitioning", config._properties["load"]) + + def test_use_avro_logical_types(self): + config = self._get_target_class()() + self.assertIsNone(config.use_avro_logical_types) + + def test_use_avro_logical_types_setter(self): + config = self._get_target_class()() + config.use_avro_logical_types = True + self.assertTrue(config._properties["load"]["useAvroLogicalTypes"]) + + def test_write_disposition_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.write_disposition) + + def test_write_disposition_hit(self): + from google.cloud.bigquery.job import WriteDisposition + + write_disposition = WriteDisposition.WRITE_TRUNCATE + config = self._get_target_class()() + 
config._properties["load"]["writeDisposition"] = write_disposition + self.assertEqual(config.write_disposition, write_disposition) + + def test_write_disposition_setter(self): + from google.cloud.bigquery.job import WriteDisposition + + write_disposition = WriteDisposition.WRITE_TRUNCATE + config = self._get_target_class()() + config.write_disposition = write_disposition + self.assertEqual( + config._properties["load"]["writeDisposition"], write_disposition + ) + + def test_parquet_options_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.parquet_options) + + def test_parquet_options_hit(self): + config = self._get_target_class()() + config._properties["load"]["parquetOptions"] = dict( + enumAsString=True, enableListInference=False + ) + self.assertTrue(config.parquet_options.enum_as_string) + self.assertFalse(config.parquet_options.enable_list_inference) + + def test_parquet_options_setter(self): + from google.cloud.bigquery.format_options import ParquetOptions + + parquet_options = ParquetOptions.from_api_repr( + dict(enumAsString=False, enableListInference=True) + ) + config = self._get_target_class()() + + config.parquet_options = parquet_options + self.assertEqual( + config._properties["load"]["parquetOptions"], + {"enumAsString": False, "enableListInference": True}, + ) + + def test_parquet_options_setter_clearing(self): + config = self._get_target_class()() + config._properties["load"]["parquetOptions"] = dict( + enumAsString=False, enableListInference=True + ) + + config.parquet_options = None + self.assertNotIn("parquetOptions", config._properties["load"]) diff --git a/tests/unit/job/test_query.py b/tests/unit/job/test_query.py new file mode 100644 index 000000000..d41370520 --- /dev/null +++ b/tests/unit/job/test_query.py @@ -0,0 +1,1982 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import concurrent +import copy +import http +import textwrap +import types + +import freezegun +from google.api_core import exceptions +import google.api_core.retry +import mock +import requests + +from google.cloud.bigquery.client import _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS +import google.cloud.bigquery.query +from .helpers import _Base +from .helpers import _make_client +from .helpers import _make_connection + + +class TestQueryJob(_Base): + JOB_TYPE = "query" + QUERY = "select count(*) from persons" + DESTINATION_TABLE = "destination_table" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import QueryJob + + return QueryJob + + def _make_resource(self, started=False, ended=False, location="US"): + resource = super(TestQueryJob, self)._make_resource( + started, ended, location=location + ) + config = resource["configuration"]["query"] + config["query"] = self.QUERY + return resource + + def _verifyBooleanResourceProperties(self, job, config): + + if "allowLargeResults" in config: + self.assertEqual(job.allow_large_results, config["allowLargeResults"]) + else: + self.assertIsNone(job.allow_large_results) + if "flattenResults" in config: + self.assertEqual(job.flatten_results, config["flattenResults"]) + else: + self.assertIsNone(job.flatten_results) + if "useQueryCache" in config: + self.assertEqual(job.use_query_cache, config["useQueryCache"]) + else: + self.assertIsNone(job.use_query_cache) + if "useLegacySql" in config: + self.assertEqual(job.use_legacy_sql, config["useLegacySql"]) + else: + 
self.assertIsNone(job.use_legacy_sql) + + def _verifyIntegerResourceProperties(self, job, config): + if "maximumBillingTier" in config: + self.assertEqual(job.maximum_billing_tier, config["maximumBillingTier"]) + else: + self.assertIsNone(job.maximum_billing_tier) + if "maximumBytesBilled" in config: + self.assertEqual( + str(job.maximum_bytes_billed), config["maximumBytesBilled"] + ) + self.assertIsInstance(job.maximum_bytes_billed, int) + else: + self.assertIsNone(job.maximum_bytes_billed) + + def _verify_udf_resources(self, job, config): + udf_resources = config.get("userDefinedFunctionResources", ()) + self.assertEqual(len(job.udf_resources), len(udf_resources)) + for found, expected in zip(job.udf_resources, udf_resources): + if "resourceUri" in expected: + self.assertEqual(found.udf_type, "resourceUri") + self.assertEqual(found.value, expected["resourceUri"]) + else: + self.assertEqual(found.udf_type, "inlineCode") + self.assertEqual(found.value, expected["inlineCode"]) + + def _verifyQueryParameters(self, job, config): + query_parameters = config.get("queryParameters", ()) + self.assertEqual(len(job.query_parameters), len(query_parameters)) + for found, expected in zip(job.query_parameters, query_parameters): + self.assertEqual(found.to_api_repr(), expected) + + def _verify_table_definitions(self, job, config): + table_defs = config.get("tableDefinitions") + if job.table_definitions is None: + self.assertIsNone(table_defs) + else: + self.assertEqual(len(job.table_definitions), len(table_defs)) + for found_key, found_ec in job.table_definitions.items(): + expected_ec = table_defs.get(found_key) + self.assertIsNotNone(expected_ec) + self.assertEqual(found_ec.to_api_repr(), expected_ec) + + def _verify_dml_stats_resource_properties(self, job, resource): + query_stats = resource.get("statistics", {}).get("query", {}) + + if "dmlStats" in query_stats: + resource_dml_stats = query_stats["dmlStats"] + job_dml_stats = job.dml_stats + assert 
str(job_dml_stats.inserted_row_count) == resource_dml_stats.get( + "insertedRowCount", "0" + ) + assert str(job_dml_stats.updated_row_count) == resource_dml_stats.get( + "updatedRowCount", "0" + ) + assert str(job_dml_stats.deleted_row_count) == resource_dml_stats.get( + "deletedRowCount", "0" + ) + else: + assert job.dml_stats is None + + def _verify_transaction_info_resource_properties(self, job, resource): + resource_stats = resource.get("statistics", {}) + + if "transactionInfo" in resource_stats: + resource_transaction_info = resource_stats["transactionInfo"] + job_transaction_info = job.transaction_info + assert job_transaction_info.transaction_id == resource_transaction_info.get( + "transactionId" + ) + else: + assert job.transaction_info is None + + def _verify_configuration_properties(self, job, configuration): + if "dryRun" in configuration: + self.assertEqual(job.dry_run, configuration["dryRun"]) + else: + self.assertIsNone(job.dry_run) + + def _verifyResourceProperties(self, job, resource): + self._verifyReadonlyResourceProperties(job, resource) + self._verify_dml_stats_resource_properties(job, resource) + self._verify_transaction_info_resource_properties(job, resource) + + configuration = resource.get("configuration", {}) + self._verify_configuration_properties(job, configuration) + + query_config = resource.get("configuration", {}).get("query") + self._verifyBooleanResourceProperties(job, query_config) + self._verifyIntegerResourceProperties(job, query_config) + self._verify_udf_resources(job, query_config) + self._verifyQueryParameters(job, query_config) + self._verify_table_definitions(job, query_config) + + self.assertEqual(job.query, query_config["query"]) + + if "createDisposition" in query_config: + self.assertEqual(job.create_disposition, query_config["createDisposition"]) + else: + self.assertIsNone(job.create_disposition) + + if "defaultDataset" in query_config: + ds_ref = job.default_dataset + ds_ref = {"projectId": ds_ref.project, 
"datasetId": ds_ref.dataset_id} + self.assertEqual(ds_ref, query_config["defaultDataset"]) + else: + self.assertIsNone(job.default_dataset) + + if "destinationTable" in query_config: + table = job.destination + tb_ref = { + "projectId": table.project, + "datasetId": table.dataset_id, + "tableId": table.table_id, + } + self.assertEqual(tb_ref, query_config["destinationTable"]) + else: + self.assertIsNone(job.destination) + + if "priority" in query_config: + self.assertEqual(job.priority, query_config["priority"]) + else: + self.assertIsNone(job.priority) + + if "writeDisposition" in query_config: + self.assertEqual(job.write_disposition, query_config["writeDisposition"]) + else: + self.assertIsNone(job.write_disposition) + + if "destinationEncryptionConfiguration" in query_config: + self.assertIsNotNone(job.destination_encryption_configuration) + self.assertEqual( + job.destination_encryption_configuration.kms_key_name, + query_config["destinationEncryptionConfiguration"]["kmsKeyName"], + ) + else: + self.assertIsNone(job.destination_encryption_configuration) + + if "schemaUpdateOptions" in query_config: + self.assertEqual( + job.schema_update_options, query_config["schemaUpdateOptions"] + ) + else: + self.assertIsNone(job.schema_update_options) + + def test_ctor_defaults(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertEqual(job.query, self.QUERY) + self.assertIs(job._client, client) + self.assertEqual(job.job_type, self.JOB_TYPE) + self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) + + self._verifyInitialReadonlyProperties(job) + + self.assertFalse(job.use_legacy_sql) + + # set/read from resource['configuration']['query'] + self.assertIsNone(job.allow_large_results) + self.assertIsNone(job.create_disposition) + self.assertIsNone(job.default_dataset) + self.assertIsNone(job.destination) + self.assertIsNone(job.dml_stats) + self.assertIsNone(job.flatten_results) + 
self.assertIsNone(job.priority) + self.assertIsNone(job.use_query_cache) + self.assertIsNone(job.dry_run) + self.assertIsNone(job.write_disposition) + self.assertIsNone(job.maximum_billing_tier) + self.assertIsNone(job.maximum_bytes_billed) + self.assertIsNone(job.table_definitions) + self.assertIsNone(job.destination_encryption_configuration) + self.assertIsNone(job.range_partitioning) + self.assertIsNone(job.time_partitioning) + self.assertIsNone(job.clustering_fields) + self.assertIsNone(job.schema_update_options) + + def test_ctor_w_udf_resources(self): + from google.cloud.bigquery.job import QueryJobConfig + from google.cloud.bigquery.query import UDFResource + + RESOURCE_URI = "gs://some-bucket/js/lib.js" + udf_resources = [UDFResource("resourceUri", RESOURCE_URI)] + client = _make_client(project=self.PROJECT) + config = QueryJobConfig() + config.udf_resources = udf_resources + job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) + self.assertEqual(job.udf_resources, udf_resources) + + def test_ctor_w_query_parameters(self): + from google.cloud.bigquery.job import QueryJobConfig + from google.cloud.bigquery.query import ScalarQueryParameter + + query_parameters = [ScalarQueryParameter("foo", "INT64", 123)] + client = _make_client(project=self.PROJECT) + config = QueryJobConfig(query_parameters=query_parameters) + job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) + self.assertEqual(job.query_parameters, query_parameters) + + def test_from_api_repr_missing_identity(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = {} + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_missing_config(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, 
+ } + klass = self._get_target_class() + with self.assertRaises(KeyError): + klass.from_api_repr(RESOURCE, client=client) + + def test_from_api_repr_bare(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": {"query": {"query": self.QUERY}}, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_with_encryption(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "query": { + "query": self.QUERY, + "destinationEncryptionConfiguration": { + "kmsKeyName": self.KMS_KEY_NAME + }, + } + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_with_dml_stats(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": {"query": {"query": self.QUERY}}, + "statistics": { + "query": { + "dmlStats": {"insertedRowCount": "15", "updatedRowCount": "2"}, + }, + }, + } + klass = self._get_target_class() + + job = klass.from_api_repr(RESOURCE, client=client) + + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_with_transaction_info(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = { + "id": self.JOB_ID, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": {"query": {"query": self.QUERY}}, + "statistics": 
{"transactionInfo": {"transactionId": "1a2b-3c4d"}}, + } + klass = self._get_target_class() + + job = klass.from_api_repr(RESOURCE, client=client) + + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_from_api_repr_w_properties(self): + from google.cloud.bigquery.job import CreateDisposition + from google.cloud.bigquery.job import SchemaUpdateOption + from google.cloud.bigquery.job import WriteDisposition + + client = _make_client(project=self.PROJECT) + RESOURCE = self._make_resource() + query_config = RESOURCE["configuration"]["query"] + query_config["createDisposition"] = CreateDisposition.CREATE_IF_NEEDED + query_config["writeDisposition"] = WriteDisposition.WRITE_TRUNCATE + query_config["destinationTable"] = { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.DESTINATION_TABLE, + } + query_config["schemaUpdateOptions"] = [SchemaUpdateOption.ALLOW_FIELD_ADDITION] + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) + self.assertIs(job._client, client) + self._verifyResourceProperties(job, RESOURCE) + + def test_cancelled(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + job._properties["status"] = { + "state": "DONE", + "errorResult": {"reason": "stopped"}, + } + + self.assertTrue(job.cancelled()) + + def test__done_or_raise_w_timeout(self): + client = _make_client(project=self.PROJECT) + resource = self._make_resource(ended=False) + job = self._get_target_class().from_api_repr(resource, client) + + with mock.patch.object( + client, "_get_query_results" + ) as fake_get_results, mock.patch.object(job, "reload") as fake_reload: + job._done_or_raise(timeout=42) + + fake_get_results.assert_called_once() + call_args = fake_get_results.call_args + self.assertEqual(call_args.kwargs.get("timeout"), 42) + + call_args = fake_reload.call_args + self.assertEqual(call_args.kwargs.get("timeout"), 42) + + def 
test__done_or_raise_w_timeout_and_longer_internal_api_timeout(self): + client = _make_client(project=self.PROJECT) + resource = self._make_resource(ended=False) + job = self._get_target_class().from_api_repr(resource, client) + job._done_timeout = 8.8 + + with mock.patch.object( + client, "_get_query_results" + ) as fake_get_results, mock.patch.object(job, "reload") as fake_reload: + job._done_or_raise(timeout=5.5) + + # The expected timeout used is simply the given timeout, as the latter + # is shorter than the job's internal done timeout. + expected_timeout = 5.5 + + fake_get_results.assert_called_once() + call_args = fake_get_results.call_args + self.assertAlmostEqual(call_args.kwargs.get("timeout"), expected_timeout) + + call_args = fake_reload.call_args + self.assertAlmostEqual(call_args.kwargs.get("timeout"), expected_timeout) + + def test__done_or_raise_w_query_results_error_reload_ok(self): + client = _make_client(project=self.PROJECT) + bad_request_error = exceptions.BadRequest("Error in query") + client._get_query_results = mock.Mock(side_effect=bad_request_error) + + resource = self._make_resource(ended=False) + job = self._get_target_class().from_api_repr(resource, client) + job._exception = None + + def fake_reload(self, *args, **kwargs): + self._properties["status"]["state"] = "DONE" + self.set_exception(copy.copy(bad_request_error)) + + fake_reload_method = types.MethodType(fake_reload, job) + + with mock.patch.object(job, "reload", new=fake_reload_method): + job._done_or_raise() + + assert isinstance(job._exception, exceptions.BadRequest) + + def test__done_or_raise_w_query_results_error_reload_error(self): + client = _make_client(project=self.PROJECT) + bad_request_error = exceptions.BadRequest("Error in query") + client._get_query_results = mock.Mock(side_effect=bad_request_error) + + resource = self._make_resource(ended=False) + job = self._get_target_class().from_api_repr(resource, client) + reload_error = exceptions.DataLoss("Oops, sorry!") + 
job.reload = mock.Mock(side_effect=reload_error) + job._exception = None + + job._done_or_raise() + + assert job._exception is bad_request_error + + def test__done_or_raise_w_job_query_results_ok_reload_error(self): + client = _make_client(project=self.PROJECT) + query_results = google.cloud.bigquery.query._QueryResults( + properties={ + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": "12345"}, + } + ) + client._get_query_results = mock.Mock(return_value=query_results) + + resource = self._make_resource(ended=False) + job = self._get_target_class().from_api_repr(resource, client) + retry_error = exceptions.RetryError("Too many retries", cause=TimeoutError) + job.reload = mock.Mock(side_effect=retry_error) + job._exception = None + + job._done_or_raise() + + assert job._exception is retry_error + + def test_query_plan(self): + from google.cloud._helpers import _RFC3339_MICROS + from google.cloud.bigquery.job import QueryPlanEntry + from google.cloud.bigquery.job import QueryPlanEntryStep + + plan_entries = [ + { + "name": "NAME", + "id": "1234", + "inputStages": ["88", "101"], + "startMs": "1522540800000", + "endMs": "1522540804000", + "parallelInputs": "1000", + "completedParallelInputs": "5", + "waitMsAvg": "33", + "waitMsMax": "400", + "waitRatioAvg": 2.71828, + "waitRatioMax": 3.14159, + "readMsAvg": "45", + "readMsMax": "90", + "readRatioAvg": 1.41421, + "readRatioMax": 1.73205, + "computeMsAvg": "55", + "computeMsMax": "99", + "computeRatioAvg": 0.69315, + "computeRatioMax": 1.09861, + "writeMsAvg": "203", + "writeMsMax": "340", + "writeRatioAvg": 3.32193, + "writeRatioMax": 2.30258, + "recordsRead": "100", + "recordsWritten": "1", + "status": "STATUS", + "shuffleOutputBytes": "1024", + "shuffleOutputBytesSpilled": "1", + "steps": [{"kind": "KIND", "substeps": ["SUBSTEP1", "SUBSTEP2"]}], + } + ] + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertEqual(job.query_plan, 
[]) + + statistics = job._properties["statistics"] = {} + self.assertEqual(job.query_plan, []) + + query_stats = statistics["query"] = {} + self.assertEqual(job.query_plan, []) + + query_stats["queryPlan"] = plan_entries + + self.assertEqual(len(job.query_plan), len(plan_entries)) + for found, expected in zip(job.query_plan, plan_entries): + self.assertIsInstance(found, QueryPlanEntry) + self.assertEqual(found.name, expected["name"]) + self.assertEqual(found.entry_id, expected["id"]) + self.assertEqual(len(found.input_stages), len(expected["inputStages"])) + for f_id in found.input_stages: + self.assertIn(f_id, [int(e) for e in expected["inputStages"]]) + self.assertEqual( + found.start.strftime(_RFC3339_MICROS), "2018-04-01T00:00:00.000000Z" + ) + self.assertEqual( + found.end.strftime(_RFC3339_MICROS), "2018-04-01T00:00:04.000000Z" + ) + self.assertEqual(found.parallel_inputs, int(expected["parallelInputs"])) + self.assertEqual( + found.completed_parallel_inputs, + int(expected["completedParallelInputs"]), + ) + self.assertEqual(found.wait_ms_avg, int(expected["waitMsAvg"])) + self.assertEqual(found.wait_ms_max, int(expected["waitMsMax"])) + self.assertEqual(found.wait_ratio_avg, expected["waitRatioAvg"]) + self.assertEqual(found.wait_ratio_max, expected["waitRatioMax"]) + self.assertEqual(found.read_ms_avg, int(expected["readMsAvg"])) + self.assertEqual(found.read_ms_max, int(expected["readMsMax"])) + self.assertEqual(found.read_ratio_avg, expected["readRatioAvg"]) + self.assertEqual(found.read_ratio_max, expected["readRatioMax"]) + self.assertEqual(found.compute_ms_avg, int(expected["computeMsAvg"])) + self.assertEqual(found.compute_ms_max, int(expected["computeMsMax"])) + self.assertEqual(found.compute_ratio_avg, expected["computeRatioAvg"]) + self.assertEqual(found.compute_ratio_max, expected["computeRatioMax"]) + self.assertEqual(found.write_ms_avg, int(expected["writeMsAvg"])) + self.assertEqual(found.write_ms_max, int(expected["writeMsMax"])) + 
self.assertEqual(found.write_ratio_avg, expected["writeRatioAvg"]) + self.assertEqual(found.write_ratio_max, expected["writeRatioMax"]) + self.assertEqual(found.records_read, int(expected["recordsRead"])) + self.assertEqual(found.records_written, int(expected["recordsWritten"])) + self.assertEqual(found.status, expected["status"]) + self.assertEqual( + found.shuffle_output_bytes, int(expected["shuffleOutputBytes"]) + ) + self.assertEqual( + found.shuffle_output_bytes_spilled, + int(expected["shuffleOutputBytesSpilled"]), + ) + + self.assertEqual(len(found.steps), len(expected["steps"])) + for f_step, e_step in zip(found.steps, expected["steps"]): + self.assertIsInstance(f_step, QueryPlanEntryStep) + self.assertEqual(f_step.kind, e_step["kind"]) + self.assertEqual(f_step.substeps, e_step["substeps"]) + + def test_total_bytes_processed(self): + total_bytes = 1234 + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.total_bytes_processed) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.total_bytes_processed) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.total_bytes_processed) + + query_stats["totalBytesProcessed"] = str(total_bytes) + self.assertEqual(job.total_bytes_processed, total_bytes) + + def test_total_bytes_billed(self): + total_bytes = 1234 + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.total_bytes_billed) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.total_bytes_billed) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.total_bytes_billed) + + query_stats["totalBytesBilled"] = str(total_bytes) + self.assertEqual(job.total_bytes_billed, total_bytes) + + def test_billing_tier(self): + billing_tier = 1 + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + 
self.assertIsNone(job.billing_tier) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.billing_tier) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.billing_tier) + + query_stats["billingTier"] = billing_tier + self.assertEqual(job.billing_tier, billing_tier) + + def test_cache_hit(self): + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.cache_hit) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.cache_hit) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.cache_hit) + + query_stats["cacheHit"] = True + self.assertTrue(job.cache_hit) + + def test_ddl_operation_performed(self): + op = "SKIP" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.ddl_operation_performed) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.ddl_operation_performed) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.ddl_operation_performed) + + query_stats["ddlOperationPerformed"] = op + self.assertEqual(job.ddl_operation_performed, op) + + def test_ddl_target_routine(self): + from google.cloud.bigquery.routine import RoutineReference + + ref_routine = { + "projectId": self.PROJECT, + "datasetId": "ddl_ds", + "routineId": "targetroutine", + } + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.ddl_target_routine) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.ddl_target_routine) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.ddl_target_routine) + + query_stats["ddlTargetRoutine"] = ref_routine + self.assertIsInstance(job.ddl_target_routine, RoutineReference) + self.assertEqual(job.ddl_target_routine.routine_id, "targetroutine") + self.assertEqual(job.ddl_target_routine.dataset_id, "ddl_ds") + 
self.assertEqual(job.ddl_target_routine.project, self.PROJECT) + + def test_ddl_target_table(self): + from google.cloud.bigquery.table import TableReference + + ref_table = { + "projectId": self.PROJECT, + "datasetId": "ddl_ds", + "tableId": "targettable", + } + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.ddl_target_table) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.ddl_target_table) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.ddl_target_table) + + query_stats["ddlTargetTable"] = ref_table + self.assertIsInstance(job.ddl_target_table, TableReference) + self.assertEqual(job.ddl_target_table.table_id, "targettable") + self.assertEqual(job.ddl_target_table.dataset_id, "ddl_ds") + self.assertEqual(job.ddl_target_table.project, self.PROJECT) + + def test_num_dml_affected_rows(self): + num_rows = 1234 + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.num_dml_affected_rows) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.num_dml_affected_rows) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.num_dml_affected_rows) + + query_stats["numDmlAffectedRows"] = str(num_rows) + self.assertEqual(job.num_dml_affected_rows, num_rows) + + def test_slot_millis(self): + millis = 1234 + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.slot_millis) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.slot_millis) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.slot_millis) + + query_stats["totalSlotMs"] = millis + self.assertEqual(job.slot_millis, millis) + + def test_statement_type(self): + statement_type = "SELECT" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + 
self.assertIsNone(job.statement_type) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.statement_type) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.statement_type) + + query_stats["statementType"] = statement_type + self.assertEqual(job.statement_type, statement_type) + + def test_referenced_tables(self): + from google.cloud.bigquery.table import TableReference + + ref_tables_resource = [ + {"projectId": self.PROJECT, "datasetId": "dataset", "tableId": "local1"}, + {"projectId": self.PROJECT, "datasetId": "dataset", "tableId": "local2"}, + { + "projectId": "other-project-123", + "datasetId": "other-dataset", + "tableId": "other-table", + }, + ] + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertEqual(job.referenced_tables, []) + + statistics = job._properties["statistics"] = {} + self.assertEqual(job.referenced_tables, []) + + query_stats = statistics["query"] = {} + self.assertEqual(job.referenced_tables, []) + + query_stats["referencedTables"] = ref_tables_resource + + local1, local2, remote = job.referenced_tables + + self.assertIsInstance(local1, TableReference) + self.assertEqual(local1.table_id, "local1") + self.assertEqual(local1.dataset_id, "dataset") + self.assertEqual(local1.project, self.PROJECT) + + self.assertIsInstance(local2, TableReference) + self.assertEqual(local2.table_id, "local2") + self.assertEqual(local2.dataset_id, "dataset") + self.assertEqual(local2.project, self.PROJECT) + + self.assertIsInstance(remote, TableReference) + self.assertEqual(remote.table_id, "other-table") + self.assertEqual(remote.dataset_id, "other-dataset") + self.assertEqual(remote.project, "other-project-123") + + def test_timeline(self): + timeline_resource = [ + { + "elapsedMs": 1, + "activeUnits": 22, + "pendingUnits": 33, + "completedUnits": 44, + "totalSlotMs": 101, + } + ] + + client = _make_client(project=self.PROJECT) + job = 
self._make_one(self.JOB_ID, self.QUERY, client) + self.assertEqual(job.timeline, []) + + statistics = job._properties["statistics"] = {} + self.assertEqual(job.timeline, []) + + query_stats = statistics["query"] = {} + self.assertEqual(job.timeline, []) + + query_stats["timeline"] = timeline_resource + + self.assertEqual(len(job.timeline), len(timeline_resource)) + self.assertEqual(job.timeline[0].elapsed_ms, 1) + self.assertEqual(job.timeline[0].active_units, 22) + self.assertEqual(job.timeline[0].pending_units, 33) + self.assertEqual(job.timeline[0].completed_units, 44) + self.assertEqual(job.timeline[0].slot_millis, 101) + + def test_undeclared_query_parameters(self): + from google.cloud.bigquery.query import ArrayQueryParameter + from google.cloud.bigquery.query import ScalarQueryParameter + from google.cloud.bigquery.query import StructQueryParameter + + undeclared = [ + { + "name": "my_scalar", + "parameterType": {"type": "STRING"}, + "parameterValue": {"value": "value"}, + }, + { + "name": "my_array", + "parameterType": {"type": "ARRAY", "arrayType": {"type": "INT64"}}, + "parameterValue": { + "arrayValues": [{"value": "1066"}, {"value": "1745"}] + }, + }, + { + "name": "my_struct", + "parameterType": { + "type": "STRUCT", + "structTypes": [{"name": "count", "type": {"type": "INT64"}}], + }, + "parameterValue": {"structValues": {"count": {"value": "123"}}}, + }, + ] + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertEqual(job.undeclared_query_parameters, []) + + statistics = job._properties["statistics"] = {} + self.assertEqual(job.undeclared_query_parameters, []) + + query_stats = statistics["query"] = {} + self.assertEqual(job.undeclared_query_parameters, []) + + query_stats["undeclaredQueryParameters"] = undeclared + + scalar, array, struct = job.undeclared_query_parameters + + self.assertIsInstance(scalar, ScalarQueryParameter) + self.assertEqual(scalar.name, "my_scalar") + 
self.assertEqual(scalar.type_, "STRING") + self.assertEqual(scalar.value, "value") + + self.assertIsInstance(array, ArrayQueryParameter) + self.assertEqual(array.name, "my_array") + self.assertEqual(array.array_type, "INT64") + self.assertEqual(array.values, [1066, 1745]) + + self.assertIsInstance(struct, StructQueryParameter) + self.assertEqual(struct.name, "my_struct") + self.assertEqual(struct.struct_types, {"count": "INT64"}) + self.assertEqual(struct.struct_values, {"count": 123}) + + def test_estimated_bytes_processed(self): + est_bytes = 123456 + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + self.assertIsNone(job.estimated_bytes_processed) + + statistics = job._properties["statistics"] = {} + self.assertIsNone(job.estimated_bytes_processed) + + query_stats = statistics["query"] = {} + self.assertIsNone(job.estimated_bytes_processed) + + query_stats["estimatedBytesProcessed"] = str(est_bytes) + self.assertEqual(job.estimated_bytes_processed, est_bytes) + + def test_dml_stats(self): + from google.cloud.bigquery.job.query import DmlStats + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, self.QUERY, client) + assert job.dml_stats is None + + statistics = job._properties["statistics"] = {} + assert job.dml_stats is None + + query_stats = statistics["query"] = {} + assert job.dml_stats is None + + query_stats["dmlStats"] = {"insertedRowCount": "35"} + assert isinstance(job.dml_stats, DmlStats) + assert job.dml_stats.inserted_row_count == 35 + + def test_result(self): + from google.cloud.bigquery.table import RowIterator + + query_resource = { + "jobComplete": False, + "jobReference": { + "projectId": self.PROJECT, + "jobId": self.JOB_ID, + "location": "EU", + }, + } + query_resource_done = { + "jobComplete": True, + "jobReference": { + "projectId": self.PROJECT, + "jobId": self.JOB_ID, + "location": "EU", + }, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + 
"totalRows": "2", + } + job_resource = self._make_resource(started=True, location="EU") + job_resource_done = self._make_resource(started=True, ended=True, location="EU") + job_resource_done["configuration"]["query"]["destinationTable"] = { + "projectId": "dest-project", + "datasetId": "dest_dataset", + "tableId": "dest_table", + } + query_page_resource = { + # Explicitly set totalRows to be different from the initial + # response to test update during iteration. + "totalRows": "1", + "pageToken": None, + "rows": [{"f": [{"v": "abc"}]}], + } + conn = _make_connection( + query_resource, query_resource_done, job_resource_done, query_page_resource + ) + client = _make_client(self.PROJECT, connection=conn) + job = self._get_target_class().from_api_repr(job_resource, client) + + result = job.result() + + self.assertIsInstance(result, RowIterator) + self.assertEqual(result.total_rows, 2) + rows = list(result) + self.assertEqual(len(rows), 1) + self.assertEqual(rows[0].col1, "abc") + # Test that the total_rows property has changed during iteration, based + # on the response from tabledata.list. 
+ self.assertEqual(result.total_rows, 1) + + query_results_path = f"/projects/{self.PROJECT}/queries/{self.JOB_ID}" + query_results_call = mock.call( + method="GET", + path=query_results_path, + query_params={"maxResults": 0, "location": "EU"}, + timeout=None, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", + query_params={"location": "EU"}, + timeout=None, + ) + query_page_call = mock.call( + method="GET", + path=query_results_path, + query_params={ + "fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS, + "location": "EU", + "formatOptions.useInt64Timestamp": True, + }, + timeout=None, + ) + conn.api_request.assert_has_calls( + [query_results_call, query_results_call, reload_call, query_page_call] + ) + + def test_result_with_done_job_calls_get_query_results(self): + query_resource_done = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "1", + } + job_resource = self._make_resource(started=True, ended=True, location="EU") + job_resource["configuration"]["query"]["destinationTable"] = { + "projectId": "dest-project", + "datasetId": "dest_dataset", + "tableId": "dest_table", + } + results_page_resource = { + "totalRows": "1", + "pageToken": None, + "rows": [{"f": [{"v": "abc"}]}], + } + conn = _make_connection(query_resource_done, results_page_resource) + client = _make_client(self.PROJECT, connection=conn) + job = self._get_target_class().from_api_repr(job_resource, client) + + result = job.result() + + rows = list(result) + self.assertEqual(len(rows), 1) + self.assertEqual(rows[0].col1, "abc") + + query_results_path = f"/projects/{self.PROJECT}/queries/{self.JOB_ID}" + query_results_call = mock.call( + method="GET", + path=query_results_path, + query_params={"maxResults": 0, "location": "EU"}, + timeout=None, + ) + query_results_page_call = mock.call( + method="GET", + path=query_results_path, + 
query_params={ + "fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS, + "location": "EU", + "formatOptions.useInt64Timestamp": True, + }, + timeout=None, + ) + conn.api_request.assert_has_calls([query_results_call, query_results_page_call]) + + def test_result_with_max_results(self): + from google.cloud.bigquery.table import RowIterator + + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "5", + } + query_page_resource = { + "totalRows": "5", + "pageToken": None, + "rows": [ + {"f": [{"v": "abc"}]}, + {"f": [{"v": "def"}]}, + {"f": [{"v": "ghi"}]}, + ], + } + connection = _make_connection(query_resource, query_page_resource) + client = _make_client(self.PROJECT, connection=connection) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + + max_results = 3 + + result = job.result(max_results=max_results) + + self.assertIsInstance(result, RowIterator) + self.assertEqual(result.total_rows, 5) + + rows = list(result) + + self.assertEqual(len(rows), 3) + self.assertEqual(len(connection.api_request.call_args_list), 2) + query_page_request = connection.api_request.call_args_list[1] + self.assertEqual( + query_page_request[1]["query_params"]["maxResults"], max_results + ) + + def test_result_w_retry(self): + from google.cloud.bigquery.table import RowIterator + + query_resource = { + "jobComplete": False, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + } + query_resource_done = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "2", + } + job_resource = self._make_resource(started=True, location="asia-northeast1") + job_resource_done = self._make_resource( + started=True, ended=True, location="asia-northeast1" + ) + 
job_resource_done["configuration"]["query"]["destinationTable"] = { + "projectId": "dest-project", + "datasetId": "dest_dataset", + "tableId": "dest_table", + } + + connection = _make_connection( + exceptions.NotFound("not normally retriable"), + query_resource, + exceptions.NotFound("not normally retriable"), + query_resource_done, + exceptions.NotFound("not normally retriable"), + job_resource_done, + ) + client = _make_client(self.PROJECT, connection=connection) + job = self._get_target_class().from_api_repr(job_resource, client) + + custom_predicate = mock.Mock() + custom_predicate.return_value = True + custom_retry = google.api_core.retry.Retry( + initial=0.001, + maximum=0.001, + multiplier=1.0, + deadline=0.1, + predicate=custom_predicate, + ) + + self.assertIsInstance(job.result(retry=custom_retry), RowIterator) + query_results_call = mock.call( + method="GET", + path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", + query_params={"maxResults": 0, "location": "asia-northeast1"}, + timeout=None, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", + query_params={"location": "asia-northeast1"}, + timeout=None, + ) + + connection.api_request.assert_has_calls( + [query_results_call, query_results_call, reload_call] + ) + + def test_result_w_empty_schema(self): + from google.cloud.bigquery.table import _EmptyRowIterator + + # Destination table may have no schema for some DDL and DML queries. 
+ query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": []}, + } + connection = _make_connection(query_resource, query_resource) + client = _make_client(self.PROJECT, connection=connection) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + + result = job.result() + + self.assertIsInstance(result, _EmptyRowIterator) + self.assertEqual(list(result), []) + + def test_result_invokes_begins(self): + begun_resource = self._make_resource() + incomplete_resource = { + "jobComplete": False, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + } + query_resource = copy.deepcopy(incomplete_resource) + query_resource["jobComplete"] = True + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection( + begun_resource, + incomplete_resource, + query_resource, + done_resource, + query_resource, + ) + client = _make_client(project=self.PROJECT, connection=connection) + job = self._make_one(self.JOB_ID, self.QUERY, client) + + job.result() + + self.assertEqual(len(connection.api_request.call_args_list), 4) + begin_request = connection.api_request.call_args_list[0] + query_request = connection.api_request.call_args_list[2] + reload_request = connection.api_request.call_args_list[3] + self.assertEqual(begin_request[1]["method"], "POST") + self.assertEqual(query_request[1]["method"], "GET") + self.assertEqual(reload_request[1]["method"], "GET") + + def test_result_w_timeout(self): + import google.cloud.bigquery.client + + begun_resource = self._make_resource() + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + } + done_resource = copy.deepcopy(begun_resource) + 
done_resource["status"] = {"state": "DONE"} + connection = _make_connection(begun_resource, query_resource, done_resource) + client = _make_client(project=self.PROJECT, connection=connection) + job = self._make_one(self.JOB_ID, self.QUERY, client) + + with freezegun.freeze_time("1970-01-01 00:00:00", tick=False): + job.result(timeout=1.0) + + self.assertEqual(len(connection.api_request.call_args_list), 3) + begin_request = connection.api_request.call_args_list[0] + query_request = connection.api_request.call_args_list[1] + reload_request = connection.api_request.call_args_list[2] + self.assertEqual(begin_request[1]["method"], "POST") + self.assertEqual(query_request[1]["method"], "GET") + self.assertEqual( + query_request[1]["path"], + "/projects/{}/queries/{}".format(self.PROJECT, self.JOB_ID), + ) + self.assertEqual(query_request[1]["query_params"]["timeoutMs"], 900) + self.assertEqual( + query_request[1]["timeout"], + google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT, + ) + self.assertEqual(reload_request[1]["method"], "GET") + + def test_result_w_page_size(self): + # Arrange + query_results_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "4", + } + job_resource = self._make_resource(started=True, ended=True, location="US") + q_config = job_resource["configuration"]["query"] + q_config["destinationTable"] = { + "projectId": self.PROJECT, + "datasetId": self.DS_ID, + "tableId": self.TABLE_ID, + } + query_page_resource = { + "totalRows": 4, + "pageToken": "some-page-token", + "rows": [ + {"f": [{"v": "row1"}]}, + {"f": [{"v": "row2"}]}, + {"f": [{"v": "row3"}]}, + ], + } + query_page_resource_2 = {"totalRows": 4, "rows": [{"f": [{"v": "row4"}]}]} + conn = _make_connection( + query_results_resource, query_page_resource, query_page_resource_2 + ) + client = _make_client(self.PROJECT, connection=conn) + job = 
self._get_target_class().from_api_repr(job_resource, client) + + # Act + result = job.result(page_size=3) + + # Assert + actual_rows = list(result) + self.assertEqual(len(actual_rows), 4) + + query_results_path = f"/projects/{self.PROJECT}/queries/{self.JOB_ID}" + query_page_1_call = mock.call( + method="GET", + path=query_results_path, + query_params={ + "maxResults": 3, + "fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS, + "location": "US", + "formatOptions.useInt64Timestamp": True, + }, + timeout=None, + ) + query_page_2_call = mock.call( + method="GET", + path=query_results_path, + query_params={ + "pageToken": "some-page-token", + "maxResults": 3, + "fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS, + "location": "US", + "formatOptions.useInt64Timestamp": True, + }, + timeout=None, + ) + conn.api_request.assert_has_calls([query_page_1_call, query_page_2_call]) + + def test_result_with_start_index(self): + from google.cloud.bigquery.table import RowIterator + + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "5", + } + tabledata_resource = { + "totalRows": "5", + "pageToken": None, + "rows": [ + {"f": [{"v": "abc"}]}, + {"f": [{"v": "def"}]}, + {"f": [{"v": "ghi"}]}, + {"f": [{"v": "jkl"}]}, + ], + } + connection = _make_connection(query_resource, tabledata_resource) + client = _make_client(self.PROJECT, connection=connection) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + + start_index = 1 + + result = job.result(start_index=start_index) + + self.assertIsInstance(result, RowIterator) + self.assertEqual(result.total_rows, 5) + + rows = list(result) + + self.assertEqual(len(rows), 4) + self.assertEqual(len(connection.api_request.call_args_list), 2) + tabledata_list_request = connection.api_request.call_args_list[1] + self.assertEqual( + 
tabledata_list_request[1]["query_params"]["startIndex"], start_index + ) + + def test_result_error(self): + from google.cloud import exceptions + + query = textwrap.dedent( + """ + SELECT foo, bar + FROM table_baz + WHERE foo == bar""" + ) + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, query, client) + error_result = { + "debugInfo": "DEBUG", + "location": "LOCATION", + "message": "MESSAGE", + "reason": "invalid", + } + job._properties["status"] = { + "errorResult": error_result, + "errors": [error_result], + "state": "DONE", + } + job._query_results = google.cloud.bigquery.query._QueryResults.from_api_repr( + {"jobComplete": True, "jobReference": job._properties["jobReference"]} + ) + job._set_future_result() + + with self.assertRaises(exceptions.GoogleCloudError) as exc_info: + job.result() + + self.assertIsInstance(exc_info.exception, exceptions.GoogleCloudError) + self.assertEqual(exc_info.exception.code, http.client.BAD_REQUEST) + + exc_job_instance = getattr(exc_info.exception, "query_job", None) + self.assertIs(exc_job_instance, job) + + full_text = str(exc_info.exception) + assert job.job_id in full_text + assert "Query Job SQL Follows" in full_text + + for i, line in enumerate(query.splitlines(), start=1): + expected_line = "{}:{}".format(i, line) + assert expected_line in full_text + + def test_result_transport_timeout_error(self): + query = textwrap.dedent( + """ + SELECT foo, bar + FROM table_baz + WHERE foo == bar""" + ) + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, query, client) + call_api_patch = mock.patch( + "google.cloud.bigquery.client.Client._call_api", + autospec=True, + side_effect=requests.exceptions.Timeout("Server response took too long."), + ) + + # Make sure that timeout errors get rebranded to concurrent futures timeout. 
+ with call_api_patch, self.assertRaises(concurrent.futures.TimeoutError): + job.result(timeout=1) + + def test__begin_error(self): + from google.cloud import exceptions + + query = textwrap.dedent( + """ + SELECT foo, bar + FROM table_baz + WHERE foo == bar""" + ) + + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, query, client) + call_api_patch = mock.patch( + "google.cloud.bigquery.client.Client._call_api", + autospec=True, + side_effect=exceptions.BadRequest("Syntax error in SQL query"), + ) + + with call_api_patch, self.assertRaises(exceptions.GoogleCloudError) as exc_info: + job.result() + + self.assertIsInstance(exc_info.exception, exceptions.GoogleCloudError) + self.assertEqual(exc_info.exception.code, http.client.BAD_REQUEST) + + exc_job_instance = getattr(exc_info.exception, "query_job", None) + self.assertIs(exc_job_instance, job) + + full_text = str(exc_info.exception) + assert job.job_id in full_text + assert "Query Job SQL Follows" in full_text + + for i, line in enumerate(query.splitlines(), start=1): + expected_line = "{}:{}".format(i, line) + assert expected_line in full_text + + def test__begin_w_timeout(self): + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource() + + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + job = self._make_one(self.JOB_ID, self.QUERY, client) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin(timeout=7.5) + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="POST", + path=PATH, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "query": {"query": self.QUERY, "useLegacySql": False} + }, + }, + timeout=7.5, + ) + + def test_begin_w_bound_client(self): + from google.cloud.bigquery.dataset import DatasetReference + 
def test_begin_w_alternate_client(self):
    # ``_begin(client=...)`` must issue the insert request through the client
    # passed to the call; the client bound at construction stays untouched.
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.job import CreateDisposition
    from google.cloud.bigquery.job import QueryJobConfig
    from google.cloud.bigquery.job import QueryPriority
    from google.cloud.bigquery.job import SchemaUpdateOption
    from google.cloud.bigquery.job import WriteDisposition

    jobs_path = "/projects/%s/jobs" % (self.PROJECT,)
    table_id = "TABLE"
    dataset_id = "DATASET"

    # The same dict object is used for the fake server response and for the
    # expected request payload.
    query_resource = {
        "query": self.QUERY,
        "allowLargeResults": True,
        "createDisposition": CreateDisposition.CREATE_NEVER,
        "defaultDataset": {"projectId": self.PROJECT, "datasetId": dataset_id},
        "destinationTable": {
            "projectId": self.PROJECT,
            "datasetId": dataset_id,
            "tableId": table_id,
        },
        "flattenResults": True,
        "priority": QueryPriority.INTERACTIVE,
        "useQueryCache": True,
        "useLegacySql": True,
        "writeDisposition": WriteDisposition.WRITE_TRUNCATE,
        "maximumBillingTier": 4,
        "maximumBytesBilled": "123456",
        "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_RELAXATION],
    }
    job_resource = self._make_resource(ended=True)
    job_resource["configuration"]["query"] = query_resource
    job_resource["configuration"]["dryRun"] = True

    bound_conn = _make_connection()
    bound_client = _make_client(project=self.PROJECT, connection=bound_conn)
    alternate_conn = _make_connection(job_resource)
    alternate_client = _make_client(project=self.PROJECT, connection=alternate_conn)

    dataset_ref = DatasetReference(self.PROJECT, dataset_id)
    table_ref = dataset_ref.table(table_id)

    # Populate the job config attribute by attribute, in a fixed order.
    config = QueryJobConfig()
    settings = (
        ("allow_large_results", True),
        ("create_disposition", CreateDisposition.CREATE_NEVER),
        ("default_dataset", dataset_ref),
        ("destination", table_ref),
        ("dry_run", True),
        ("flatten_results", True),
        ("maximum_billing_tier", 4),
        ("priority", QueryPriority.INTERACTIVE),
        ("use_legacy_sql", True),
        ("use_query_cache", True),
        ("write_disposition", WriteDisposition.WRITE_TRUNCATE),
        ("maximum_bytes_billed", 123456),
        ("schema_update_options", [SchemaUpdateOption.ALLOW_FIELD_RELAXATION]),
    )
    for attribute, value in settings:
        setattr(config, attribute, value)

    job = self._make_one(self.JOB_ID, self.QUERY, bound_client, job_config=config)
    tracing_patch = mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    )
    with tracing_patch as span_attributes:
        job._begin(client=alternate_client)

    span_attributes.assert_called_with({"path": jobs_path}, alternate_client, job)

    bound_conn.api_request.assert_not_called()
    expected_payload = {
        "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
        "configuration": {"dryRun": True, "query": query_resource},
    }
    alternate_conn.api_request.assert_called_once_with(
        method="POST", path=jobs_path, data=expected_payload, timeout=None
    )
    self._verifyResourceProperties(job, job_resource)
def test_begin_w_positional_query_parameter(self):
    # A positional scalar parameter set on the job config must be sent as
    # ``parameterMode: POSITIONAL`` with an unnamed ``queryParameters`` entry.
    from google.cloud.bigquery.job import QueryJobConfig
    from google.cloud.bigquery.query import ScalarQueryParameter

    parameters = [ScalarQueryParameter.positional("INT64", 123)]
    jobs_path = "/projects/%s/jobs" % (self.PROJECT,)

    job_resource = self._make_resource()
    # Ensure None for missing server-set props
    job_resource["statistics"].pop("creationTime")
    for server_set_key in ("etag", "selfLink", "user_email"):
        job_resource.pop(server_set_key)

    query_resource = job_resource["configuration"]["query"]
    query_resource["parameterMode"] = "POSITIONAL"
    query_resource["queryParameters"] = [
        {"parameterType": {"type": "INT64"}, "parameterValue": {"value": "123"}}
    ]

    connection = _make_connection(job_resource)
    bq_client = _make_client(project=self.PROJECT, connection=connection)
    job_config = QueryJobConfig()
    job_config.query_parameters = parameters
    job = self._make_one(self.JOB_ID, self.QUERY, bq_client, job_config=job_config)

    tracing_patch = mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    )
    with tracing_patch as span_attributes:
        job._begin()

    span_attributes.assert_called_with({"path": jobs_path}, bq_client, job)
    self.assertEqual(job.query_parameters, parameters)

    expected_query = {
        "query": self.QUERY,
        "useLegacySql": False,
        "parameterMode": "POSITIONAL",
        "queryParameters": query_resource["queryParameters"],
    }
    connection.api_request.assert_called_once_with(
        method="POST",
        path=jobs_path,
        data={
            "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
            "configuration": {"query": expected_query},
        },
        timeout=None,
    )
    self._verifyResourceProperties(job, job_resource)
"fn"}]} + ], + }, + } + CSV_CONFIG_RESOURCE = { + "sourceFormat": "CSV", + "maxBadRecords": 8, + "csvOptions": {"allowJaggedRows": True}, + } + csv_config = ExternalConfig("CSV") + csv_config.max_bad_records = 8 + csv_config.options.allow_jagged_rows = True + bt_table = "bigtable-table" + csv_table = "csv-table" + RESOURCE["configuration"]["query"]["tableDefinitions"] = { + bt_table: BT_CONFIG_RESOURCE, + csv_table: CSV_CONFIG_RESOURCE, + } + want_resource = copy.deepcopy(RESOURCE) + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + config = QueryJobConfig() + config.table_definitions = {bt_table: bt_config, csv_table: csv_config} + config.use_legacy_sql = True + job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + job._begin() + + final_attributes.assert_called_with({"path": PATH}, client, job) + + conn.api_request.assert_called_once_with( + method="POST", + path=PATH, + data={ + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "configuration": { + "query": { + "query": self.QUERY, + "useLegacySql": True, + "tableDefinitions": { + bt_table: BT_CONFIG_RESOURCE, + csv_table: CSV_CONFIG_RESOURCE, + }, + } + }, + }, + timeout=None, + ) + self._verifyResourceProperties(job, want_resource) + + def test_dry_run_query(self): + from google.cloud.bigquery.job import QueryJobConfig + + PATH = "/projects/%s/jobs" % (self.PROJECT,) + RESOURCE = self._make_resource() + # Ensure None for missing server-set props + del RESOURCE["statistics"]["creationTime"] + del RESOURCE["etag"] + del RESOURCE["selfLink"] + del RESOURCE["user_email"] + RESOURCE["configuration"]["dryRun"] = True + conn = _make_connection(RESOURCE) + client = _make_client(project=self.PROJECT, connection=conn) + config = QueryJobConfig() + config.dry_run = True + job = self._make_one(self.JOB_ID, self.QUERY, 
def test_exists_miss_w_bound_client(self):
    # ``exists()`` must report False here and must have issued a single GET
    # restricted to the ``id`` field through the job's own client.
    # NOTE(review): presumably a connection built without canned responses
    # reports the job as missing -- confirm against ``_make_connection``.
    job_path = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)
    connection = _make_connection()
    bq_client = _make_client(project=self.PROJECT, connection=connection)
    job = self._make_one(self.JOB_ID, self.QUERY, bq_client)

    tracing_patch = mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    )
    with tracing_patch as span_attributes:
        found = job.exists()
    self.assertFalse(found)

    span_attributes.assert_called_with({"path": job_path}, bq_client, job)
    connection.api_request.assert_called_once_with(
        method="GET", path=job_path, query_params={"fields": "id"}, timeout=None
    )
def test_reload_w_alternate_client(self):
    # ``reload(client=...)`` must fetch the job through the client passed to
    # the call and leave the client bound at construction unused.
    job_path = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)
    dataset_id = "DATASET"
    destination_table_id = "dest_table"

    job_resource = self._make_resource()
    job_resource["configuration"]["query"]["destinationTable"] = {
        "projectId": self.PROJECT,
        "datasetId": dataset_id,
        "tableId": destination_table_id,
    }

    bound_conn = _make_connection()
    bound_client = _make_client(project=self.PROJECT, connection=bound_conn)
    alternate_conn = _make_connection(job_resource)
    alternate_client = _make_client(project=self.PROJECT, connection=alternate_conn)
    job = self._make_one(self.JOB_ID, self.QUERY, bound_client)

    tracing_patch = mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    )
    with tracing_patch as span_attributes:
        job.reload(client=alternate_client)

    span_attributes.assert_called_with({"path": job_path}, alternate_client, job)

    bound_conn.api_request.assert_not_called()
    alternate_conn.api_request.assert_called_once_with(
        method="GET", path=job_path, query_params={}, timeout=None
    )
    self._verifyResourceProperties(job, job_resource)
class TestQueryJobConfig(_Base):
    """Unit tests for ``google.cloud.bigquery.job.QueryJobConfig``.

    The tests cover construction, the typed property setters and getters,
    and round-tripping through ``to_api_repr`` / ``from_api_repr``.
    """

    @staticmethod
    def _get_target_class():
        # Imported lazily so the class under test is resolved at call time.
        from google.cloud.bigquery.job import QueryJobConfig

        return QueryJobConfig

    def _make_one(self, *args, **kw):
        # Build an instance of the class under test with the given arguments.
        return self._get_target_class()(*args, **kw)

    def test_ctor(self):
        config = self._make_one()
        self.assertEqual(config._properties, {"query": {}})

    def test_ctor_w_none(self):
        config = self._make_one()
        config.default_dataset = None
        config.destination = None
        self.assertIsNone(config.default_dataset)
        self.assertIsNone(config.destination)

    def test_ctor_w_properties(self):
        config = self._get_target_class()(use_query_cache=False, use_legacy_sql=True)

        self.assertFalse(config.use_query_cache)
        self.assertTrue(config.use_legacy_sql)

    def test_ctor_w_string_default_dataset(self):
        from google.cloud.bigquery import dataset

        default_dataset = "default-proj.default_dset"
        config = self._get_target_class()(default_dataset=default_dataset)
        expected = dataset.DatasetReference.from_string(default_dataset)
        self.assertEqual(config.default_dataset, expected)

    def test_ctor_w_string_destinaton(self):
        from google.cloud.bigquery import table

        destination = "dest-proj.dest_dset.dest_tbl"
        config = self._get_target_class()(destination=destination)
        expected = table.TableReference.from_string(destination)
        self.assertEqual(config.destination, expected)

    def test_default_dataset_w_string(self):
        from google.cloud.bigquery import dataset

        default_dataset = "default-proj.default_dset"
        config = self._make_one()
        config.default_dataset = default_dataset
        expected = dataset.DatasetReference.from_string(default_dataset)
        self.assertEqual(config.default_dataset, expected)

    def test_default_dataset_w_dataset(self):
        from google.cloud.bigquery import dataset

        default_dataset = "default-proj.default_dset"
        expected = dataset.DatasetReference.from_string(default_dataset)
        config = self._make_one()
        config.default_dataset = dataset.Dataset(expected)
        self.assertEqual(config.default_dataset, expected)

    def test_destinaton_w_string(self):
        from google.cloud.bigquery import table

        destination = "dest-proj.dest_dset.dest_tbl"
        config = self._make_one()
        config.destination = destination
        expected = table.TableReference.from_string(destination)
        self.assertEqual(config.destination, expected)

    def test_range_partitioning_w_none(self):
        object_under_test = self._get_target_class()()
        assert object_under_test.range_partitioning is None

    def test_range_partitioning_w_value(self):
        object_under_test = self._get_target_class()()
        object_under_test._properties["query"]["rangePartitioning"] = {
            "field": "column_one",
            "range": {"start": 1, "end": 1000, "interval": 10},
        }
        # These comparisons were previously bare expressions whose result was
        # discarded, so the test could never fail; they are asserted now.
        assert object_under_test.range_partitioning.field == "column_one"
        assert object_under_test.range_partitioning.range_.start == 1
        assert object_under_test.range_partitioning.range_.end == 1000
        assert object_under_test.range_partitioning.range_.interval == 10

    def test_range_partitioning_setter(self):
        from google.cloud.bigquery.table import PartitionRange
        from google.cloud.bigquery.table import RangePartitioning

        object_under_test = self._get_target_class()()
        object_under_test.range_partitioning = RangePartitioning(
            field="column_one", range_=PartitionRange(start=1, end=1000, interval=10)
        )
        # Previously un-asserted comparisons; see test_range_partitioning_w_value.
        assert object_under_test.range_partitioning.field == "column_one"
        assert object_under_test.range_partitioning.range_.start == 1
        assert object_under_test.range_partitioning.range_.end == 1000
        assert object_under_test.range_partitioning.range_.interval == 10

    def test_range_partitioning_setter_w_none(self):
        object_under_test = self._get_target_class()()
        object_under_test.range_partitioning = None
        assert object_under_test.range_partitioning is None

    def test_range_partitioning_setter_w_wrong_type(self):
        object_under_test = self._get_target_class()()
        with pytest.raises(ValueError, match="RangePartitioning"):
            object_under_test.range_partitioning = object()

    def test_time_partitioning(self):
        from google.cloud.bigquery import table

        time_partitioning = table.TimePartitioning(
            type_=table.TimePartitioningType.DAY, field="name"
        )
        config = self._make_one()
        config.time_partitioning = time_partitioning
        # TimePartitioning should be configurable after assigning
        time_partitioning.expiration_ms = 10000

        self.assertEqual(config.time_partitioning.type_, table.TimePartitioningType.DAY)
        self.assertEqual(config.time_partitioning.field, "name")
        self.assertEqual(config.time_partitioning.expiration_ms, 10000)

        config.time_partitioning = None
        self.assertIsNone(config.time_partitioning)

    def test_clustering_fields(self):
        fields = ["email", "postal_code"]
        config = self._get_target_class()()
        config.clustering_fields = fields
        self.assertEqual(config.clustering_fields, fields)

        config.clustering_fields = None
        self.assertIsNone(config.clustering_fields)

    def test_from_api_repr_empty(self):
        klass = self._get_target_class()
        config = klass.from_api_repr({})
        self.assertIsNone(config.dry_run)
        self.assertIsNone(config.use_legacy_sql)
        self.assertIsNone(config.default_dataset)
        self.assertIsNone(config.destination)
        self.assertIsNone(config.destination_encryption_configuration)

    def test_from_api_repr_normal(self):
        from google.cloud.bigquery.dataset import DatasetReference

        resource = {
            "query": {
                "useLegacySql": True,
                "query": "no property for me",
                "defaultDataset": {
                    "projectId": "someproject",
                    "datasetId": "somedataset",
                },
                "someNewProperty": "I should be saved, too.",
            },
            "dryRun": True,
        }
        klass = self._get_target_class()

        config = klass.from_api_repr(resource)

        self.assertTrue(config.use_legacy_sql)
        self.assertEqual(
            config.default_dataset, DatasetReference("someproject", "somedataset")
        )
        self.assertTrue(config.dry_run)
        # Make sure unknown properties propagate.
        self.assertEqual(config._properties["query"]["query"], "no property for me")
        self.assertEqual(
            config._properties["query"]["someNewProperty"], "I should be saved, too."
        )

    def test_to_api_repr_normal(self):
        from google.cloud.bigquery.dataset import DatasetReference

        config = self._make_one()
        config.use_legacy_sql = True
        config.default_dataset = DatasetReference("someproject", "somedataset")
        config.dry_run = False
        config._properties["someNewProperty"] = "Woohoo, alpha stuff."

        resource = config.to_api_repr()

        self.assertFalse(resource["dryRun"])
        self.assertTrue(resource["query"]["useLegacySql"])
        self.assertEqual(
            resource["query"]["defaultDataset"]["projectId"], "someproject"
        )
        self.assertEqual(
            resource["query"]["defaultDataset"]["datasetId"], "somedataset"
        )
        # Make sure unknown properties propagate.
        self.assertEqual(resource["someNewProperty"], "Woohoo, alpha stuff.")

    def test_to_api_repr_with_encryption(self):
        from google.cloud.bigquery.encryption_configuration import (
            EncryptionConfiguration,
        )

        config = self._make_one()
        config.destination_encryption_configuration = EncryptionConfiguration(
            kms_key_name=self.KMS_KEY_NAME
        )
        resource = config.to_api_repr()
        self.assertEqual(
            resource,
            {
                "query": {
                    "destinationEncryptionConfiguration": {
                        "kmsKeyName": self.KMS_KEY_NAME
                    }
                }
            },
        )

    def test_to_api_repr_with_encryption_none(self):
        config = self._make_one()
        config.destination_encryption_configuration = None
        resource = config.to_api_repr()
        self.assertEqual(
            resource, {"query": {"destinationEncryptionConfiguration": None}}
        )

    def test_from_api_repr_with_encryption(self):
        resource = {
            "query": {
                "destinationEncryptionConfiguration": {"kmsKeyName": self.KMS_KEY_NAME}
            }
        }
        klass = self._get_target_class()
        config = klass.from_api_repr(resource)
        self.assertEqual(
            config.destination_encryption_configuration.kms_key_name, self.KMS_KEY_NAME
        )

    def test_to_api_repr_with_script_options_none(self):
        config = self._make_one()
        config.script_options = None

        resource = config.to_api_repr()

        self.assertEqual(resource, {"query": {"scriptOptions": None}})
        self.assertIsNone(config.script_options)

    def test_to_api_repr_with_script_options(self):
        from google.cloud.bigquery import KeyResultStatementKind
        from google.cloud.bigquery import ScriptOptions

        config = self._make_one()
        config.script_options = ScriptOptions(
            statement_timeout_ms=60,
            statement_byte_budget=999,
            key_result_statement=KeyResultStatementKind.FIRST_SELECT,
        )

        resource = config.to_api_repr()

        # The integer options are expected as strings in the API resource.
        expected_script_options_repr = {
            "statementTimeoutMs": "60",
            "statementByteBudget": "999",
            "keyResultStatement": KeyResultStatementKind.FIRST_SELECT,
        }
        self.assertEqual(
            resource, {"query": {"scriptOptions": expected_script_options_repr}}
        )

    def test_from_api_repr_with_script_options(self):
        from google.cloud.bigquery import KeyResultStatementKind
        from google.cloud.bigquery import ScriptOptions

        resource = {
            "query": {
                "scriptOptions": {
                    "statementTimeoutMs": "42",
                    "statementByteBudget": "123",
                    "keyResultStatement": KeyResultStatementKind.LAST,
                },
            },
        }
        klass = self._get_target_class()

        config = klass.from_api_repr(resource)

        script_options = config.script_options
        self.assertIsInstance(script_options, ScriptOptions)
        # String values in the resource are expected back as integers.
        self.assertEqual(script_options.statement_timeout_ms, 42)
        self.assertEqual(script_options.statement_byte_budget, 123)
        self.assertEqual(
            script_options.key_result_statement, KeyResultStatementKind.LAST
        )
@pytest.fixture
def table_read_options_kwarg():
    # Provide the extra ``read_options`` keyword argument for an expected
    # ReadSession: LZ4_FRAME Arrow buffer compression when the
    # ``bigquery_storage`` object exposes ``ArrowSerializationOptions``,
    # otherwise an empty dict so no extra argument is passed.
    if hasattr(bigquery_storage, "ArrowSerializationOptions"):
        codec = bigquery_storage.ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
        serialization_options = bigquery_storage.ArrowSerializationOptions(
            buffer_compression=codec
        )
        return {
            "read_options": bigquery_storage.ReadSession.TableReadOptions(
                arrow_serialization_options=serialization_options
            )
        }
    return {}
_make_job_resource( + project_id="test-project", job_type="query", ended=True + ) + job_resource["configuration"]["query"]["query"] = query + job_resource["status"] = {"state": "DONE"} + get_query_results_resource = { + "jobComplete": True, + "jobReference": {"projectId": "test-project", "jobId": "test-job"}, + "schema": { + "fields": [ + {"name": "name", "type": "STRING", "mode": "NULLABLE"}, + {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, + ] + }, + "totalRows": "4", + } + connection = _make_connection(get_query_results_resource, job_resource) + client = _make_client(connection=connection) + job = target_class.from_api_repr(job_resource, client) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() + session.avro_schema.schema = json.dumps( + { + "type": "record", + "name": "__root__", + "fields": [ + {"name": "name", "type": ["null", "string"]}, + {"name": "age", "type": ["null", "long"]}, + ], + } + ) + bqstorage_client.create_read_session.return_value = session + + job.to_dataframe(bqstorage_client=bqstorage_client) + + destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format( + **job_resource["configuration"]["query"]["destinationTable"] + ) + expected_session = bigquery_storage.ReadSession( + table=destination_table, + data_format=bigquery_storage.DataFormat.ARROW, + **table_read_options_kwarg, + ) + bqstorage_client.create_read_session.assert_called_once_with( + parent="projects/test-project", + read_session=expected_session, + max_stream_count=1, # Use a single stream to preserve row order. 
+ ) + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +def test_to_arrow(): + from google.cloud.bigquery.job import QueryJob as target_class + + begun_resource = _make_job_resource(job_type="query") + query_resource = { + "jobComplete": True, + "jobReference": begun_resource["jobReference"], + "totalRows": "4", + "schema": { + "fields": [ + { + "name": "spouse_1", + "type": "RECORD", + "fields": [ + {"name": "name", "type": "STRING", "mode": "NULLABLE"}, + {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, + ], + }, + { + "name": "spouse_2", + "type": "RECORD", + "fields": [ + {"name": "name", "type": "STRING", "mode": "NULLABLE"}, + {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, + ], + }, + ] + }, + } + tabledata_resource = { + "rows": [ + { + "f": [ + {"v": {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}}, + {"v": {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}}, + ] + }, + { + "f": [ + {"v": {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}}, + {"v": {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}}, + ] + }, + ] + } + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection( + begun_resource, query_resource, done_resource, tabledata_resource + ) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + tbl = job.to_arrow(create_bqstorage_client=False) + + assert isinstance(tbl, pyarrow.Table) + assert tbl.num_rows == 2 + + # Check the schema. 
+ assert tbl.schema[0].name == "spouse_1" + assert tbl.schema[0].type[0].name == "name" + assert tbl.schema[0].type[1].name == "age" + assert pyarrow.types.is_struct(tbl.schema[0].type) + assert pyarrow.types.is_string(tbl.schema[0].type[0].type) + assert pyarrow.types.is_int64(tbl.schema[0].type[1].type) + assert tbl.schema[1].name == "spouse_2" + assert tbl.schema[1].type[0].name == "name" + assert tbl.schema[1].type[1].name == "age" + assert pyarrow.types.is_struct(tbl.schema[1].type) + assert pyarrow.types.is_string(tbl.schema[1].type[0].type) + assert pyarrow.types.is_int64(tbl.schema[1].type[1].type) + + # Check the data. + tbl_data = tbl.to_pydict() + spouse_1 = tbl_data["spouse_1"] + assert spouse_1 == [ + {"name": "Phred Phlyntstone", "age": 32}, + {"name": "Bhettye Rhubble", "age": 27}, + ] + spouse_2 = tbl_data["spouse_2"] + assert spouse_2 == [ + {"name": "Wylma Phlyntstone", "age": 29}, + {"name": "Bharney Rhubble", "age": 33}, + ] + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +def test_to_arrow_max_results_no_progress_bar(): + from google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + + connection = _make_connection({}) + client = _make_client(connection=connection) + begun_resource = _make_job_resource(job_type="query") + job = target_class.from_api_repr(begun_resource, client) + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + rows = [ + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", return_value=row_iterator, + ) + with result_patch as result_patch_tqdm: + tbl = 
job.to_arrow(create_bqstorage_client=False, max_results=123) + + result_patch_tqdm.assert_called_once_with(max_results=123) + + assert isinstance(tbl, pyarrow.Table) + assert tbl.num_rows == 2 + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +def test_to_arrow_w_tqdm_w_query_plan(): + from google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL + + begun_resource = _make_job_resource(job_type="query") + rows = [ + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, + ] + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + connection = _make_connection({}) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + + job._properties["statistics"] = { + "query": { + "queryPlan": [ + {"name": "S00: Input", "id": "0", "status": "COMPLETE"}, + {"name": "S01: Output", "id": "1", "status": "COMPLETE"}, + ] + }, + } + reload_patch = mock.patch( + "google.cloud.bigquery.job._AsyncJob.reload", autospec=True + ) + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", + side_effect=[ + concurrent.futures.TimeoutError, + concurrent.futures.TimeoutError, + row_iterator, + ], + ) + + with result_patch as result_patch_tqdm, reload_patch: + tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False) + + assert result_patch_tqdm.call_count == 3 + assert isinstance(tbl, pyarrow.Table) + assert tbl.num_rows == 2 + result_patch_tqdm.assert_called_with( + timeout=_PROGRESS_BAR_UPDATE_INTERVAL, 
max_results=None + ) + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +def test_to_arrow_w_tqdm_w_pending_status(): + from google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL + + begun_resource = _make_job_resource(job_type="query") + rows = [ + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, + ] + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + connection = _make_connection({}) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + + job._properties["statistics"] = { + "query": { + "queryPlan": [ + {"name": "S00: Input", "id": "0", "status": "PENDING"}, + {"name": "S00: Input", "id": "1", "status": "COMPLETE"}, + ] + }, + } + reload_patch = mock.patch( + "google.cloud.bigquery.job._AsyncJob.reload", autospec=True + ) + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", + side_effect=[concurrent.futures.TimeoutError, row_iterator], + ) + + with result_patch as result_patch_tqdm, reload_patch: + tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False) + + assert result_patch_tqdm.call_count == 2 + assert isinstance(tbl, pyarrow.Table) + assert tbl.num_rows == 2 + result_patch_tqdm.assert_called_with( + timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None + ) + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +def test_to_arrow_w_tqdm_wo_query_plan(): + from 
google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + + begun_resource = _make_job_resource(job_type="query") + rows = [ + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, + ] + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + connection = _make_connection({}) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + + reload_patch = mock.patch( + "google.cloud.bigquery.job._AsyncJob.reload", autospec=True + ) + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", + side_effect=[concurrent.futures.TimeoutError, row_iterator], + ) + + with result_patch as result_patch_tqdm, reload_patch: + tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False) + + assert result_patch_tqdm.call_count == 2 + assert isinstance(tbl, pyarrow.Table) + assert tbl.num_rows == 2 + result_patch_tqdm.assert_called() + + +def _make_job(schema=(), rows=()): + from google.cloud.bigquery.job import QueryJob as target_class + + begun_resource = _make_job_resource(job_type="query") + query_resource = { + "jobComplete": True, + "jobReference": begun_resource["jobReference"], + "totalRows": str(len(rows)), + "schema": { + "fields": [ + dict(name=field[0], type=field[1], mode=field[2]) for field in schema + ] + }, + } + tabledata_resource = {"rows": [{"f": [{"v": v} for v in row]} for row in rows]} + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection( + begun_resource, query_resource, done_resource, tabledata_resource + ) + client = _make_client(connection=connection) + return 
target_class.from_api_repr(begun_resource, client) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_to_dataframe(): + job = _make_job( + (("name", "STRING", "NULLABLE"), ("age", "INTEGER", "NULLABLE")), + ( + ("Phred Phlyntstone", "32"), + ("Bharney Rhubble", "33"), + ("Wylma Phlyntstone", "29"), + ("Bhettye Rhubble", "27"), + ), + ) + df = job.to_dataframe(create_bqstorage_client=False) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 4 # verify the number of rows + assert list(df) == ["name", "age"] # verify the column names + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_to_dataframe_ddl_query(): + from google.cloud.bigquery.job import QueryJob as target_class + + # Destination table may have no schema for some DDL and DML queries. + resource = _make_job_resource(job_type="query", ended=True) + query_resource = { + "jobComplete": True, + "jobReference": resource["jobReference"], + "schema": {"fields": []}, + } + connection = _make_connection(query_resource) + client = _make_client(connection=connection) + job = target_class.from_api_repr(resource, client) + + df = job.to_dataframe() + + assert len(df) == 0 + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) +def test_to_dataframe_bqstorage(table_read_options_kwarg): + from google.cloud.bigquery.job import QueryJob as target_class + + resource = _make_job_resource(job_type="query", ended=True) + query_resource = { + "jobComplete": True, + "jobReference": resource["jobReference"], + "totalRows": "4", + "schema": { + "fields": [ + {"name": "name", "type": "STRING", "mode": "NULLABLE"}, + {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, + ] + }, + } + connection = _make_connection(query_resource) + client = _make_client(connection=connection) + job = target_class.from_api_repr(resource, client) + bqstorage_client = 
mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() + session.avro_schema.schema = json.dumps( + { + "type": "record", + "name": "__root__", + "fields": [ + {"name": "name", "type": ["null", "string"]}, + {"name": "age", "type": ["null", "long"]}, + ], + } + ) + bqstorage_client.create_read_session.return_value = session + + job.to_dataframe(bqstorage_client=bqstorage_client) + + destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format( + **resource["configuration"]["query"]["destinationTable"] + ) + expected_session = bigquery_storage.ReadSession( + table=destination_table, + data_format=bigquery_storage.DataFormat.ARROW, + **table_read_options_kwarg, + ) + bqstorage_client.create_read_session.assert_called_once_with( + parent=f"projects/{client.project}", + read_session=expected_session, + max_stream_count=0, # Use default number of streams for best performance. + ) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) +def test_to_dataframe_bqstorage_no_pyarrow_compression(): + from google.cloud.bigquery.job import QueryJob as target_class + + resource = _make_job_resource(job_type="query", ended=True) + query_resource = { + "jobComplete": True, + "jobReference": resource["jobReference"], + "totalRows": "4", + "schema": {"fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE"}]}, + } + connection = _make_connection(query_resource) + client = _make_client(connection=connection) + job = target_class.from_api_repr(resource, client) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() + session.avro_schema.schema = json.dumps( + { + "type": "record", + "name": "__root__", + "fields": [{"name": "name", "type": ["null", "string"]}], + } + ) + 
bqstorage_client.create_read_session.return_value = session + + with mock.patch( + "google.cloud.bigquery._pandas_helpers._ARROW_COMPRESSION_SUPPORT", new=False + ): + job.to_dataframe(bqstorage_client=bqstorage_client) + + destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format( + **resource["configuration"]["query"]["destinationTable"] + ) + expected_session = bigquery_storage.ReadSession( + table=destination_table, data_format=bigquery_storage.DataFormat.ARROW, + ) + bqstorage_client.create_read_session.assert_called_once_with( + parent=f"projects/{client.project}", + read_session=expected_session, + max_stream_count=0, + ) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_to_dataframe_column_dtypes(): + from google.cloud.bigquery.job import QueryJob as target_class + + begun_resource = _make_job_resource(job_type="query") + query_resource = { + "jobComplete": True, + "jobReference": begun_resource["jobReference"], + "totalRows": "4", + "schema": { + "fields": [ + {"name": "start_timestamp", "type": "TIMESTAMP"}, + {"name": "seconds", "type": "INT64"}, + {"name": "miles", "type": "FLOAT64"}, + {"name": "km", "type": "FLOAT64"}, + {"name": "payment_type", "type": "STRING"}, + {"name": "complete", "type": "BOOL"}, + {"name": "date", "type": "DATE"}, + ] + }, + } + row_data = [ + [ + "1433836800000000", + "420", + "1.1", + "1.77", + "Cto_dataframeash", + "true", + "1999-12-01", + ], + ["1387811700000000", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"], + ["1385565300000000", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + query_resource["rows"] = rows + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection( + begun_resource, query_resource, done_resource, query_resource + ) + client = _make_client(connection=connection) + job = 
target_class.from_api_repr(begun_resource, client) + + df = job.to_dataframe(dtypes={"km": "float16"}, create_bqstorage_client=False) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 3 # verify the number of rows + exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] + assert list(df) == exp_columns # verify the column names + + assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" + assert df.seconds.dtype.name == "int64" + assert df.miles.dtype.name == "float64" + assert df.km.dtype.name == "float16" + assert df.payment_type.dtype.name == "object" + assert df.complete.dtype.name == "bool" + assert df.date.dtype.name == "object" + + +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_to_dataframe_column_date_dtypes(): + from google.cloud.bigquery.job import QueryJob as target_class + + begun_resource = _make_job_resource(job_type="query") + query_resource = { + "jobComplete": True, + "jobReference": begun_resource["jobReference"], + "totalRows": "1", + "schema": {"fields": [{"name": "date", "type": "DATE"}]}, + } + row_data = [ + ["1999-12-01"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + query_resource["rows"] = rows + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection( + begun_resource, query_resource, done_resource, query_resource + ) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 1 # verify the number of rows + exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] + assert list(df) == exp_columns # verify the column names + assert df.date.dtype.name == "datetime64[ns]" + + +@pytest.mark.skipif(pandas 
is None, reason="Requires `pandas`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +@mock.patch("tqdm.tqdm") +def test_to_dataframe_with_progress_bar(tqdm_mock): + from google.cloud.bigquery.job import QueryJob as target_class + + begun_resource = _make_job_resource(job_type="query") + query_resource = { + "jobComplete": True, + "jobReference": begun_resource["jobReference"], + "totalRows": "4", + "schema": {"fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE"}]}, + } + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": "DONE"} + connection = _make_connection( + begun_resource, query_resource, done_resource, query_resource, query_resource, + ) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + job.to_dataframe(progress_bar_type=None, create_bqstorage_client=False) + tqdm_mock.assert_not_called() + + job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False) + tqdm_mock.assert_called() + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +def test_to_dataframe_w_tqdm_pending(): + from google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL + + begun_resource = _make_job_resource(job_type="query") + schema = [ + SchemaField("name", "STRING", mode="NULLABLE"), + SchemaField("age", "INTEGER", mode="NULLABLE"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, + {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}, + ] + + connection = _make_connection({}) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + path =
"/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + + job._properties["statistics"] = { + "query": { + "queryPlan": [ + {"name": "S00: Input", "id": "0", "status": "PENDING"}, + {"name": "S01: Output", "id": "1", "status": "COMPLETE"}, + ] + }, + } + reload_patch = mock.patch( + "google.cloud.bigquery.job._AsyncJob.reload", autospec=True + ) + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", + side_effect=[concurrent.futures.TimeoutError, row_iterator], + ) + + with result_patch as result_patch_tqdm, reload_patch: + df = job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False) + + assert result_patch_tqdm.call_count == 2 + assert isinstance(df, pandas.DataFrame) + assert len(df) == 4 # verify the number of rows + assert list(df) == ["name", "age"] # verify the column names + result_patch_tqdm.assert_called_with( + timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None + ) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +def test_to_dataframe_w_tqdm(): + from google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL + + begun_resource = _make_job_resource(job_type="query") + schema = [ + SchemaField("name", "STRING", mode="NULLABLE"), + SchemaField("age", "INTEGER", mode="NULLABLE"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, + {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}, + ] + + connection = _make_connection({}) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + path = "/foo" + api_request =
mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + + job._properties["statistics"] = { + "query": { + "queryPlan": [ + {"name": "S00: Input", "id": "0", "status": "COMPLETE"}, + {"name": "S01: Output", "id": "1", "status": "COMPLETE"}, + ] + }, + } + reload_patch = mock.patch( + "google.cloud.bigquery.job._AsyncJob.reload", autospec=True + ) + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", + side_effect=[ + concurrent.futures.TimeoutError, + concurrent.futures.TimeoutError, + row_iterator, + ], + ) + + with result_patch as result_patch_tqdm, reload_patch: + df = job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False) + + assert result_patch_tqdm.call_count == 3 + assert isinstance(df, pandas.DataFrame) + assert len(df) == 4 # verify the number of rows + assert list(df) == ["name", "age"] # verify the column names + result_patch_tqdm.assert_called_with( + timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None + ) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") +def test_to_dataframe_w_tqdm_max_results(): + from google.cloud.bigquery import table + from google.cloud.bigquery.job import QueryJob as target_class + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL + + begun_resource = _make_job_resource(job_type="query") + schema = [ + SchemaField("name", "STRING", mode="NULLABLE"), + SchemaField("age", "INTEGER", mode="NULLABLE"), + ] + rows = [{"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}] + + connection = _make_connection({}) + client = _make_client(connection=connection) + job = target_class.from_api_repr(begun_resource, client) + + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = table.RowIterator(client, api_request, path, schema) + +
job._properties["statistics"] = { + "query": { + "queryPlan": [ + {"name": "S00: Input", "id": "0", "status": "COMPLETE"}, + {"name": "S01: Output", "id": "1", "status": "COMPLETE"}, + ] + }, + } + reload_patch = mock.patch( + "google.cloud.bigquery.job._AsyncJob.reload", autospec=True + ) + result_patch = mock.patch( + "google.cloud.bigquery.job.QueryJob.result", + side_effect=[concurrent.futures.TimeoutError, row_iterator], + ) + + with result_patch as result_patch_tqdm, reload_patch: + job.to_dataframe( + progress_bar_type="tqdm", create_bqstorage_client=False, max_results=3 + ) + + assert result_patch_tqdm.call_count == 2 + result_patch_tqdm.assert_called_with( + timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=3 + ) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(shapely is None, reason="Requires `shapely`") +def test_to_dataframe_geography_as_object(): + job = _make_job( + (("name", "STRING", "NULLABLE"), ("geog", "GEOGRAPHY", "NULLABLE")), + ( + ("Phred Phlyntstone", "Point(0 0)"), + ("Bharney Rhubble", "Point(0 1)"), + ("Wylma Phlyntstone", None), + ), + ) + df = job.to_dataframe(create_bqstorage_client=False, geography_as_object=True) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 3 # verify the number of rows + assert list(df) == ["name", "geog"] # verify the column names + assert [v.__class__.__name__ for v in df.geog] == [ + "Point", + "Point", + "float", + ] # float because nan + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +def test_to_geodataframe(): + job = _make_job( + (("name", "STRING", "NULLABLE"), ("geog", "GEOGRAPHY", "NULLABLE")), + ( + ("Phred Phlyntstone", "Point(0 0)"), + ("Bharney Rhubble", "Point(0 1)"), + ("Wylma Phlyntstone", None), + ), + ) + df = job.to_geodataframe(create_bqstorage_client=False) + + assert isinstance(df, geopandas.GeoDataFrame) + assert len(df) == 3 # verify the number of rows + assert list(df) == ["name", "geog"] # verify the 
column names + assert [v.__class__.__name__ for v in df.geog] == [ + "Point", + "Point", + "NoneType", + ] # NoneType because the missing geography is None here, not nan + assert isinstance(df.geog, geopandas.GeoSeries) + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@mock.patch("google.cloud.bigquery.job.query.wait_for_query") +def test_query_job_to_geodataframe_delegation(wait_for_query): + """ + QueryJob.to_geodataframe just delegates to RowIterator.to_geodataframe. + + This test just demonstrates that. We don't need to test all the + variations, which are tested for RowIterator. + """ + import numpy + + job = _make_job() + bqstorage_client = object() + dtypes = dict(xxx=numpy.dtype("int64")) + progress_bar_type = "normal" + create_bqstorage_client = False + date_as_object = False + max_results = 42 + geography_column = "g" + + df = job.to_geodataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + max_results=max_results, + geography_column=geography_column, + ) + + wait_for_query.assert_called_once_with( + job, progress_bar_type, max_results=max_results + ) + row_iterator = wait_for_query.return_value + row_iterator.to_geodataframe.assert_called_once_with( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_column=geography_column, + ) + assert df is row_iterator.to_geodataframe.return_value diff --git a/tests/unit/job/test_query_stats.py b/tests/unit/job/test_query_stats.py new file mode 100644 index 000000000..e70eb097c --- /dev/null +++ b/tests/unit/job/test_query_stats.py @@ -0,0 +1,393 @@ +# Copyright 2015 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .helpers import _Base + + +class TestDmlStats: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import DmlStats + + return DmlStats + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def test_ctor_defaults(self): + dml_stats = self._make_one() + assert dml_stats.inserted_row_count == 0 + assert dml_stats.deleted_row_count == 0 + assert dml_stats.updated_row_count == 0 + + def test_from_api_repr_partial_stats(self): + klass = self._get_target_class() + result = klass.from_api_repr({"deletedRowCount": "12"}) + + assert isinstance(result, klass) + assert result.inserted_row_count == 0 + assert result.deleted_row_count == 12 + assert result.updated_row_count == 0 + + def test_from_api_repr_full_stats(self): + klass = self._get_target_class() + result = klass.from_api_repr( + {"updatedRowCount": "4", "insertedRowCount": "7", "deletedRowCount": "25"} + ) + + assert isinstance(result, klass) + assert result.inserted_row_count == 7 + assert result.deleted_row_count == 25 + assert result.updated_row_count == 4 + + +class TestQueryPlanEntryStep(_Base): + KIND = "KIND" + SUBSTEPS = ("SUB1", "SUB2") + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import QueryPlanEntryStep + + return QueryPlanEntryStep + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def test_ctor(self): + step = self._make_one(self.KIND, self.SUBSTEPS) + self.assertEqual(step.kind, self.KIND) + self.assertEqual(step.substeps, list(self.SUBSTEPS)) + + def 
test_from_api_repr_empty(self): + klass = self._get_target_class() + step = klass.from_api_repr({}) + self.assertIsNone(step.kind) + self.assertEqual(step.substeps, []) + + def test_from_api_repr_normal(self): + resource = {"kind": self.KIND, "substeps": self.SUBSTEPS} + klass = self._get_target_class() + step = klass.from_api_repr(resource) + self.assertEqual(step.kind, self.KIND) + self.assertEqual(step.substeps, list(self.SUBSTEPS)) + + def test___eq___mismatched_type(self): + step = self._make_one(self.KIND, self.SUBSTEPS) + self.assertNotEqual(step, object()) + + def test___eq___mismatch_kind(self): + step = self._make_one(self.KIND, self.SUBSTEPS) + other = self._make_one("OTHER", self.SUBSTEPS) + self.assertNotEqual(step, other) + + def test___eq___mismatch_substeps(self): + step = self._make_one(self.KIND, self.SUBSTEPS) + other = self._make_one(self.KIND, ()) + self.assertNotEqual(step, other) + + def test___eq___hit(self): + step = self._make_one(self.KIND, self.SUBSTEPS) + other = self._make_one(self.KIND, self.SUBSTEPS) + self.assertEqual(step, other) + + def test___eq___wrong_type(self): + step = self._make_one(self.KIND, self.SUBSTEPS) + self.assertFalse(step == "hello") + + +class TestQueryPlanEntry(_Base): + NAME = "NAME" + ENTRY_ID = 1234 + START_MS = 1522540800000 + END_MS = 1522540804000 + INPUT_STAGES = (88, 101) + PARALLEL_INPUTS = 1000 + COMPLETED_PARALLEL_INPUTS = 5 + WAIT_MS_AVG = 33 + WAIT_MS_MAX = 400 + WAIT_RATIO_AVG = 2.71828 + WAIT_RATIO_MAX = 3.14159 + READ_MS_AVG = 45 + READ_MS_MAX = 90 + READ_RATIO_AVG = 1.41421 + READ_RATIO_MAX = 1.73205 + COMPUTE_MS_AVG = 55 + COMPUTE_MS_MAX = 99 + COMPUTE_RATIO_AVG = 0.69315 + COMPUTE_RATIO_MAX = 1.09861 + WRITE_MS_AVG = 203 + WRITE_MS_MAX = 340 + WRITE_RATIO_AVG = 3.32193 + WRITE_RATIO_MAX = 2.30258 + RECORDS_READ = 100 + RECORDS_WRITTEN = 1 + STATUS = "STATUS" + SHUFFLE_OUTPUT_BYTES = 1024 + SHUFFLE_OUTPUT_BYTES_SPILLED = 1 + + START_RFC3339_MICROS = "2018-04-01T00:00:00.000000Z" + 
END_RFC3339_MICROS = "2018-04-01T00:00:04.000000Z" + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import QueryPlanEntry + + return QueryPlanEntry + + def test_from_api_repr_empty(self): + klass = self._get_target_class() + + entry = klass.from_api_repr({}) + + self.assertIsNone(entry.name) + self.assertIsNone(entry.entry_id) + self.assertEqual(entry.input_stages, []) + self.assertIsNone(entry.start) + self.assertIsNone(entry.end) + self.assertIsNone(entry.parallel_inputs) + self.assertIsNone(entry.completed_parallel_inputs) + self.assertIsNone(entry.wait_ms_avg) + self.assertIsNone(entry.wait_ms_max) + self.assertIsNone(entry.wait_ratio_avg) + self.assertIsNone(entry.wait_ratio_max) + self.assertIsNone(entry.read_ms_avg) + self.assertIsNone(entry.read_ms_max) + self.assertIsNone(entry.read_ratio_avg) + self.assertIsNone(entry.read_ratio_max) + self.assertIsNone(entry.compute_ms_avg) + self.assertIsNone(entry.compute_ms_max) + self.assertIsNone(entry.compute_ratio_avg) + self.assertIsNone(entry.compute_ratio_max) + self.assertIsNone(entry.write_ms_avg) + self.assertIsNone(entry.write_ms_max) + self.assertIsNone(entry.write_ratio_avg) + self.assertIsNone(entry.write_ratio_max) + self.assertIsNone(entry.records_read) + self.assertIsNone(entry.records_written) + self.assertIsNone(entry.status) + self.assertIsNone(entry.shuffle_output_bytes) + self.assertIsNone(entry.shuffle_output_bytes_spilled) + self.assertEqual(entry.steps, []) + + def test_from_api_repr_normal(self): + from google.cloud.bigquery.job import QueryPlanEntryStep + + steps = [ + QueryPlanEntryStep( + kind=TestQueryPlanEntryStep.KIND, + substeps=TestQueryPlanEntryStep.SUBSTEPS, + ) + ] + resource = { + "name": self.NAME, + "id": self.ENTRY_ID, + "inputStages": self.INPUT_STAGES, + "startMs": self.START_MS, + "endMs": self.END_MS, + "waitMsAvg": self.WAIT_MS_AVG, + "waitMsMax": self.WAIT_MS_MAX, + "waitRatioAvg": self.WAIT_RATIO_AVG, + "waitRatioMax": self.WAIT_RATIO_MAX, + 
"readMsAvg": self.READ_MS_AVG, + "readMsMax": self.READ_MS_MAX, + "readRatioAvg": self.READ_RATIO_AVG, + "readRatioMax": self.READ_RATIO_MAX, + "computeMsAvg": self.COMPUTE_MS_AVG, + "computeMsMax": self.COMPUTE_MS_MAX, + "computeRatioAvg": self.COMPUTE_RATIO_AVG, + "computeRatioMax": self.COMPUTE_RATIO_MAX, + "writeMsAvg": self.WRITE_MS_AVG, + "writeMsMax": self.WRITE_MS_MAX, + "writeRatioAvg": self.WRITE_RATIO_AVG, + "writeRatioMax": self.WRITE_RATIO_MAX, + "recordsRead": self.RECORDS_READ, + "recordsWritten": self.RECORDS_WRITTEN, + "status": self.STATUS, + "shuffleOutputBytes": self.SHUFFLE_OUTPUT_BYTES, + "shuffleOutputBytesSpilled": self.SHUFFLE_OUTPUT_BYTES_SPILLED, + "steps": [ + { + "kind": TestQueryPlanEntryStep.KIND, + "substeps": TestQueryPlanEntryStep.SUBSTEPS, + } + ], + } + klass = self._get_target_class() + + entry = klass.from_api_repr(resource) + self.assertEqual(entry.name, self.NAME) + self.assertEqual(entry.entry_id, self.ENTRY_ID) + self.assertEqual(entry.wait_ratio_avg, self.WAIT_RATIO_AVG) + self.assertEqual(entry.wait_ratio_max, self.WAIT_RATIO_MAX) + self.assertEqual(entry.read_ratio_avg, self.READ_RATIO_AVG) + self.assertEqual(entry.read_ratio_max, self.READ_RATIO_MAX) + self.assertEqual(entry.compute_ratio_avg, self.COMPUTE_RATIO_AVG) + self.assertEqual(entry.compute_ratio_max, self.COMPUTE_RATIO_MAX) + self.assertEqual(entry.write_ratio_avg, self.WRITE_RATIO_AVG) + self.assertEqual(entry.write_ratio_max, self.WRITE_RATIO_MAX) + self.assertEqual(entry.records_read, self.RECORDS_READ) + self.assertEqual(entry.records_written, self.RECORDS_WRITTEN) + self.assertEqual(entry.status, self.STATUS) + self.assertEqual(entry.steps, steps) + + def test_start(self): + from google.cloud._helpers import _RFC3339_MICROS + + klass = self._get_target_class() + + entry = klass.from_api_repr({}) + self.assertEqual(entry.start, None) + + entry._properties["startMs"] = self.START_MS + self.assertEqual( + entry.start.strftime(_RFC3339_MICROS), 
self.START_RFC3339_MICROS + ) + + def test_end(self): + from google.cloud._helpers import _RFC3339_MICROS + + klass = self._get_target_class() + + entry = klass.from_api_repr({}) + self.assertEqual(entry.end, None) + + entry._properties["endMs"] = self.END_MS + self.assertEqual(entry.end.strftime(_RFC3339_MICROS), self.END_RFC3339_MICROS) + + +class TestScriptStackFrame(_Base): + def _make_one(self, resource): + from google.cloud.bigquery.job import ScriptStackFrame + + return ScriptStackFrame(resource) + + def test_procedure_id(self): + frame = self._make_one({"procedureId": "some-procedure"}) + self.assertEqual(frame.procedure_id, "some-procedure") + del frame._properties["procedureId"] + self.assertIsNone(frame.procedure_id) + + def test_start_line(self): + frame = self._make_one({"startLine": 5}) + self.assertEqual(frame.start_line, 5) + frame._properties["startLine"] = "5" + self.assertEqual(frame.start_line, 5) + + def test_start_column(self): + frame = self._make_one({"startColumn": 29}) + self.assertEqual(frame.start_column, 29) + frame._properties["startColumn"] = "29" + self.assertEqual(frame.start_column, 29) + + def test_end_line(self): + frame = self._make_one({"endLine": 9}) + self.assertEqual(frame.end_line, 9) + frame._properties["endLine"] = "9" + self.assertEqual(frame.end_line, 9) + + def test_end_column(self): + frame = self._make_one({"endColumn": 14}) + self.assertEqual(frame.end_column, 14) + frame._properties["endColumn"] = "14" + self.assertEqual(frame.end_column, 14) + + def test_text(self): + frame = self._make_one({"text": "QUERY TEXT"}) + self.assertEqual(frame.text, "QUERY TEXT") + + +class TestScriptStatistics(_Base): + def _make_one(self, resource): + from google.cloud.bigquery.job import ScriptStatistics + + return ScriptStatistics(resource) + + def test_evalutation_kind(self): + stats = self._make_one({"evaluationKind": "EXPRESSION"}) + self.assertEqual(stats.evaluation_kind, "EXPRESSION") + self.assertEqual(stats.stack_frames, []) 
+ + def test_stack_frames(self): + stats = self._make_one( + { + "stackFrames": [ + { + "procedureId": "some-procedure", + "startLine": 5, + "startColumn": 29, + "endLine": 9, + "endColumn": 14, + "text": "QUERY TEXT", + }, + {}, + ] + } + ) + stack_frames = stats.stack_frames + self.assertEqual(len(stack_frames), 2) + stack_frame = stack_frames[0] + self.assertEqual(stack_frame.procedure_id, "some-procedure") + self.assertEqual(stack_frame.start_line, 5) + self.assertEqual(stack_frame.start_column, 29) + self.assertEqual(stack_frame.end_line, 9) + self.assertEqual(stack_frame.end_column, 14) + self.assertEqual(stack_frame.text, "QUERY TEXT") + stack_frame = stack_frames[1] + self.assertIsNone(stack_frame.procedure_id) + self.assertIsNone(stack_frame.start_line) + self.assertIsNone(stack_frame.start_column) + self.assertIsNone(stack_frame.end_line) + self.assertIsNone(stack_frame.end_column) + self.assertIsNone(stack_frame.text) + + +class TestTimelineEntry(_Base): + ELAPSED_MS = 101 + ACTIVE_UNITS = 50 + PENDING_UNITS = 98 + COMPLETED_UNITS = 520 + SLOT_MILLIS = 12029 + + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.job import TimelineEntry + + return TimelineEntry + + def test_from_api_repr_empty(self): + klass = self._get_target_class() + entry = klass.from_api_repr({}) + self.assertIsNone(entry.elapsed_ms) + self.assertIsNone(entry.active_units) + self.assertIsNone(entry.pending_units) + self.assertIsNone(entry.completed_units) + self.assertIsNone(entry.slot_millis) + + def test_from_api_repr_normal(self): + resource = { + "elapsedMs": self.ELAPSED_MS, + "activeUnits": self.ACTIVE_UNITS, + "pendingUnits": self.PENDING_UNITS, + "completedUnits": self.COMPLETED_UNITS, + "totalSlotMs": self.SLOT_MILLIS, + } + klass = self._get_target_class() + + entry = klass.from_api_repr(resource) + self.assertEqual(entry.elapsed_ms, self.ELAPSED_MS) + self.assertEqual(entry.active_units, self.ACTIVE_UNITS) + self.assertEqual(entry.pending_units, 
self.PENDING_UNITS) + self.assertEqual(entry.completed_units, self.COMPLETED_UNITS) + self.assertEqual(entry.slot_millis, self.SLOT_MILLIS) diff --git a/tests/unit/model/test_model.py b/tests/unit/model/test_model.py index 90fc09e66..8f0bf58d5 100644 --- a/tests/unit/model/test_model.py +++ b/tests/unit/model/test_model.py @@ -19,7 +19,7 @@ import pytest import google.cloud._helpers -from google.cloud.bigquery_v2.gapic import enums +from google.cloud.bigquery_v2 import types KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1" @@ -117,7 +117,7 @@ def test_from_api_repr(target_class): assert got.expires == expiration_time assert got.description == u"A friendly description." assert got.friendly_name == u"A friendly name." - assert got.model_type == enums.Model.ModelType.LOGISTIC_REGRESSION + assert got.model_type == types.Model.ModelType.LOGISTIC_REGRESSION assert got.labels == {"greeting": u"こんにちは"} assert got.encryption_configuration.kms_key_name == KMS_KEY_NAME assert got.training_runs[0].training_options.initial_learn_rate == 1.0 @@ -162,7 +162,7 @@ def test_from_api_repr_w_minimal_resource(target_class): assert got.expires is None assert got.description is None assert got.friendly_name is None - assert got.model_type == enums.Model.ModelType.MODEL_TYPE_UNSPECIFIED + assert got.model_type == types.Model.ModelType.MODEL_TYPE_UNSPECIFIED assert got.labels == {} assert got.encryption_configuration is None assert len(got.training_runs) == 0 @@ -186,6 +186,23 @@ def test_from_api_repr_w_unknown_fields(target_class): assert got._properties is resource +def test_from_api_repr_w_unknown_type(target_class): + from google.cloud.bigquery import ModelReference + + resource = { + "modelReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "modelId": "my_model", + }, + "modelType": "BE_A_GOOD_ROLE_MODEL", + } + got = target_class.from_api_repr(resource) + assert got.reference == ModelReference.from_string("my-project.my_dataset.my_model") + 
assert got.model_type == 0 + assert got._properties is resource + + @pytest.mark.parametrize( "resource,filter_fields,expected", [ @@ -318,3 +335,47 @@ def test_repr(target_class): "Model(reference=ModelReference(" "project_id='my-proj', dataset_id='my_dset', model_id='my_model'))" ) + + +def test_to_api_repr(target_class): + from google.protobuf import json_format + + model = target_class("my-proj.my_dset.my_model") + resource = { + "etag": "abcdefg", + "modelReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "modelId": "my_model", + }, + "creationTime": "1274284800000", + "lastModifiedTime": "1317484800000", + "modelType": "LOGISTIC_REGRESSION", + "trainingRuns": [ + { + "trainingOptions": {"initialLearnRate": 1.0}, + "startTime": "2010-05-19T16:00:00Z", + }, + { + "trainingOptions": {"initialLearnRate": 0.5}, + "startTime": "2011-10-01T16:00:00Z", + }, + { + "trainingOptions": {"initialLearnRate": 0.25}, + "startTime": "2012-12-21T16:00:00Z", + }, + ], + "description": "A friendly description.", + "location": "US", + "friendlyName": "A friendly name.", + "labels": {"greeting": "こんにちは"}, + "expirationTime": "1356105600000", + "encryptionConfiguration": { + "kmsKeyName": "projects/1/locations/us/keyRings/1/cryptoKeys/1" + }, + } + model._proto = json_format.ParseDict( + resource, types.Model()._pb, ignore_unknown_fields=True + ) + got = model.to_api_repr() + assert got == resource diff --git a/tests/unit/routine/test_routine.py b/tests/unit/routine/test_routine.py index 02f703535..fdaf13324 100644 --- a/tests/unit/routine/test_routine.py +++ b/tests/unit/routine/test_routine.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright 2019 Google LLC # @@ -19,6 +18,7 @@ import pytest import google.cloud._helpers +from google.cloud import bigquery from google.cloud import bigquery_v2 @@ -63,17 +63,18 @@ def test_ctor_w_properties(target_class): RoutineArgument( name="x", data_type=bigquery_v2.types.StandardSqlDataType( - 
type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ), ) ] body = "x * 3" language = "SQL" return_type = bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ) type_ = "SCALAR_FUNCTION" description = "A routine description." + determinism_level = bigquery.DeterminismLevel.NOT_DETERMINISTIC actual_routine = target_class( routine_id, @@ -83,6 +84,7 @@ def test_ctor_w_properties(target_class): return_type=return_type, type_=type_, description=description, + determinism_level=determinism_level, ) ref = RoutineReference.from_string(routine_id) @@ -93,6 +95,9 @@ def test_ctor_w_properties(target_class): assert actual_routine.return_type == return_type assert actual_routine.type_ == type_ assert actual_routine.description == description + assert ( + actual_routine.determinism_level == bigquery.DeterminismLevel.NOT_DETERMINISTIC + ) def test_from_api_repr(target_class): @@ -121,6 +126,7 @@ def test_from_api_repr(target_class): "routineType": "SCALAR_FUNCTION", "someNewField": "someValue", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISTIC, } actual_routine = target_class.from_api_repr(resource) @@ -141,18 +147,93 @@ def test_from_api_repr(target_class): RoutineArgument( name="x", data_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ), ) ] assert actual_routine.body == "42" assert actual_routine.language == "SQL" assert actual_routine.return_type == bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ) + assert actual_routine.return_table_type is None assert actual_routine.type_ == 
"SCALAR_FUNCTION" assert actual_routine._properties["someNewField"] == "someValue" assert actual_routine.description == "A routine description." + assert actual_routine.determinism_level == "DETERMINISTIC" + + +def test_from_api_repr_tvf_function(target_class): + from google.cloud.bigquery.routine import RoutineArgument + from google.cloud.bigquery.routine import RoutineReference + from google.cloud.bigquery.routine import RoutineType + + StandardSqlDataType = bigquery_v2.types.StandardSqlDataType + StandardSqlField = bigquery_v2.types.StandardSqlField + StandardSqlTableType = bigquery_v2.types.StandardSqlTableType + + creation_time = datetime.datetime( + 2010, 5, 19, 16, 0, 0, tzinfo=google.cloud._helpers.UTC + ) + modified_time = datetime.datetime( + 2011, 10, 1, 16, 0, 0, tzinfo=google.cloud._helpers.UTC + ) + resource = { + "routineReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "routineId": "my_routine", + }, + "etag": "abcdefg", + "creationTime": str(google.cloud._helpers._millis(creation_time)), + "lastModifiedTime": str(google.cloud._helpers._millis(modified_time)), + "definitionBody": "SELECT x FROM UNNEST([1,2,3]) x WHERE x > a", + "arguments": [{"name": "a", "dataType": {"typeKind": "INT64"}}], + "language": "SQL", + "returnTableType": { + "columns": [{"name": "int_col", "type": {"typeKind": "INT64"}}] + }, + "routineType": "TABLE_VALUED_FUNCTION", + "someNewField": "someValue", + "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISTIC, + } + actual_routine = target_class.from_api_repr(resource) + + assert actual_routine.project == "my-project" + assert actual_routine.dataset_id == "my_dataset" + assert actual_routine.routine_id == "my_routine" + assert ( + actual_routine.path + == "/projects/my-project/datasets/my_dataset/routines/my_routine" + ) + assert actual_routine.reference == RoutineReference.from_string( + "my-project.my_dataset.my_routine" + ) + assert actual_routine.etag 
== "abcdefg" + assert actual_routine.created == creation_time + assert actual_routine.modified == modified_time + assert actual_routine.arguments == [ + RoutineArgument( + name="a", + data_type=StandardSqlDataType(type_kind=StandardSqlDataType.TypeKind.INT64), + ) + ] + assert actual_routine.body == "SELECT x FROM UNNEST([1,2,3]) x WHERE x > a" + assert actual_routine.language == "SQL" + assert actual_routine.return_type is None + assert actual_routine.return_table_type == StandardSqlTableType( + columns=[ + StandardSqlField( + name="int_col", + type=StandardSqlDataType(type_kind=StandardSqlDataType.TypeKind.INT64), + ) + ] + ) + assert actual_routine.type_ == RoutineType.TABLE_VALUED_FUNCTION + assert actual_routine._properties["someNewField"] == "someValue" + assert actual_routine.description == "A routine description." + assert actual_routine.determinism_level == "DETERMINISTIC" def test_from_api_repr_w_minimal_resource(target_class): @@ -178,6 +259,7 @@ def test_from_api_repr_w_minimal_resource(target_class): assert actual_routine.return_type is None assert actual_routine.type_ is None assert actual_routine.description is None + assert actual_routine.determinism_level is None def test_from_api_repr_w_unknown_fields(target_class): @@ -209,6 +291,7 @@ def test_from_api_repr_w_unknown_fields(target_class): "returnType": {"typeKind": "INT64"}, "routineType": "SCALAR_FUNCTION", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, }, ["arguments"], {"arguments": [{"name": "x", "dataType": {"typeKind": "INT64"}}]}, @@ -221,6 +304,7 @@ def test_from_api_repr_w_unknown_fields(target_class): "returnType": {"typeKind": "INT64"}, "routineType": "SCALAR_FUNCTION", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, }, ["body"], {"definitionBody": "x * 3"}, @@ -233,6 +317,7 @@ def test_from_api_repr_w_unknown_fields(target_class): 
"returnType": {"typeKind": "INT64"}, "routineType": "SCALAR_FUNCTION", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, }, ["language"], {"language": "SQL"}, @@ -245,10 +330,29 @@ def test_from_api_repr_w_unknown_fields(target_class): "returnType": {"typeKind": "INT64"}, "routineType": "SCALAR_FUNCTION", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, }, ["return_type"], {"returnType": {"typeKind": "INT64"}}, ), + ( + { + "definitionBody": "SELECT x FROM UNNEST([1,2,3]) x WHERE x > 1", + "language": "SQL", + "returnTableType": { + "columns": [{"name": "int_col", "type": {"typeKind": "INT64"}}] + }, + "routineType": "TABLE_VALUED_FUNCTION", + "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, + }, + ["return_table_type"], + { + "returnTableType": { + "columns": [{"name": "int_col", "type": {"typeKind": "INT64"}}] + } + }, + ), ( { "arguments": [{"name": "x", "dataType": {"typeKind": "INT64"}}], @@ -257,6 +361,7 @@ def test_from_api_repr_w_unknown_fields(target_class): "returnType": {"typeKind": "INT64"}, "routineType": "SCALAR_FUNCTION", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, }, ["type_"], {"routineType": "SCALAR_FUNCTION"}, @@ -269,13 +374,37 @@ def test_from_api_repr_w_unknown_fields(target_class): "returnType": {"typeKind": "INT64"}, "routineType": "SCALAR_FUNCTION", "description": "A routine description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, }, ["description"], {"description": "A routine description."}, ), + ( + { + "arguments": [{"name": "x", "dataType": {"typeKind": "INT64"}}], + "definitionBody": "x * 3", + "language": "SQL", + "returnType": {"typeKind": "INT64"}, + "routineType": "SCALAR_FUNCTION", + "description": "A routine 
description.", + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED, + }, + ["determinism_level"], + { + "determinismLevel": bigquery.DeterminismLevel.DETERMINISM_LEVEL_UNSPECIFIED + }, + ), ( {}, - ["arguments", "language", "body", "type_", "return_type", "description"], + [ + "arguments", + "language", + "body", + "type_", + "return_type", + "description", + "determinism_level", + ], { "arguments": None, "definitionBody": None, @@ -283,6 +412,7 @@ def test_from_api_repr_w_unknown_fields(target_class): "returnType": None, "routineType": None, "description": None, + "determinismLevel": None, }, ), ( @@ -323,6 +453,41 @@ def test_set_return_type_w_none(object_under_test): assert object_under_test._properties["returnType"] is None +def test_set_return_table_type_w_none(object_under_test): + object_under_test.return_table_type = None + assert object_under_test.return_table_type is None + assert object_under_test._properties["returnTableType"] is None + + +def test_set_return_table_type_w_not_none(object_under_test): + StandardSqlDataType = bigquery_v2.types.StandardSqlDataType + StandardSqlField = bigquery_v2.types.StandardSqlField + StandardSqlTableType = bigquery_v2.types.StandardSqlTableType + + table_type = StandardSqlTableType( + columns=[ + StandardSqlField( + name="int_col", + type=StandardSqlDataType(type_kind=StandardSqlDataType.TypeKind.INT64), + ), + StandardSqlField( + name="str_col", + type=StandardSqlDataType(type_kind=StandardSqlDataType.TypeKind.STRING), + ), + ] + ) + + object_under_test.return_table_type = table_type + + assert object_under_test.return_table_type == table_type + assert object_under_test._properties["returnTableType"] == { + "columns": [ + {"name": "int_col", "type": {"typeKind": "INT64"}}, + {"name": "str_col", "type": {"typeKind": "STRING"}}, + ] + } + + def test_set_description_w_none(object_under_test): object_under_test.description = None assert object_under_test.description is None diff --git 
a/tests/unit/routine/test_routine_argument.py b/tests/unit/routine/test_routine_argument.py index 7d17b5fc7..e3bda9539 100644 --- a/tests/unit/routine/test_routine_argument.py +++ b/tests/unit/routine/test_routine_argument.py @@ -28,7 +28,7 @@ def target_class(): def test_ctor(target_class): data_type = bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ) actual_arg = target_class( name="field_name", kind="FIXED_TYPE", mode="IN", data_type=data_type @@ -51,7 +51,7 @@ def test_from_api_repr(target_class): assert actual_arg.kind == "FIXED_TYPE" assert actual_arg.mode == "IN" assert actual_arg.data_type == bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ) @@ -72,7 +72,7 @@ def test_from_api_repr_w_unknown_fields(target_class): def test_eq(target_class): data_type = bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ) arg = target_class( name="field_name", kind="FIXED_TYPE", mode="IN", data_type=data_type diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 28ebe8144..f8d00e67d 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -18,7 +18,75 @@ import unittest import mock -import six + +try: + from google.cloud import bigquery_storage +except ImportError: # pragma: NO COVER + bigquery_storage = None + + +@unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") +class TestBQStorageVersions(unittest.TestCase): + def _object_under_test(self): + from google.cloud.bigquery import _helpers + + return _helpers.BQStorageVersions() + + def _call_fut(self): + from google.cloud.bigquery import _helpers + + 
_helpers.BQ_STORAGE_VERSIONS._installed_version = None + return _helpers.BQ_STORAGE_VERSIONS.verify_version() + + def test_raises_no_error_w_recent_bqstorage(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + with mock.patch("google.cloud.bigquery_storage.__version__", new="2.0.0"): + try: + self._call_fut() + except LegacyBigQueryStorageError: # pragma: NO COVER + self.fail("Legacy error raised with a non-legacy dependency version.") + + def test_raises_error_w_legacy_bqstorage(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + with mock.patch("google.cloud.bigquery_storage.__version__", new="1.9.9"): + with self.assertRaises(LegacyBigQueryStorageError): + self._call_fut() + + def test_raises_error_w_unknown_bqstorage_version(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + with mock.patch("google.cloud.bigquery_storage", autospec=True) as fake_module: + del fake_module.__version__ + error_pattern = r"version found: 0.0.0" + with self.assertRaisesRegex(LegacyBigQueryStorageError, error_pattern): + self._call_fut() + + def test_installed_version_returns_cached(self): + versions = self._object_under_test() + versions._installed_version = object() + assert versions.installed_version is versions._installed_version + + def test_installed_version_returns_parsed_version(self): + versions = self._object_under_test() + + with mock.patch("google.cloud.bigquery_storage.__version__", new="1.2.3"): + version = versions.installed_version + + assert version.major == 1 + assert version.minor == 2 + assert version.micro == 3 + + def test_is_read_session_optional_true(self): + versions = self._object_under_test() + with mock.patch("google.cloud.bigquery_storage.__version__", new="2.6.0"): + assert versions.is_read_session_optional + + def test_is_read_session_optional_false(self): + versions = self._object_under_test() + with 
mock.patch("google.cloud.bigquery_storage.__version__", new="2.5.0"): + assert not versions.is_read_session_optional class Test_not_null(unittest.TestCase): @@ -190,18 +258,18 @@ def test_w_none_required(self): with self.assertRaises(TypeError): self._call_fut(None, _Field("REQUIRED")) - def test_w_string_value(self): + def test_w_string_int_value(self): from google.cloud._helpers import _EPOCH - coerced = self._call_fut("1.234567", object()) + coerced = self._call_fut("1234567", object()) self.assertEqual( coerced, _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) ) - def test_w_float_value(self): + def test_w_int_value(self): from google.cloud._helpers import _EPOCH - coerced = self._call_fut(1.234567, object()) + coerced = self._call_fut(1234567, object()) self.assertEqual( coerced, _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) ) @@ -421,13 +489,13 @@ def _call_fut(self, row, schema): def test_w_single_scalar_column(self): # SELECT 1 AS col col = _Field("REQUIRED", "col", "INTEGER") - row = {u"f": [{u"v": u"1"}]} + row = {"f": [{"v": "1"}]} self.assertEqual(self._call_fut(row, schema=[col]), (1,)) def test_w_single_scalar_geography_column(self): # SELECT 1 AS col col = _Field("REQUIRED", "geo", "GEOGRAPHY") - row = {u"f": [{u"v": u"POINT(1, 2)"}]} + row = {"f": [{"v": "POINT(1, 2)"}]} self.assertEqual(self._call_fut(row, schema=[col]), ("POINT(1, 2)",)) def test_w_single_struct_column(self): @@ -435,13 +503,13 @@ def test_w_single_struct_column(self): sub_1 = _Field("REQUIRED", "sub_1", "INTEGER") sub_2 = _Field("REQUIRED", "sub_2", "INTEGER") col = _Field("REQUIRED", "col", "RECORD", fields=[sub_1, sub_2]) - row = {u"f": [{u"v": {u"f": [{u"v": u"1"}, {u"v": u"2"}]}}]} + row = {"f": [{"v": {"f": [{"v": "1"}, {"v": "2"}]}}]} self.assertEqual(self._call_fut(row, schema=[col]), ({"sub_1": 1, "sub_2": 2},)) def test_w_single_array_column(self): # SELECT [1, 2, 3] as col col = _Field("REPEATED", "col", "INTEGER") - row = {u"f": [{u"v": [{u"v": 
u"1"}, {u"v": u"2"}, {u"v": u"3"}]}]} + row = {"f": [{"v": [{"v": "1"}, {"v": "2"}, {"v": "3"}]}]} self.assertEqual(self._call_fut(row, schema=[col]), ([1, 2, 3],)) def test_w_struct_w_nested_array_column(self): @@ -451,13 +519,13 @@ def test_w_struct_w_nested_array_column(self): third = _Field("REPEATED", "third", "INTEGER") col = _Field("REQUIRED", "col", "RECORD", fields=[first, second, third]) row = { - u"f": [ + "f": [ { - u"v": { - u"f": [ - {u"v": [{u"v": u"1"}, {u"v": u"2"}]}, - {u"v": u"3"}, - {u"v": [{u"v": u"4"}, {u"v": u"5"}]}, + "v": { + "f": [ + {"v": [{"v": "1"}, {"v": "2"}]}, + {"v": "3"}, + {"v": [{"v": "4"}, {"v": "5"}]}, ] } } @@ -465,7 +533,7 @@ def test_w_struct_w_nested_array_column(self): } self.assertEqual( self._call_fut(row, schema=[col]), - ({u"first": [1, 2], u"second": 3, u"third": [4, 5]},), + ({"first": [1, 2], "second": 3, "third": [4, 5]},), ) def test_w_array_of_struct(self): @@ -475,11 +543,11 @@ def test_w_array_of_struct(self): third = _Field("REQUIRED", "third", "INTEGER") col = _Field("REPEATED", "col", "RECORD", fields=[first, second, third]) row = { - u"f": [ + "f": [ { - u"v": [ - {u"v": {u"f": [{u"v": u"1"}, {u"v": u"2"}, {u"v": u"3"}]}}, - {u"v": {u"f": [{u"v": u"4"}, {u"v": u"5"}, {u"v": u"6"}]}}, + "v": [ + {"v": {"f": [{"v": "1"}, {"v": "2"}, {"v": "3"}]}}, + {"v": {"f": [{"v": "4"}, {"v": "5"}, {"v": "6"}]}}, ] } ] @@ -488,8 +556,8 @@ def test_w_array_of_struct(self): self._call_fut(row, schema=[col]), ( [ - {u"first": 1, u"second": 2, u"third": 3}, - {u"first": 4, u"second": 5, u"third": 6}, + {"first": 1, "second": 2, "third": 3}, + {"first": 4, "second": 5, "third": 6}, ], ), ) @@ -500,32 +568,25 @@ def test_w_array_of_struct_w_array(self): second = _Field("REQUIRED", "second", "INTEGER") col = _Field("REPEATED", "col", "RECORD", fields=[first, second]) row = { - u"f": [ + "f": [ { - u"v": [ + "v": [ { - u"v": { - u"f": [ - {u"v": [{u"v": u"1"}, {u"v": u"2"}, {u"v": u"3"}]}, - {u"v": u"4"}, - ] - } - }, - { - u"v": 
{ - u"f": [ - {u"v": [{u"v": u"5"}, {u"v": u"6"}]}, - {u"v": u"7"}, + "v": { + "f": [ + {"v": [{"v": "1"}, {"v": "2"}, {"v": "3"}]}, + {"v": "4"}, ] } }, + {"v": {"f": [{"v": [{"v": "5"}, {"v": "6"}]}, {"v": "7"}]}}, ] } ] } self.assertEqual( self._call_fut(row, schema=[col]), - ([{u"first": [1, 2, 3], u"second": 4}, {u"first": [5, 6], u"second": 7}],), + ([{"first": [1, 2, 3], "second": 4}, {"first": [5, 6], "second": 7}],), ) @@ -626,9 +687,48 @@ def _call_fut(self, value): return _float_to_json(value) + def test_w_none(self): + self.assertEqual(self._call_fut(None), None) + + def test_w_non_numeric(self): + with self.assertRaises(TypeError): + self._call_fut(object()) + + def test_w_integer(self): + result = self._call_fut(123) + self.assertIsInstance(result, float) + self.assertEqual(result, 123.0) + def test_w_float(self): self.assertEqual(self._call_fut(1.23), 1.23) + def test_w_float_as_string(self): + self.assertEqual(self._call_fut("1.23"), 1.23) + + def test_w_nan(self): + result = self._call_fut(float("nan")) + self.assertEqual(result.lower(), "nan") + + def test_w_nan_as_string(self): + result = self._call_fut("NaN") + self.assertEqual(result.lower(), "nan") + + def test_w_infinity(self): + result = self._call_fut(float("inf")) + self.assertEqual(result.lower(), "inf") + + def test_w_infinity_as_string(self): + result = self._call_fut("inf") + self.assertEqual(result.lower(), "inf") + + def test_w_negative_infinity(self): + result = self._call_fut(float("-inf")) + self.assertEqual(result.lower(), "-inf") + + def test_w_negative_infinity_as_string(self): + result = self._call_fut("-inf") + self.assertEqual(result.lower(), "-inf") + class Test_decimal_to_json(unittest.TestCase): def _call_fut(self, value): @@ -674,7 +774,7 @@ def test_w_non_bytes(self): def test_w_bytes(self): source = b"source" - expected = u"c291cmNl" + expected = "c291cmNl" converted = self._call_fut(source) self.assertEqual(converted, expected) @@ -727,11 +827,23 @@ def 
test_w_string(self): ZULU = "2016-12-20 15:58:27.339328+00:00" self.assertEqual(self._call_fut(ZULU), ZULU) - def test_w_datetime(self): - from google.cloud._helpers import _microseconds_from_datetime - + def test_w_datetime_no_zone(self): when = datetime.datetime(2016, 12, 20, 15, 58, 27, 339328) - self.assertEqual(self._call_fut(when), _microseconds_from_datetime(when) / 1e6) + self.assertEqual(self._call_fut(when), "2016-12-20T15:58:27.339328Z") + + def test_w_datetime_w_utc_zone(self): + from google.cloud._helpers import UTC + + when = datetime.datetime(2020, 11, 17, 1, 6, 52, 353795, tzinfo=UTC) + self.assertEqual(self._call_fut(when), "2020-11-17T01:06:52.353795Z") + + def test_w_datetime_w_non_utc_zone(self): + class EstZone(datetime.tzinfo): + def utcoffset(self, _): + return datetime.timedelta(minutes=-300) + + when = datetime.datetime(2020, 11, 17, 1, 6, 52, 353795, tzinfo=EstZone()) + self.assertEqual(self._call_fut(when), "2020-11-17T06:06:52.353795Z") class Test_datetime_to_json(unittest.TestCase): @@ -750,6 +862,14 @@ def test_w_datetime(self): when = datetime.datetime(2016, 12, 3, 14, 11, 27, 123456, tzinfo=UTC) self.assertEqual(self._call_fut(when), "2016-12-03T14:11:27.123456") + def test_w_datetime_w_non_utc_zone(self): + class EstZone(datetime.tzinfo): + def utcoffset(self, _): + return datetime.timedelta(minutes=-300) + + when = datetime.datetime(2016, 12, 3, 14, 11, 27, 123456, tzinfo=EstZone()) + self.assertEqual(self._call_fut(when), "2016-12-03T19:11:27.123456") + class Test_date_to_json(unittest.TestCase): def _call_fut(self, value): @@ -806,6 +926,41 @@ def test_w_known_field_type(self): self.assertEqual(converted, str(original)) +class Test_single_field_to_json(unittest.TestCase): + def _call_fut(self, field, value): + from google.cloud.bigquery._helpers import _single_field_to_json + + return _single_field_to_json(field, value) + + def test_w_none(self): + field = _make_field("INT64") + original = None + converted = self._call_fut(field, 
original) + self.assertIsNone(converted) + + def test_w_record(self): + subfields = [ + _make_field("INT64", name="one"), + _make_field("STRING", name="two"), + ] + field = _make_field("RECORD", fields=subfields) + original = {"one": 42, "two": "two"} + converted = self._call_fut(field, original) + self.assertEqual(converted, {"one": "42", "two": "two"}) + + def test_w_scalar(self): + field = _make_field("INT64") + original = 42 + converted = self._call_fut(field, original) + self.assertEqual(converted, str(original)) + + def test_w_scalar_ignores_mode(self): + field = _make_field("STRING", mode="REPEATED") + original = "hello world" + converted = self._call_fut(field, original) + self.assertEqual(converted, original) + + class Test_repeated_field_to_json(unittest.TestCase): def _call_fut(self, field, value): from google.cloud.bigquery._helpers import _repeated_field_to_json @@ -855,7 +1010,7 @@ def test_w_list_missing_fields(self): ] original = [42] - with six.assertRaisesRegex(self, ValueError, r".*not match schema length.*"): + with self.assertRaisesRegex(ValueError, r".*not match schema length.*"): self._call_fut(fields, original) def test_w_list_too_many_fields(self): @@ -865,7 +1020,7 @@ def test_w_list_too_many_fields(self): ] original = [42, "two", "three"] - with six.assertRaisesRegex(self, ValueError, r".*not match schema length.*"): + with self.assertRaisesRegex(ValueError, r".*not match schema length.*"): self._call_fut(fields, original) def test_w_non_empty_dict(self): @@ -1112,3 +1267,18 @@ def fake_isinstance(instance, target_class): "google.cloud.bigquery.schema.isinstance", side_effect=fake_isinstance ) return patcher + + +def test_decimal_as_float_api_repr(): + """Make sure decimals get converted to float.""" + import google.cloud.bigquery.query + from decimal import Decimal + + param = google.cloud.bigquery.query.ScalarQueryParameter( + "x", "FLOAT64", Decimal(42) + ) + assert param.to_api_repr() == { + "parameterType": {"type": "FLOAT64"}, + 
"parameterValue": {"value": 42.0}, + "name": "x", + } diff --git a/tests/unit/test__http.py b/tests/unit/test__http.py index 4da805d48..09f6d29d7 100644 --- a/tests/unit/test__http.py +++ b/tests/unit/test__http.py @@ -32,22 +32,43 @@ def _get_target_class(): return Connection def _make_one(self, *args, **kw): + if "api_endpoint" not in kw: + kw["api_endpoint"] = "https://bigquery.googleapis.com" + return self._get_target_class()(*args, **kw) def test_build_api_url_no_extra_query_params(self): + from urllib.parse import parse_qsl + from urllib.parse import urlsplit + conn = self._make_one(object()) - URI = "/".join([conn.DEFAULT_API_ENDPOINT, "bigquery", conn.API_VERSION, "foo"]) - self.assertEqual(conn.build_api_url("/foo"), URI) + uri = conn.build_api_url("/foo") + scheme, netloc, path, qs, _ = urlsplit(uri) + self.assertEqual("%s://%s" % (scheme, netloc), conn.API_BASE_URL) + self.assertEqual(path, "/".join(["", "bigquery", conn.API_VERSION, "foo"])) + parms = dict(parse_qsl(qs)) + pretty_print = parms.pop("prettyPrint", "false") + self.assertEqual(pretty_print, "false") + self.assertEqual(parms, {}) def test_build_api_url_w_custom_endpoint(self): - custom_endpoint = "https://www.foo-googleapis.com" + from urllib.parse import parse_qsl + from urllib.parse import urlsplit + + custom_endpoint = "https://foo-bigquery.googleapis.com" conn = self._make_one(object(), api_endpoint=custom_endpoint) - URI = "/".join([custom_endpoint, "bigquery", conn.API_VERSION, "foo"]) - self.assertEqual(conn.build_api_url("/foo"), URI) + uri = conn.build_api_url("/foo") + scheme, netloc, path, qs, _ = urlsplit(uri) + self.assertEqual("%s://%s" % (scheme, netloc), custom_endpoint) + self.assertEqual(path, "/".join(["", "bigquery", conn.API_VERSION, "foo"])) + parms = dict(parse_qsl(qs)) + pretty_print = parms.pop("prettyPrint", "false") + self.assertEqual(pretty_print, "false") + self.assertEqual(parms, {}) def test_build_api_url_w_extra_query_params(self): - from 
six.moves.urllib.parse import parse_qsl - from six.moves.urllib.parse import urlsplit + from urllib.parse import parse_qsl + from urllib.parse import urlsplit conn = self._make_one(object()) uri = conn.build_api_url("/foo", {"bar": "baz"}) @@ -120,3 +141,14 @@ def test_extra_headers_replace(self): url=expected_uri, timeout=self._get_default_timeout(), ) + + def test_ctor_mtls(self): + conn = self._make_one(object(), api_endpoint=None) + self.assertEqual(conn.ALLOW_AUTO_SWITCH_TO_MTLS_URL, True) + self.assertEqual(conn.API_BASE_URL, "https://bigquery.googleapis.com") + self.assertEqual(conn.API_BASE_MTLS_URL, "https://bigquery.mtls.googleapis.com") + + conn = self._make_one(object(), api_endpoint="http://foo") + self.assertEqual(conn.ALLOW_AUTO_SWITCH_TO_MTLS_URL, False) + self.assertEqual(conn.API_BASE_URL, "http://foo") + self.assertEqual(conn.API_BASE_MTLS_URL, "https://bigquery.mtls.googleapis.com") diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index f4355072a..a9b0ae21f 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -17,10 +17,11 @@ import decimal import functools import operator +import queue import warnings +import pkg_resources import mock -import six try: import pandas @@ -35,12 +36,32 @@ # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. 
pyarrow = mock.Mock() +try: + import geopandas +except ImportError: # pragma: NO COVER + geopandas = None + import pytest -import pytz from google import api_core +from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema +try: + from google.cloud import bigquery_storage + + _helpers.BQ_STORAGE_VERSIONS.verify_version() +except ImportError: # pragma: NO COVER + bigquery_storage = None + +PANDAS_MINIUM_VERSION = pkg_resources.parse_version("1.0.0") + +if pandas is not None: + PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version +else: + # Set to less than MIN version. + PANDAS_INSTALLED_VERSION = pkg_resources.parse_version("0.0.0") + @pytest.fixture def module_under_test(): @@ -71,6 +92,15 @@ def is_numeric(type_): )(type_) +def is_bignumeric(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type + return all_( + pyarrow.types.is_decimal, + lambda type_: type_.precision == 76, + lambda type_: type_.scale == 38, + )(type_) + + def is_timestamp(type_): # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp-type return all_( @@ -121,6 +151,7 @@ def test_all_(): ("FLOAT", "NULLABLE", pyarrow.types.is_float64), ("FLOAT64", "NULLABLE", pyarrow.types.is_float64), ("NUMERIC", "NULLABLE", is_numeric), + ("BIGNUMERIC", "NULLABLE", is_bignumeric), ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), ("BOOL", "NULLABLE", pyarrow.types.is_boolean), ("TIMESTAMP", "NULLABLE", is_timestamp), @@ -199,6 +230,11 @@ def test_all_(): "REPEATED", all_(pyarrow.types.is_list, lambda type_: is_numeric(type_.value_type)), ), + ( + "BIGNUMERIC", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + ), ( "BOOLEAN", "REPEATED", @@ -271,39 +307,40 @@ def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), 
schema.SchemaField("field07", "NUMERIC"), - schema.SchemaField("field08", "BOOLEAN"), - schema.SchemaField("field09", "BOOL"), - schema.SchemaField("field10", "TIMESTAMP"), - schema.SchemaField("field11", "DATE"), - schema.SchemaField("field12", "TIME"), - schema.SchemaField("field13", "DATETIME"), - schema.SchemaField("field14", "GEOGRAPHY"), + schema.SchemaField("field08", "BIGNUMERIC"), + schema.SchemaField("field09", "BOOLEAN"), + schema.SchemaField("field10", "BOOL"), + schema.SchemaField("field11", "TIMESTAMP"), + schema.SchemaField("field12", "DATE"), + schema.SchemaField("field13", "TIME"), + schema.SchemaField("field14", "DATETIME"), + schema.SchemaField("field15", "GEOGRAPHY"), ) + field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) - expected = pyarrow.struct( - ( - pyarrow.field("field01", pyarrow.string()), - pyarrow.field("field02", pyarrow.binary()), - pyarrow.field("field03", pyarrow.int64()), - pyarrow.field("field04", pyarrow.int64()), - pyarrow.field("field05", pyarrow.float64()), - pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", pyarrow.bool_()), - pyarrow.field("field09", pyarrow.bool_()), - pyarrow.field("field10", module_under_test.pyarrow_timestamp()), - pyarrow.field("field11", pyarrow.date32()), - pyarrow.field("field12", module_under_test.pyarrow_time()), - pyarrow.field("field13", module_under_test.pyarrow_datetime()), - pyarrow.field("field14", pyarrow.string()), - ) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field08", 
module_under_test.pyarrow_bignumeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), ) + expected = pyarrow.struct(expected) + assert pyarrow.types.is_struct(actual) - try: - assert actual.num_fields == len(fields) - except AttributeError: # py27 - assert actual.num_children == len(fields) + assert actual.num_fields == len(fields) assert actual.equals(expected) @@ -318,40 +355,41 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), schema.SchemaField("field07", "NUMERIC"), - schema.SchemaField("field08", "BOOLEAN"), - schema.SchemaField("field09", "BOOL"), - schema.SchemaField("field10", "TIMESTAMP"), - schema.SchemaField("field11", "DATE"), - schema.SchemaField("field12", "TIME"), - schema.SchemaField("field13", "DATETIME"), - schema.SchemaField("field14", "GEOGRAPHY"), + schema.SchemaField("field08", "BIGNUMERIC"), + schema.SchemaField("field09", "BOOLEAN"), + schema.SchemaField("field10", "BOOL"), + schema.SchemaField("field11", "TIMESTAMP"), + schema.SchemaField("field12", "DATE"), + schema.SchemaField("field13", "TIME"), + schema.SchemaField("field14", "DATETIME"), + schema.SchemaField("field15", "GEOGRAPHY"), ) + field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) - expected_value_type = pyarrow.struct( - ( - pyarrow.field("field01", pyarrow.string()), - pyarrow.field("field02", pyarrow.binary()), - pyarrow.field("field03", pyarrow.int64()), - pyarrow.field("field04", pyarrow.int64()), - pyarrow.field("field05", pyarrow.float64()), - 
pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", pyarrow.bool_()), - pyarrow.field("field09", pyarrow.bool_()), - pyarrow.field("field10", module_under_test.pyarrow_timestamp()), - pyarrow.field("field11", pyarrow.date32()), - pyarrow.field("field12", module_under_test.pyarrow_time()), - pyarrow.field("field13", module_under_test.pyarrow_datetime()), - pyarrow.field("field14", pyarrow.string()), - ) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), ) + expected_value_type = pyarrow.struct(expected) + assert pyarrow.types.is_list(actual) assert pyarrow.types.is_struct(actual.value_type) - try: - assert actual.value_type.num_fields == len(fields) - except AttributeError: # py27 - assert actual.value_type.num_children == len(fields) + assert actual.value_type.num_fields == len(fields) assert actual.value_type.equals(expected_value_type) @@ -392,15 +430,26 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): decimal.Decimal("999.123456789"), ], ), + ( + "BIGNUMERIC", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + 
decimal.Decimal("3.141592653589793238462643383279"), + ], + ), ("BOOLEAN", [True, None, False, None]), ("BOOL", [False, None, True, None]), ( "TIMESTAMP", [ - datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc), + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), None, - datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc), - datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc), + datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), ], ), ( @@ -540,6 +589,60 @@ def test_bq_to_arrow_array_w_special_floats(module_under_test): assert roundtrip[3] is None +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_array_w_geography_dtype(module_under_test): + from shapely import wkb, wkt + + bq_field = schema.SchemaField("field_name", "GEOGRAPHY") + + series = geopandas.GeoSeries([None, wkt.loads("point(0 0)")]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + # The result is binary, because we use wkb format + assert array.type == pyarrow.binary() + assert array.to_pylist() == [None, wkb.dumps(series[1])] + + # All na: + series = geopandas.GeoSeries([None, None]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + assert array.type == pyarrow.string() + assert array.to_pylist() == list(series) + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_array_w_geography_type_shapely_data(module_under_test): + from shapely import wkb, wkt + + bq_field = schema.SchemaField("field_name", "GEOGRAPHY") + + series = pandas.Series([None, wkt.loads("point(0 0)")]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + # The result is binary, because we use wkb 
format + assert array.type == pyarrow.binary() + assert array.to_pylist() == [None, wkb.dumps(series[1])] + + # All na: + series = pandas.Series([None, None]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + assert array.type == pyarrow.string() + assert array.to_pylist() == list(series) + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): + from shapely import wkb, wkt + + bq_field = schema.SchemaField("field_name", "GEOGRAPHY") + + series = pandas.Series([None, wkb.dumps(wkt.loads("point(0 0)"))]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + # The result is binary, because we use wkb format + assert array.type == pyarrow.binary() + assert array.to_pylist() == list(series) + + @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_schema_w_unknown_type(module_under_test): fields = ( @@ -553,12 +656,9 @@ def test_bq_to_arrow_schema_w_unknown_type(module_under_test): actual = module_under_test.bq_to_arrow_schema(fields) assert actual is None - if six.PY3: - assert len(warned) == 1 - warning = warned[0] - assert "field3" in str(warning) - else: - assert len(warned) == 0 + assert len(warned) == 1 + warning = warned[0] + assert "field3" in str(warning) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -690,6 +790,37 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name( assert columns_and_indexes == expected +@pytest.mark.skipif( + pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIUM_VERSION, + reason="Requires `pandas version >= 1.0.0` which introduces pandas.NA", +) +def test_dataframe_to_json_generator(module_under_test): + utcnow = datetime.datetime.utcnow() + df_data = collections.OrderedDict( + [ + ("a_series", [pandas.NA, 2, 3, 4]), + ("b_series", [0.1, 
float("NaN"), 0.3, 0.4]), + ("c_series", ["a", "b", pandas.NA, "d"]), + ("d_series", [utcnow, utcnow, utcnow, pandas.NaT]), + ("e_series", [True, False, True, None]), + ] + ) + dataframe = pandas.DataFrame( + df_data, index=pandas.Index([4, 5, 6, 7], name="a_index") + ) + + dataframe = dataframe.astype({"a_series": pandas.Int64Dtype()}) + + rows = module_under_test.dataframe_to_json_generator(dataframe) + expected = [ + {"b_series": 0.1, "c_series": "a", "d_series": utcnow, "e_series": True}, + {"a_series": 2, "c_series": "b", "d_series": utcnow, "e_series": False}, + {"a_series": 3, "b_series": 0.3, "d_series": utcnow, "e_series": True}, + {"a_series": 4, "b_series": 0.4, "c_series": "d"}, + ] + assert list(rows) == expected + + @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_list_columns_and_indexes_with_named_index(module_under_test): df_data = collections.OrderedDict( @@ -773,26 +904,6 @@ def test_dataframe_to_bq_schema_dict_sequence(module_under_test): assert returned_schema == expected_schema -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -@pytest.mark.skipif(not six.PY2, reason="Requires Python 2.7") -def test_dataframe_to_bq_schema_w_struct_raises_py27(module_under_test): - dataframe = pandas.DataFrame( - data=[{"struct_field": {"int_col": 1}}, {"struct_field": {"int_col": 2}}] - ) - bq_schema = [ - schema.SchemaField( - "struct_field", - field_type="STRUCT", - fields=[schema.SchemaField("int_col", field_type="INT64")], - ), - ] - - with pytest.raises(ValueError) as excinfo: - module_under_test.dataframe_to_bq_schema(dataframe, bq_schema=bq_schema) - - assert "struct (record) column types is not supported" in str(excinfo.value) - - @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_arrow_with_multiindex(module_under_test): @@ -871,42 +982,44 @@ def 
test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field05", "FLOAT", mode="REQUIRED"), schema.SchemaField("field06", "FLOAT64", mode="REQUIRED"), schema.SchemaField("field07", "NUMERIC", mode="REQUIRED"), - schema.SchemaField("field08", "BOOLEAN", mode="REQUIRED"), - schema.SchemaField("field09", "BOOL", mode="REQUIRED"), - schema.SchemaField("field10", "TIMESTAMP", mode="REQUIRED"), - schema.SchemaField("field11", "DATE", mode="REQUIRED"), - schema.SchemaField("field12", "TIME", mode="REQUIRED"), - schema.SchemaField("field13", "DATETIME", mode="REQUIRED"), - schema.SchemaField("field14", "GEOGRAPHY", mode="REQUIRED"), - ) - dataframe = pandas.DataFrame( - { - "field01": ["hello", "world"], - "field02": [b"abd", b"efg"], - "field03": [1, 2], - "field04": [3, 4], - "field05": [1.25, 9.75], - "field06": [-1.75, -3.5], - "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")], - "field08": [True, False], - "field09": [False, True], - "field10": [ - datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc), - datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=pytz.utc), - ], - "field11": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)], - "field12": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)], - "field13": [ - datetime.datetime(1970, 1, 1, 0, 0, 0), - datetime.datetime(2012, 12, 21, 9, 7, 42), - ], - "field14": [ - "POINT(30 10)", - "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", - ], - } + schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"), + schema.SchemaField("field09", "BOOLEAN", mode="REQUIRED"), + schema.SchemaField("field10", "BOOL", mode="REQUIRED"), + schema.SchemaField("field11", "TIMESTAMP", mode="REQUIRED"), + schema.SchemaField("field12", "DATE", mode="REQUIRED"), + schema.SchemaField("field13", "TIME", mode="REQUIRED"), + schema.SchemaField("field14", "DATETIME", mode="REQUIRED"), + schema.SchemaField("field15", "GEOGRAPHY", mode="REQUIRED"), ) + data = { + "field01": ["hello", 
"world"], + "field02": [b"abd", b"efg"], + "field03": [1, 2], + "field04": [3, 4], + "field05": [1.25, 9.75], + "field06": [-1.75, -3.5], + "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")], + "field08": [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + "field09": [True, False], + "field10": [False, True], + "field11": [ + datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=datetime.timezone.utc), + ], + "field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)], + "field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)], + "field14": [ + datetime.datetime(1970, 1, 1, 0, 0, 0), + datetime.datetime(2012, 12, 21, 9, 7, 42), + ], + "field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], + } + dataframe = pandas.DataFrame(data) + arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema) arrow_schema = arrow_table.schema @@ -1104,6 +1217,28 @@ def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test): assert "struct_field" in str(expected_warnings[0]) +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +def test_dataframe_to_bq_schema_geography(module_under_test): + from shapely import wkt + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) + bq_schema = module_under_test.dataframe_to_bq_schema(df, []) + assert bq_schema == ( + schema.SchemaField("name", "STRING"), + schema.SchemaField("geo1", "GEOGRAPHY"), + schema.SchemaField("geo2", "GEOGRAPHY"), + ) + + @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_augment_schema_type_detection_succeeds(module_under_test): @@ -1119,6 +1254,7 @@ def 
test_augment_schema_type_detection_succeeds(module_under_test): "bytes_field": b"some bytes", "string_field": u"some characters", "numeric_field": decimal.Decimal("123.456"), + "bignumeric_field": decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), } ] ) @@ -1138,6 +1274,7 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"), schema.SchemaField("string_field", field_type=None, mode="NULLABLE"), schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), + schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), ) with warnings.catch_warnings(record=True) as warned: @@ -1160,7 +1297,11 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("bytes_field", field_type="BYTES", mode="NULLABLE"), schema.SchemaField("string_field", field_type="STRING", mode="NULLABLE"), schema.SchemaField("numeric_field", field_type="NUMERIC", mode="NULLABLE"), + schema.SchemaField( + "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" + ), ) + by_name = operator.attrgetter("name") assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) @@ -1231,8 +1372,134 @@ def test_dataframe_to_parquet_dict_sequence_schema(module_under_test): assert schema_arg == expected_schema_arg +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) +def test__download_table_bqstorage_stream_includes_read_session( + monkeypatch, module_under_test +): + import google.cloud.bigquery_storage_v1.reader + import google.cloud.bigquery_storage_v1.types + + monkeypatch.setattr(_helpers.BQ_STORAGE_VERSIONS, "_installed_version", None) + monkeypatch.setattr(bigquery_storage, "__version__", "2.5.0") + bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + reader = mock.create_autospec( + google.cloud.bigquery_storage_v1.reader.ReadRowsStream, 
instance=True + ) + bqstorage_client.read_rows.return_value = reader + session = google.cloud.bigquery_storage_v1.types.ReadSession() + + module_under_test._download_table_bqstorage_stream( + module_under_test._DownloadState(), + bqstorage_client, + session, + google.cloud.bigquery_storage_v1.types.ReadStream(name="test"), + queue.Queue(), + mock.Mock(), + ) + + reader.rows.assert_called_once_with(session) + + +@pytest.mark.skipif( + bigquery_storage is None + or not _helpers.BQ_STORAGE_VERSIONS.is_read_session_optional, + reason="Requires `google-cloud-bigquery-storage` >= 2.6.0", +) +def test__download_table_bqstorage_stream_omits_read_session( + monkeypatch, module_under_test +): + import google.cloud.bigquery_storage_v1.reader + import google.cloud.bigquery_storage_v1.types + + monkeypatch.setattr(_helpers.BQ_STORAGE_VERSIONS, "_installed_version", None) + monkeypatch.setattr(bigquery_storage, "__version__", "2.6.0") + bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + reader = mock.create_autospec( + google.cloud.bigquery_storage_v1.reader.ReadRowsStream, instance=True + ) + bqstorage_client.read_rows.return_value = reader + session = google.cloud.bigquery_storage_v1.types.ReadSession() + + module_under_test._download_table_bqstorage_stream( + module_under_test._DownloadState(), + bqstorage_client, + session, + google.cloud.bigquery_storage_v1.types.ReadStream(name="test"), + queue.Queue(), + mock.Mock(), + ) + + reader.rows.assert_called_once_with() + + +@pytest.mark.parametrize( + "stream_count,maxsize_kwarg,expected_call_count,expected_maxsize", + [ + (3, {"max_queue_size": 2}, 3, 2), # custom queue size + (4, {}, 4, 4), # default queue size + (7, {"max_queue_size": None}, 7, 0), # infinite queue size + ], +) +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) +def test__download_table_bqstorage( + module_under_test, + stream_count, + maxsize_kwarg, + 
expected_call_count, + expected_maxsize, +): + from google.cloud.bigquery import dataset + from google.cloud.bigquery import table + + queue_used = None # A reference to the queue used by code under test. + + bqstorage_client = mock.create_autospec( + bigquery_storage.BigQueryReadClient, instance=True + ) + fake_session = mock.Mock(streams=["stream/s{i}" for i in range(stream_count)]) + bqstorage_client.create_read_session.return_value = fake_session + + table_ref = table.TableReference( + dataset.DatasetReference("project-x", "dataset-y"), "table-z", + ) + + def fake_download_stream( + download_state, bqstorage_client, session, stream, worker_queue, page_to_item + ): + nonlocal queue_used + queue_used = worker_queue + try: + worker_queue.put_nowait("result_page") + except queue.Full: # pragma: NO COVER + pass + + download_stream = mock.Mock(side_effect=fake_download_stream) + + with mock.patch.object( + module_under_test, "_download_table_bqstorage_stream", new=download_stream + ): + result_gen = module_under_test._download_table_bqstorage( + "some-project", table_ref, bqstorage_client, **maxsize_kwarg + ) + list(result_gen) + + # Timing-safe, as the method under test should block until the pool shutdown is + # complete, at which point all download stream workers have already been submitted + # to the thread pool. 
+ assert download_stream.call_count == stream_count # once for each stream + assert queue_used.maxsize == expected_maxsize + + @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") -def test_download_arrow_tabledata_list_unknown_field_type(module_under_test): +def test_download_arrow_row_iterator_unknown_field_type(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), items=[{"page_data": "foo"}], @@ -1246,7 +1513,7 @@ def test_download_arrow_tabledata_list_unknown_field_type(module_under_test): schema.SchemaField("alien_field", "ALIEN_FLOAT_TYPE"), ] - results_gen = module_under_test.download_arrow_tabledata_list(pages, bq_schema) + results_gen = module_under_test.download_arrow_row_iterator(pages, bq_schema) with warnings.catch_warnings(record=True) as warned: result = next(results_gen) @@ -1268,7 +1535,7 @@ def test_download_arrow_tabledata_list_unknown_field_type(module_under_test): @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") -def test_download_arrow_tabledata_list_known_field_type(module_under_test): +def test_download_arrow_row_iterator_known_field_type(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), items=[{"page_data": "foo"}], @@ -1282,7 +1549,7 @@ def test_download_arrow_tabledata_list_known_field_type(module_under_test): schema.SchemaField("non_alien_field", "STRING"), ] - results_gen = module_under_test.download_arrow_tabledata_list(pages, bq_schema) + results_gen = module_under_test.download_arrow_row_iterator(pages, bq_schema) with warnings.catch_warnings(record=True) as warned: result = next(results_gen) @@ -1303,7 +1570,7 @@ def test_download_arrow_tabledata_list_known_field_type(module_under_test): @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") -def test_download_arrow_tabledata_list_dict_sequence_schema(module_under_test): +def 
test_download_arrow_row_iterator_dict_sequence_schema(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), items=[{"page_data": "foo"}], @@ -1317,7 +1584,7 @@ def test_download_arrow_tabledata_list_dict_sequence_schema(module_under_test): {"name": "non_alien_field", "type": "STRING", "mode": "NULLABLE"}, ] - results_gen = module_under_test.download_arrow_tabledata_list(pages, dict_schema) + results_gen = module_under_test.download_arrow_row_iterator(pages, dict_schema) result = next(results_gen) assert len(result.columns) == 2 @@ -1331,7 +1598,7 @@ def test_download_arrow_tabledata_list_dict_sequence_schema(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") -def test_download_dataframe_tabledata_list_dict_sequence_schema(module_under_test): +def test_download_dataframe_row_iterator_dict_sequence_schema(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), items=[{"page_data": "foo"}], @@ -1345,7 +1612,7 @@ def test_download_dataframe_tabledata_list_dict_sequence_schema(module_under_tes {"name": "non_alien_field", "type": "STRING", "mode": "NULLABLE"}, ] - results_gen = module_under_test.download_dataframe_tabledata_list( + results_gen = module_under_test.download_dataframe_row_iterator( pages, dict_schema, dtypes={} ) result = next(results_gen) @@ -1359,3 +1626,31 @@ def test_download_dataframe_tabledata_list_dict_sequence_schema(module_under_tes ) ) assert result.equals(expected_result) + + with pytest.raises(StopIteration): + result = next(results_gen) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_table_data_listpage_to_dataframe_skips_stop_iteration(module_under_test): + dataframe = module_under_test._row_iterator_page_to_dataframe([], [], {}) + assert isinstance(dataframe, pandas.DataFrame) + + +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires 
`pyarrow`") +def test_bq_to_arrow_field_type_override(module_under_test): + # When loading pandas data, we may need to override the type + # decision based on data contents, because GEOGRAPHY data can be + # stored as either text or binary. + + assert ( + module_under_test.bq_to_arrow_field(schema.SchemaField("g", "GEOGRAPHY")).type + == pyarrow.string() + ) + + assert ( + module_under_test.bq_to_arrow_field( + schema.SchemaField("g", "GEOGRAPHY"), pyarrow.binary(), + ).type + == pyarrow.binary() + ) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index c4c604ed0..e9204f1de 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -18,24 +18,20 @@ import decimal import email import gzip +import http.client import io +import itertools import json import operator import unittest import warnings import mock +import packaging import requests -import six -from six.moves import http_client import pytest -import pytz import pkg_resources -try: - import fastparquet -except (ImportError, AttributeError): # pragma: NO COVER - fastparquet = None try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -48,7 +44,7 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( InMemorySpanExporter, ) -except (ImportError, AttributeError): +except (ImportError, AttributeError): # pragma: NO COVER opentelemetry = None try: import pyarrow @@ -60,16 +56,22 @@ import google.cloud._helpers from google.cloud import bigquery_v2 from google.cloud.bigquery.dataset import DatasetReference +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT try: - from google.cloud import bigquery_storage_v1 + from google.cloud import bigquery_storage except (ImportError, AttributeError): # pragma: NO COVER - bigquery_storage_v1 = None + bigquery_storage = None from test_utils.imports import maybe_fail_import from tests.unit.helpers import make_connection PANDAS_MINIUM_VERSION = pkg_resources.parse_version("1.0.0") 
-PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version + +if pandas is not None: + PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version +else: + # Set to less than MIN version. + PANDAS_INSTALLED_VERSION = pkg_resources.parse_version("0.0.0") def _make_credentials(): @@ -315,7 +317,7 @@ def test__get_query_results_miss_w_explicit_project_and_timeout(self): project="other-project", location=self.LOCATION, timeout_ms=500, - timeout=42, + timeout=420, ) final_attributes.assert_called_once_with({"path": path}, client, None) @@ -324,7 +326,32 @@ def test__get_query_results_miss_w_explicit_project_and_timeout(self): method="GET", path=path, query_params={"maxResults": 0, "timeoutMs": 500, "location": self.LOCATION}, - timeout=42, + timeout=420, + ) + + def test__get_query_results_miss_w_short_timeout(self): + import google.cloud.bigquery.client + from google.cloud.exceptions import NotFound + + creds = _make_credentials() + client = self._make_one(self.PROJECT, creds) + conn = client._connection = make_connection() + path = "/projects/other-project/queries/nothere" + with self.assertRaises(NotFound): + client._get_query_results( + "nothere", + None, + project="other-project", + location=self.LOCATION, + timeout_ms=500, + timeout=1, + ) + + conn.api_request.assert_called_once_with( + method="GET", + path=path, + query_params={"maxResults": 0, "timeoutMs": 500, "location": self.LOCATION}, + timeout=google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT, ) def test__get_query_results_miss_w_client_location(self): @@ -341,7 +368,7 @@ def test__get_query_results_miss_w_client_location(self): method="GET", path="/projects/PROJECT/queries/nothere", query_params={"maxResults": 0, "location": self.LOCATION}, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test__get_query_results_hit(self): @@ -402,7 +429,9 @@ def test_get_service_account_email_w_alternate_project(self): service_account_email = 
client.get_service_account_email(project=project) final_attributes.assert_called_once_with({"path": path}, client, None) - conn.api_request.assert_called_once_with(method="GET", path=path, timeout=None) + conn.api_request.assert_called_once_with( + method="GET", path=path, timeout=DEFAULT_TIMEOUT + ) self.assertEqual(service_account_email, email) def test_get_service_account_email_w_custom_retry(self): @@ -445,221 +474,6 @@ def test_get_service_account_email_w_custom_retry(self): ], ) - def test_list_projects_defaults(self): - from google.cloud.bigquery.client import Project - - PROJECT_1 = "PROJECT_ONE" - PROJECT_2 = "PROJECT_TWO" - TOKEN = "TOKEN" - DATA = { - "nextPageToken": TOKEN, - "projects": [ - { - "kind": "bigquery#project", - "id": PROJECT_1, - "numericId": 1, - "projectReference": {"projectId": PROJECT_1}, - "friendlyName": "One", - }, - { - "kind": "bigquery#project", - "id": PROJECT_2, - "numericId": 2, - "projectReference": {"projectId": PROJECT_2}, - "friendlyName": "Two", - }, - ], - } - creds = _make_credentials() - client = self._make_one(PROJECT_1, creds) - conn = client._connection = make_connection(DATA) - iterator = client.list_projects() - - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/projects"}, client, None) - projects = list(page) - token = iterator.next_page_token - - self.assertEqual(len(projects), len(DATA["projects"])) - for found, expected in zip(projects, DATA["projects"]): - self.assertIsInstance(found, Project) - self.assertEqual(found.project_id, expected["id"]) - self.assertEqual(found.numeric_id, expected["numericId"]) - self.assertEqual(found.friendly_name, expected["friendlyName"]) - self.assertEqual(token, TOKEN) - - conn.api_request.assert_called_once_with( - method="GET", path="/projects", query_params={}, timeout=None - ) - - def 
test_list_projects_w_timeout(self): - PROJECT_1 = "PROJECT_ONE" - TOKEN = "TOKEN" - DATA = { - "nextPageToken": TOKEN, - "projects": [], - } - creds = _make_credentials() - client = self._make_one(PROJECT_1, creds) - conn = client._connection = make_connection(DATA) - - iterator = client.list_projects(timeout=7.5) - - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/projects"}, client, None) - - conn.api_request.assert_called_once_with( - method="GET", path="/projects", query_params={}, timeout=7.5 - ) - - def test_list_projects_explicit_response_missing_projects_key(self): - TOKEN = "TOKEN" - DATA = {} - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(DATA) - - iterator = client.list_projects(max_results=3, page_token=TOKEN) - - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/projects"}, client, None) - projects = list(page) - token = iterator.next_page_token - - self.assertEqual(len(projects), 0) - self.assertIsNone(token) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects", - query_params={"maxResults": 3, "pageToken": TOKEN}, - timeout=None, - ) - - def test_list_datasets_defaults(self): - from google.cloud.bigquery.dataset import DatasetListItem - - DATASET_1 = "dataset_one" - DATASET_2 = "dataset_two" - PATH = "projects/%s/datasets" % self.PROJECT - TOKEN = "TOKEN" - DATA = { - "nextPageToken": TOKEN, - "datasets": [ - { - "kind": "bigquery#dataset", - "id": "%s:%s" % (self.PROJECT, DATASET_1), - "datasetReference": { - "datasetId": DATASET_1, - "projectId": self.PROJECT, - }, - "friendlyName": None, - }, - { - "kind": "bigquery#dataset", - "id": "%s:%s" 
% (self.PROJECT, DATASET_2), - "datasetReference": { - "datasetId": DATASET_2, - "projectId": self.PROJECT, - }, - "friendlyName": "Two", - }, - ], - } - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(DATA) - - iterator = client.list_datasets() - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - datasets = list(page) - token = iterator.next_page_token - - self.assertEqual(len(datasets), len(DATA["datasets"])) - for found, expected in zip(datasets, DATA["datasets"]): - self.assertIsInstance(found, DatasetListItem) - self.assertEqual(found.full_dataset_id, expected["id"]) - self.assertEqual(found.friendly_name, expected["friendlyName"]) - self.assertEqual(token, TOKEN) - - conn.api_request.assert_called_once_with( - method="GET", path="/%s" % PATH, query_params={}, timeout=None - ) - - def test_list_datasets_w_project_and_timeout(self): - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection({}) - - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - list(client.list_datasets(project="other-project", timeout=7.5)) - - final_attributes.assert_called_once_with( - {"path": "/projects/other-project/datasets"}, client, None - ) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/other-project/datasets", - query_params={}, - timeout=7.5, - ) - - def test_list_datasets_explicit_response_missing_datasets_key(self): - PATH = "projects/%s/datasets" % self.PROJECT - TOKEN = "TOKEN" - FILTER = "FILTER" - DATA = {} - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(DATA) - - iterator = 
client.list_datasets( - include_all=True, filter=FILTER, max_results=3, page_token=TOKEN - ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - datasets = list(page) - token = iterator.next_page_token - - self.assertEqual(len(datasets), 0) - self.assertIsNone(token) - - conn.api_request.assert_called_once_with( - method="GET", - path="/%s" % PATH, - query_params={ - "all": True, - "filter": FILTER, - "maxResults": 3, - "pageToken": TOKEN, - }, - timeout=None, - ) - def test_dataset_with_specified_project(self): from google.cloud.bigquery.dataset import DatasetReference @@ -794,37 +608,44 @@ def test_get_dataset(self): self.assertEqual(dataset.dataset_id, self.DS_ID) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) - def test_create_bqstorage_client(self): - mock_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + def test_ensure_bqstorage_client_creating_new_instance(self): + mock_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) mock_client_instance = object() mock_client.return_value = mock_client_instance creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) with mock.patch( - "google.cloud.bigquery_storage_v1.BigQueryReadClient", mock_client + "google.cloud.bigquery_storage.BigQueryReadClient", mock_client ): - bqstorage_client = client._create_bqstorage_client() + bqstorage_client = client._ensure_bqstorage_client( + client_options=mock.sentinel.client_options, + client_info=mock.sentinel.client_info, + ) self.assertIs(bqstorage_client, mock_client_instance) - mock_client.assert_called_once_with(credentials=creds) + mock_client.assert_called_once_with( + credentials=creds, + 
client_options=mock.sentinel.client_options, + client_info=mock.sentinel.client_info, + ) - def test_create_bqstorage_client_missing_dependency(self): + def test_ensure_bqstorage_client_missing_dependency(self): creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) def fail_bqstorage_import(name, globals, locals, fromlist, level): # NOTE: *very* simplified, assuming a straightforward absolute import - return "bigquery_storage_v1" in name or ( - fromlist is not None and "bigquery_storage_v1" in fromlist + return "bigquery_storage" in name or ( + fromlist is not None and "bigquery_storage" in fromlist ) no_bqstorage = maybe_fail_import(predicate=fail_bqstorage_import) with no_bqstorage, warnings.catch_warnings(record=True) as warned: - bqstorage_client = client._create_bqstorage_client() + bqstorage_client = client._ensure_bqstorage_client() self.assertIsNone(bqstorage_client) matching_warnings = [ @@ -835,477 +656,103 @@ def fail_bqstorage_import(name, globals, locals, fromlist, level): ] assert matching_warnings, "Missing dependency warning not raised." 
- def test_create_dataset_minimal(self): - from google.cloud.bigquery.dataset import Dataset + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test_ensure_bqstorage_client_obsolete_dependency(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - PATH = "projects/%s/datasets" % self.PROJECT - RESOURCE = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - } creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(RESOURCE) - - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - before = Dataset(ds_ref) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - after = client.create_dataset(before, timeout=7.5) - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - - self.assertEqual(after.dataset_id, self.DS_ID) - self.assertEqual(after.project, self.PROJECT) - self.assertEqual(after.etag, RESOURCE["etag"]) - self.assertEqual(after.full_dataset_id, RESOURCE["id"]) - - conn.api_request.assert_called_once_with( - method="POST", - path="/%s" % PATH, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - }, - timeout=7.5, + patcher = mock.patch( + "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), ) + with patcher, warnings.catch_warnings(record=True) as warned: + bqstorage_client = client._ensure_bqstorage_client() - def test_create_dataset_w_attrs(self): - from google.cloud.bigquery.dataset import Dataset, AccessEntry + self.assertIsNone(bqstorage_client) + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete 
dependency warning not raised." - PATH = "projects/%s/datasets" % self.PROJECT - DESCRIPTION = "DESC" - FRIENDLY_NAME = "FN" - LOCATION = "US" - USER_EMAIL = "phred@example.com" - LABELS = {"color": "red"} - VIEW = { - "projectId": "my-proj", - "datasetId": "starry-skies", - "tableId": "northern-hemisphere", - } - RESOURCE = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "description": DESCRIPTION, - "friendlyName": FRIENDLY_NAME, - "location": LOCATION, - "defaultTableExpirationMs": "3600", - "labels": LABELS, - "access": [{"role": "OWNER", "userByEmail": USER_EMAIL}, {"view": VIEW}], - } + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test_ensure_bqstorage_client_existing_client_check_passes(self): creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(RESOURCE) - entries = [ - AccessEntry("OWNER", "userByEmail", USER_EMAIL), - AccessEntry(None, "view", VIEW), - ] + mock_storage_client = mock.sentinel.mock_storage_client - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - before = Dataset(ds_ref) - before.access_entries = entries - before.description = DESCRIPTION - before.friendly_name = FRIENDLY_NAME - before.default_table_expiration_ms = 3600 - before.location = LOCATION - before.labels = LABELS - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - after = client.create_dataset(before) + bqstorage_client = client._ensure_bqstorage_client( + bqstorage_client=mock_storage_client + ) - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) + self.assertIs(bqstorage_client, mock_storage_client) - self.assertEqual(after.dataset_id, self.DS_ID) - self.assertEqual(after.project, self.PROJECT) - self.assertEqual(after.etag, RESOURCE["etag"]) - 
self.assertEqual(after.full_dataset_id, RESOURCE["id"]) - self.assertEqual(after.description, DESCRIPTION) - self.assertEqual(after.friendly_name, FRIENDLY_NAME) - self.assertEqual(after.location, LOCATION) - self.assertEqual(after.default_table_expiration_ms, 3600) - self.assertEqual(after.labels, LABELS) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test_ensure_bqstorage_client_existing_client_check_fails(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - conn.api_request.assert_called_once_with( - method="POST", - path="/%s" % PATH, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "description": DESCRIPTION, - "friendlyName": FRIENDLY_NAME, - "location": LOCATION, - "defaultTableExpirationMs": "3600", - "access": [ - {"role": "OWNER", "userByEmail": USER_EMAIL}, - {"view": VIEW}, - ], - "labels": LABELS, - }, - timeout=None, + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + mock_storage_client = mock.sentinel.mock_storage_client + + patcher = mock.patch( + "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), ) + with patcher, warnings.catch_warnings(record=True) as warned: + bqstorage_client = client._ensure_bqstorage_client(mock_storage_client) - def test_create_dataset_w_custom_property(self): - # The library should handle sending properties to the API that are not - # yet part of the library - from google.cloud.bigquery.dataset import Dataset + self.assertIsNone(bqstorage_client) + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." 
+ + def test_create_routine_w_minimal_resource(self): + from google.cloud.bigquery.routine import Routine + from google.cloud.bigquery.routine import RoutineReference - path = "/projects/%s/datasets" % self.PROJECT + creds = _make_credentials() + path = "/projects/test-routine-project/datasets/test_routines/routines" resource = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "newAlphaProperty": "unreleased property", + "routineReference": { + "projectId": "test-routine-project", + "datasetId": "test_routines", + "routineId": "minimal_routine", + } } - creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) conn = client._connection = make_connection(resource) - - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - before = Dataset(ds_ref) - before._properties["newAlphaProperty"] = "unreleased property" + full_routine_id = "test-routine-project.test_routines.minimal_routine" + routine = Routine(full_routine_id) with mock.patch( "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" ) as final_attributes: - after = client.create_dataset(before) + actual_routine = client.create_routine(routine, timeout=7.5) final_attributes.assert_called_once_with({"path": path}, client, None) - self.assertEqual(after.dataset_id, self.DS_ID) - self.assertEqual(after.project, self.PROJECT) - self.assertEqual(after._properties["newAlphaProperty"], "unreleased property") - conn.api_request.assert_called_once_with( - method="POST", - path=path, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "newAlphaProperty": "unreleased property", - "labels": {}, - }, - timeout=None, + method="POST", path=path, data=resource, timeout=7.5, + ) + self.assertEqual( + actual_routine.reference, RoutineReference.from_string(full_routine_id) ) - def test_create_dataset_w_client_location_wo_dataset_location(self): - from google.cloud.bigquery.dataset import Dataset + def 
test_create_routine_w_conflict(self): + from google.cloud.bigquery.routine import Routine - PATH = "projects/%s/datasets" % self.PROJECT - RESOURCE = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "location": self.LOCATION, - } creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - conn = client._connection = make_connection(RESOURCE) - - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - before = Dataset(ds_ref) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - after = client.create_dataset(before) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - - self.assertEqual(after.dataset_id, self.DS_ID) - self.assertEqual(after.project, self.PROJECT) - self.assertEqual(after.etag, RESOURCE["etag"]) - self.assertEqual(after.full_dataset_id, RESOURCE["id"]) - self.assertEqual(after.location, self.LOCATION) - - conn.api_request.assert_called_once_with( - method="POST", - path="/%s" % PATH, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - "location": self.LOCATION, - }, - timeout=None, - ) - - def test_create_dataset_w_client_location_w_dataset_location(self): - from google.cloud.bigquery.dataset import Dataset - - PATH = "projects/%s/datasets" % self.PROJECT - OTHER_LOCATION = "EU" - RESOURCE = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "location": OTHER_LOCATION, - } - creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - conn = client._connection = make_connection(RESOURCE) - - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - before = Dataset(ds_ref) - 
before.location = OTHER_LOCATION - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - after = client.create_dataset(before) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - - self.assertEqual(after.dataset_id, self.DS_ID) - self.assertEqual(after.project, self.PROJECT) - self.assertEqual(after.etag, RESOURCE["etag"]) - self.assertEqual(after.full_dataset_id, RESOURCE["id"]) - self.assertEqual(after.location, OTHER_LOCATION) - - conn.api_request.assert_called_once_with( - method="POST", - path="/%s" % PATH, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - "location": OTHER_LOCATION, - }, - timeout=None, - ) - - def test_create_dataset_w_reference(self): - path = "/projects/%s/datasets" % self.PROJECT - resource = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "location": self.LOCATION, - } - creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - conn = client._connection = make_connection(resource) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - dataset = client.create_dataset(DatasetReference(self.PROJECT, self.DS_ID)) - - final_attributes.assert_called_once_with({"path": path}, client, None) - - self.assertEqual(dataset.dataset_id, self.DS_ID) - self.assertEqual(dataset.project, self.PROJECT) - self.assertEqual(dataset.etag, resource["etag"]) - self.assertEqual(dataset.full_dataset_id, resource["id"]) - self.assertEqual(dataset.location, self.LOCATION) - - conn.api_request.assert_called_once_with( - method="POST", - path=path, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - "location": self.LOCATION, 
- }, - timeout=None, - ) - - def test_create_dataset_w_fully_qualified_string(self): - path = "/projects/%s/datasets" % self.PROJECT - resource = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "location": self.LOCATION, - } - creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - conn = client._connection = make_connection(resource) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - dataset = client.create_dataset("{}.{}".format(self.PROJECT, self.DS_ID)) - - final_attributes.assert_called_once_with({"path": path}, client, None) - - self.assertEqual(dataset.dataset_id, self.DS_ID) - self.assertEqual(dataset.project, self.PROJECT) - self.assertEqual(dataset.etag, resource["etag"]) - self.assertEqual(dataset.full_dataset_id, resource["id"]) - self.assertEqual(dataset.location, self.LOCATION) - - conn.api_request.assert_called_once_with( - method="POST", - path=path, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - "location": self.LOCATION, - }, - timeout=None, - ) - - def test_create_dataset_w_string(self): - path = "/projects/%s/datasets" % self.PROJECT - resource = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "location": self.LOCATION, - } - creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - conn = client._connection = make_connection(resource) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - dataset = client.create_dataset(self.DS_ID) - - final_attributes.assert_called_once_with({"path": path}, client, None) - - 
self.assertEqual(dataset.dataset_id, self.DS_ID) - self.assertEqual(dataset.project, self.PROJECT) - self.assertEqual(dataset.etag, resource["etag"]) - self.assertEqual(dataset.full_dataset_id, resource["id"]) - self.assertEqual(dataset.location, self.LOCATION) - - conn.api_request.assert_called_once_with( - method="POST", - path=path, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - "location": self.LOCATION, - }, - timeout=None, - ) - - def test_create_dataset_alreadyexists_w_exists_ok_false(self): - creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - client._connection = make_connection( - google.api_core.exceptions.AlreadyExists("dataset already exists") - ) - - with pytest.raises(google.api_core.exceptions.AlreadyExists): - client.create_dataset(self.DS_ID) - - def test_create_dataset_alreadyexists_w_exists_ok_true(self): - post_path = "/projects/{}/datasets".format(self.PROJECT) - get_path = "/projects/{}/datasets/{}".format(self.PROJECT, self.DS_ID) - resource = { - "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, - "etag": "etag", - "id": "{}:{}".format(self.PROJECT, self.DS_ID), - "location": self.LOCATION, - } - creds = _make_credentials() - client = self._make_one( - project=self.PROJECT, credentials=creds, location=self.LOCATION - ) - conn = client._connection = make_connection( - google.api_core.exceptions.AlreadyExists("dataset already exists"), resource - ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - dataset = client.create_dataset(self.DS_ID, exists_ok=True) - - final_attributes.assert_called_with({"path": get_path}, client, None) - - self.assertEqual(dataset.dataset_id, self.DS_ID) - self.assertEqual(dataset.project, self.PROJECT) - self.assertEqual(dataset.etag, resource["etag"]) - 
self.assertEqual(dataset.full_dataset_id, resource["id"]) - self.assertEqual(dataset.location, self.LOCATION) - - conn.api_request.assert_has_calls( - [ - mock.call( - method="POST", - path=post_path, - data={ - "datasetReference": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - }, - "labels": {}, - "location": self.LOCATION, - }, - timeout=None, - ), - mock.call(method="GET", path=get_path, timeout=None), - ] - ) - - def test_create_routine_w_minimal_resource(self): - from google.cloud.bigquery.routine import Routine - from google.cloud.bigquery.routine import RoutineReference - - creds = _make_credentials() - path = "/projects/test-routine-project/datasets/test_routines/routines" - resource = { - "routineReference": { - "projectId": "test-routine-project", - "datasetId": "test_routines", - "routineId": "minimal_routine", - } - } - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(resource) - full_routine_id = "test-routine-project.test_routines.minimal_routine" - routine = Routine(full_routine_id) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - actual_routine = client.create_routine(routine, timeout=7.5) - - final_attributes.assert_called_once_with({"path": path}, client, None) - - conn.api_request.assert_called_once_with( - method="POST", path=path, data=resource, timeout=7.5, - ) - self.assertEqual( - actual_routine.reference, RoutineReference.from_string(full_routine_id) - ) - - def test_create_routine_w_conflict(self): - from google.cloud.bigquery.routine import Routine - - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection( - google.api_core.exceptions.AlreadyExists("routine already exists") + client = self._make_one(project=self.PROJECT, credentials=creds) + conn = client._connection = make_connection( + 
google.api_core.exceptions.AlreadyExists("routine already exists") ) path = "/projects/test-routine-project/datasets/test_routines/routines" full_routine_id = "test-routine-project.test_routines.minimal_routine" @@ -1327,7 +774,7 @@ def test_create_routine_w_conflict(self): } } conn.api_request.assert_called_once_with( - method="POST", path=path, data=resource, timeout=None, + method="POST", path=path, data=resource, timeout=DEFAULT_TIMEOUT, ) @unittest.skipIf(opentelemetry is None, "Requires `opentelemetry`") @@ -1363,7 +810,7 @@ def test_span_status_is_set(self): } } conn.api_request.assert_called_once_with( - method="POST", path=path, data=resource, timeout=None, + method="POST", path=path, data=resource, timeout=DEFAULT_TIMEOUT, ) def test_create_routine_w_conflict_exists_ok(self): @@ -1399,11 +846,13 @@ def test_create_routine_w_conflict_exists_ok(self): self.assertEqual(actual_routine.routine_id, "minimal_routine") conn.api_request.assert_has_calls( [ - mock.call(method="POST", path=path, data=resource, timeout=None,), + mock.call( + method="POST", path=path, data=resource, timeout=DEFAULT_TIMEOUT, + ), mock.call( method="GET", path="/projects/test-routine-project/datasets/test_routines/routines/minimal_routine", - timeout=None, + timeout=DEFAULT_TIMEOUT, ), ] ) @@ -1479,7 +928,7 @@ def test_create_table_w_custom_property(self): "newAlphaProperty": "unreleased property", "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got._properties["newAlphaProperty"], "unreleased property") self.assertEqual(got.table_id, self.TABLE_ID) @@ -1520,7 +969,7 @@ def test_create_table_w_encryption_configuration(self): "labels": {}, "encryptionConfiguration": {"kmsKeyName": self.KMS_KEY_NAME}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got.table_id, self.TABLE_ID) @@ -1556,7 +1005,7 @@ def test_create_table_w_day_partition_and_expire(self): "timePartitioning": {"type": "DAY", "expirationMs": "100"}, "labels": {}, }, - 
timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(table.time_partitioning.type_, "DAY") self.assertEqual(table.time_partitioning.expiration_ms, 100) @@ -1579,13 +1028,13 @@ def test_create_table_w_schema_and_query(self): "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, ] }, @@ -1624,20 +1073,20 @@ def test_create_table_w_schema_and_query(self): "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, ] }, "view": {"query": query, "useLegacySql": False}, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got.table_id, self.TABLE_ID) self.assertEqual(got.project, self.PROJECT) @@ -1692,7 +1141,7 @@ def test_create_table_w_external(self): }, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got.table_id, self.TABLE_ID) self.assertEqual(got.project, self.PROJECT) @@ -1731,7 +1180,7 @@ def test_create_table_w_reference(self): }, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got.table_id, self.TABLE_ID) @@ -1765,7 +1214,7 @@ def test_create_table_w_fully_qualified_string(self): }, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got.table_id, self.TABLE_ID) @@ -1797,7 +1246,7 @@ def test_create_table_w_string(self): }, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(got.table_id, self.TABLE_ID) @@ -1832,7 +1281,7 @@ def test_create_table_alreadyexists_w_exists_ok_false(self): }, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_create_table_alreadyexists_w_exists_ok_true(self): @@ -1875,9 +1324,9 @@ def 
test_create_table_alreadyexists_w_exists_ok_true(self): }, "labels": {}, }, - timeout=None, + timeout=DEFAULT_TIMEOUT, ), - mock.call(method="GET", path=get_path, timeout=None), + mock.call(method="GET", path=get_path, timeout=DEFAULT_TIMEOUT), ] ) @@ -1950,7 +1399,7 @@ def test_get_model_w_string(self): final_attributes.assert_called_once_with({"path": "/%s" % path}, client, None) conn.api_request.assert_called_once_with( - method="GET", path="/%s" % path, timeout=None + method="GET", path="/%s" % path, timeout=DEFAULT_TIMEOUT ) self.assertEqual(got.model_id, self.MODEL_ID) @@ -2036,6 +1485,7 @@ def test_get_table_sets_user_agent(self): url=mock.ANY, method=mock.ANY, headers=mock.ANY, data=mock.ANY ) http.reset_mock() + http.is_mtls = False mock_response.status_code = 200 mock_response.json.return_value = self._make_table_resource() user_agent_override = client_info.ClientInfo(user_agent="my-application/1.2.3") @@ -2058,7 +1508,7 @@ def test_get_table_sets_user_agent(self): "User-Agent": expected_user_agent, }, data=mock.ANY, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertIn("my-application/1.2.3", expected_user_agent) @@ -2401,7 +1851,7 @@ def test_update_dataset_w_custom_property(self): data={"newAlphaProperty": "unreleased property"}, path=path, headers=None, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(dataset.dataset_id, self.DS_ID) @@ -2499,7 +1949,7 @@ def test_update_routine(self): RoutineArgument( name="x", data_type=bigquery_v2.types.StandardSqlDataType( - type_kind=bigquery_v2.enums.StandardSqlDataType.TypeKind.INT64 + type_kind=bigquery_v2.types.StandardSqlDataType.TypeKind.INT64 ), ) ] @@ -2575,12 +2025,14 @@ def test_update_table(self): "type": "STRING", "mode": "REQUIRED", "description": None, + "policyTags": {"names": []}, }, { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "description": "New field description", + "policyTags": {"names": []}, }, ] }, @@ -2591,8 +2043,10 @@ def 
test_update_table(self): } ) schema = [ - SchemaField("full_name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), + SchemaField("full_name", "STRING", mode="REQUIRED", description=None), + SchemaField( + "age", "INTEGER", mode="REQUIRED", description="New field description" + ), ] creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) @@ -2620,12 +2074,14 @@ def test_update_table(self): "type": "STRING", "mode": "REQUIRED", "description": None, + "policyTags": {"names": []}, }, { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "description": "New field description", + "policyTags": {"names": []}, }, ] }, @@ -2685,7 +2141,7 @@ def test_update_table_w_custom_property(self): path="/%s" % path, data={"newAlphaProperty": "unreleased property"}, headers=None, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual( updated_table._properties["newAlphaProperty"], "unreleased property" @@ -2720,7 +2176,7 @@ def test_update_table_only_use_legacy_sql(self): path="/%s" % path, data={"view": {"useLegacySql": True}}, headers=None, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(updated_table.view_use_legacy_sql, table.view_use_legacy_sql) @@ -2746,18 +2202,36 @@ def test_update_table_w_query(self): "type": "STRING", "mode": "REQUIRED", "description": None, + "policyTags": {"names": []}, }, { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "description": "this is a column", + "policyTags": {"names": []}, + }, + { + "name": "country", + "type": "STRING", + "mode": "NULLABLE", + "policyTags": {"names": []}, }, ] } schema = [ - SchemaField("full_name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), + SchemaField( + "full_name", + "STRING", + mode="REQUIRED", + # Explicitly unset the description. 
+ description=None, + ), + SchemaField( + "age", "INTEGER", mode="REQUIRED", description="this is a column" + ), + # Omit the description to not make updates to it. + SchemaField("country", "STRING"), ] resource = self._make_table_resource() resource.update( @@ -2800,7 +2274,7 @@ def test_update_table_w_query(self): "schema": schema_resource, }, headers=None, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_update_table_w_schema_None(self): @@ -2911,433 +2385,64 @@ def test_update_table_delete_property(self): self.assertEqual(req[1]["data"], sent) self.assertIsNone(table3.description) - def test_list_tables_empty_w_timeout(self): - path = "/projects/{}/datasets/{}/tables".format(self.PROJECT, self.DS_ID) + def test_delete_job_metadata_not_found(self): creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection({}) + client = self._make_one("client-proj", creds, location="client-loc") + conn = client._connection = make_connection( + google.api_core.exceptions.NotFound("job not found"), + google.api_core.exceptions.NotFound("job not found"), + ) - dataset = DatasetReference(self.PROJECT, self.DS_ID) - iterator = client.list_tables(dataset, timeout=7.5) - self.assertIs(iterator.dataset, dataset) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) + with self.assertRaises(google.api_core.exceptions.NotFound): + client.delete_job_metadata("my-job") - final_attributes.assert_called_once_with({"path": path}, client, None) - tables = list(page) - token = iterator.next_page_token + conn.api_request.reset_mock() + client.delete_job_metadata("my-job", not_found_ok=True) - self.assertEqual(tables, []) - self.assertIsNone(token) conn.api_request.assert_called_once_with( - method="GET", path=path, query_params={}, timeout=7.5 + method="DELETE", + path="/projects/client-proj/jobs/my-job/delete", 
+ query_params={"location": "client-loc"}, + timeout=DEFAULT_TIMEOUT, ) - def test_list_models_empty_w_timeout(self): - path = "/projects/{}/datasets/{}/models".format(self.PROJECT, self.DS_ID) + def test_delete_job_metadata_with_id(self): creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection({}) - - dataset_id = "{}.{}".format(self.PROJECT, self.DS_ID) - iterator = client.list_models(dataset_id, timeout=7.5) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": path}, client, None) - models = list(page) - token = iterator.next_page_token - - self.assertEqual(models, []) - self.assertIsNone(token) - conn.api_request.assert_called_once_with( - method="GET", path=path, query_params={}, timeout=7.5 - ) - - def test_list_models_defaults(self): - from google.cloud.bigquery.model import Model - - MODEL_1 = "model_one" - MODEL_2 = "model_two" - PATH = "projects/%s/datasets/%s/models" % (self.PROJECT, self.DS_ID) - TOKEN = "TOKEN" - DATA = { - "nextPageToken": TOKEN, - "models": [ - { - "modelReference": { - "modelId": MODEL_1, - "datasetId": self.DS_ID, - "projectId": self.PROJECT, - } - }, - { - "modelReference": { - "modelId": MODEL_2, - "datasetId": self.DS_ID, - "projectId": self.PROJECT, - } - }, - ], - } - - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(DATA) - dataset = DatasetReference(self.PROJECT, self.DS_ID) - - iterator = client.list_models(dataset) - self.assertIs(iterator.dataset, dataset) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - models 
= list(page) - token = iterator.next_page_token - - self.assertEqual(len(models), len(DATA["models"])) - for found, expected in zip(models, DATA["models"]): - self.assertIsInstance(found, Model) - self.assertEqual(found.model_id, expected["modelReference"]["modelId"]) - self.assertEqual(token, TOKEN) - - conn.api_request.assert_called_once_with( - method="GET", path="/%s" % PATH, query_params={}, timeout=None - ) - - def test_list_models_wrong_type(self): - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - with self.assertRaises(TypeError): - client.list_models(DatasetReference(self.PROJECT, self.DS_ID).model("foo")) - - def test_list_routines_empty_w_timeout(self): - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) + client = self._make_one(self.PROJECT, creds) conn = client._connection = make_connection({}) - iterator = client.list_routines("test-routines.test_routines", timeout=7.5) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with( - {"path": "/projects/test-routines/datasets/test_routines/routines"}, - client, - None, - ) - routines = list(page) - token = iterator.next_page_token - - self.assertEqual(routines, []) - self.assertIsNone(token) - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/test-routines/datasets/test_routines/routines", - query_params={}, - timeout=7.5, - ) - - def test_list_routines_defaults(self): - from google.cloud.bigquery.routine import Routine - - project_id = "test-routines" - dataset_id = "test_routines" - path = "/projects/test-routines/datasets/test_routines/routines" - routine_1 = "routine_one" - routine_2 = "routine_two" - token = "TOKEN" - resource = { - "nextPageToken": token, - "routines": [ - { - "routineReference": { - "routineId": routine_1, - "datasetId": 
dataset_id, - "projectId": project_id, - } - }, - { - "routineReference": { - "routineId": routine_2, - "datasetId": dataset_id, - "projectId": project_id, - } - }, - ], - } - - creds = _make_credentials() - client = self._make_one(project=project_id, credentials=creds) - conn = client._connection = make_connection(resource) - dataset = DatasetReference(client.project, dataset_id) - - iterator = client.list_routines(dataset) - self.assertIs(iterator.dataset, dataset) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": path}, client, None) - routines = list(page) - actual_token = iterator.next_page_token - - self.assertEqual(len(routines), len(resource["routines"])) - for found, expected in zip(routines, resource["routines"]): - self.assertIsInstance(found, Routine) - self.assertEqual( - found.routine_id, expected["routineReference"]["routineId"] - ) - self.assertEqual(actual_token, token) - - conn.api_request.assert_called_once_with( - method="GET", path=path, query_params={}, timeout=None - ) - - def test_list_routines_wrong_type(self): - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - with self.assertRaises(TypeError): - client.list_routines( - DatasetReference(self.PROJECT, self.DS_ID).table("foo") - ) - - def test_list_tables_defaults(self): - from google.cloud.bigquery.table import TableListItem - - TABLE_1 = "table_one" - TABLE_2 = "table_two" - PATH = "projects/%s/datasets/%s/tables" % (self.PROJECT, self.DS_ID) - TOKEN = "TOKEN" - DATA = { - "nextPageToken": TOKEN, - "tables": [ - { - "kind": "bigquery#table", - "id": "%s:%s.%s" % (self.PROJECT, self.DS_ID, TABLE_1), - "tableReference": { - "tableId": TABLE_1, - "datasetId": self.DS_ID, - "projectId": self.PROJECT, - }, - "type": "TABLE", - }, - { - "kind": "bigquery#table", - "id": "%s:%s.%s" % 
(self.PROJECT, self.DS_ID, TABLE_2), - "tableReference": { - "tableId": TABLE_2, - "datasetId": self.DS_ID, - "projectId": self.PROJECT, - }, - "type": "TABLE", - }, - ], - } - - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(DATA) - dataset = DatasetReference(self.PROJECT, self.DS_ID) - - iterator = client.list_tables(dataset) - self.assertIs(iterator.dataset, dataset) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - tables = list(page) - token = iterator.next_page_token - - self.assertEqual(len(tables), len(DATA["tables"])) - for found, expected in zip(tables, DATA["tables"]): - self.assertIsInstance(found, TableListItem) - self.assertEqual(found.full_table_id, expected["id"]) - self.assertEqual(found.table_type, expected["type"]) - self.assertEqual(token, TOKEN) + client.delete_job_metadata("my-job", project="param-proj", location="param-loc") conn.api_request.assert_called_once_with( - method="GET", path="/%s" % PATH, query_params={}, timeout=None + method="DELETE", + path="/projects/param-proj/jobs/my-job/delete", + query_params={"location": "param-loc"}, + timeout=DEFAULT_TIMEOUT, ) - def test_list_tables_explicit(self): - from google.cloud.bigquery.table import TableListItem + def test_delete_job_metadata_with_resource(self): + from google.cloud.bigquery.job import QueryJob - TABLE_1 = "table_one" - TABLE_2 = "table_two" - PATH = "projects/%s/datasets/%s/tables" % (self.PROJECT, self.DS_ID) - TOKEN = "TOKEN" - DATA = { - "tables": [ - { - "kind": "bigquery#dataset", - "id": "%s:%s.%s" % (self.PROJECT, self.DS_ID, TABLE_1), - "tableReference": { - "tableId": TABLE_1, - "datasetId": self.DS_ID, - "projectId": self.PROJECT, - }, - "type": "TABLE", - }, - { - "kind": 
"bigquery#dataset", - "id": "%s:%s.%s" % (self.PROJECT, self.DS_ID, TABLE_2), - "tableReference": { - "tableId": TABLE_2, - "datasetId": self.DS_ID, - "projectId": self.PROJECT, - }, - "type": "TABLE", - }, - ] + query_resource = { + "jobReference": { + "projectId": "job-based-proj", + "jobId": "query_job", + "location": "us-east1", + }, + "configuration": {"query": {}}, } - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(DATA) - dataset = DatasetReference(self.PROJECT, self.DS_ID) - - iterator = client.list_tables( - # Test with string for dataset ID. - self.DS_ID, - max_results=3, - page_token=TOKEN, - ) - self.assertEqual(iterator.dataset, dataset) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - tables = list(page) - token = iterator.next_page_token + client = self._make_one(self.PROJECT, creds) + conn = client._connection = make_connection(query_resource) + job_from_resource = QueryJob.from_api_repr(query_resource, client) - self.assertEqual(len(tables), len(DATA["tables"])) - for found, expected in zip(tables, DATA["tables"]): - self.assertIsInstance(found, TableListItem) - self.assertEqual(found.full_table_id, expected["id"]) - self.assertEqual(found.table_type, expected["type"]) - self.assertIsNone(token) + client.delete_job_metadata(job_from_resource) conn.api_request.assert_called_once_with( - method="GET", - path="/%s" % PATH, - query_params={"maxResults": 3, "pageToken": TOKEN}, - timeout=None, - ) - - def test_list_tables_wrong_type(self): - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - with self.assertRaises(TypeError): - client.list_tables(DatasetReference(self.PROJECT, self.DS_ID).table("foo")) - - def 
test_delete_dataset(self): - from google.cloud.bigquery.dataset import Dataset - from google.cloud.bigquery.dataset import DatasetReference - - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - datasets = (ds_ref, Dataset(ds_ref), "{}.{}".format(self.PROJECT, self.DS_ID)) - PATH = "projects/%s/datasets/%s" % (self.PROJECT, self.DS_ID) - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection(*([{}] * len(datasets))) - for arg in datasets: - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - client.delete_dataset(arg, timeout=7.5) - - final_attributes.assert_called_once_with( - {"path": "/%s" % PATH}, client, None - ) - - conn.api_request.assert_called_with( - method="DELETE", path="/%s" % PATH, query_params={}, timeout=7.5 - ) - - def test_delete_dataset_delete_contents(self): - from google.cloud.bigquery.dataset import Dataset - - PATH = "projects/%s/datasets/%s" % (self.PROJECT, self.DS_ID) - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - conn = client._connection = make_connection({}, {}) - ds_ref = DatasetReference(self.PROJECT, self.DS_ID) - for arg in (ds_ref, Dataset(ds_ref)): - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - client.delete_dataset(arg, delete_contents=True) - - final_attributes.assert_called_once_with( - {"path": "/%s" % PATH, "deleteContents": True}, client, None - ) - conn.api_request.assert_called_with( - method="DELETE", - path="/%s" % PATH, - query_params={"deleteContents": "true"}, - timeout=None, - ) - - def test_delete_dataset_wrong_type(self): - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - with self.assertRaises(TypeError): - client.delete_dataset( - DatasetReference(self.PROJECT, self.DS_ID).table("foo") - ) 
- - def test_delete_dataset_w_not_found_ok_false(self): - path = "/projects/{}/datasets/{}".format(self.PROJECT, self.DS_ID) - creds = _make_credentials() - http = object() - client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - conn = client._connection = make_connection( - google.api_core.exceptions.NotFound("dataset not found") - ) - - with self.assertRaises(google.api_core.exceptions.NotFound): - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - client.delete_dataset(self.DS_ID) - - final_attributes.assert_called_once_with({"path": path}, client, None) - - conn.api_request.assert_called_with( - method="DELETE", path=path, query_params={}, timeout=None - ) - - def test_delete_dataset_w_not_found_ok_true(self): - path = "/projects/{}/datasets/{}".format(self.PROJECT, self.DS_ID) - creds = _make_credentials() - http = object() - client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - conn = client._connection = make_connection( - google.api_core.exceptions.NotFound("dataset not found") - ) - - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - client.delete_dataset(self.DS_ID, not_found_ok=True) - - final_attributes.assert_called_once_with({"path": path}, client, None) - - conn.api_request.assert_called_with( - method="DELETE", path=path, query_params={}, timeout=None + method="DELETE", + path="/projects/job-based-proj/jobs/query_job/delete", + query_params={"location": "us-east1"}, + timeout=DEFAULT_TIMEOUT, ) def test_delete_model(self): @@ -3392,7 +2497,9 @@ def test_delete_model_w_not_found_ok_false(self): with self.assertRaises(google.api_core.exceptions.NotFound): client.delete_model("{}.{}".format(self.DS_ID, self.MODEL_ID)) - conn.api_request.assert_called_with(method="DELETE", path=path, timeout=None) + conn.api_request.assert_called_with( + method="DELETE", path=path, 
timeout=DEFAULT_TIMEOUT + ) def test_delete_model_w_not_found_ok_true(self): path = "/projects/{}/datasets/{}/models/{}".format( @@ -3413,7 +2520,9 @@ def test_delete_model_w_not_found_ok_true(self): final_attributes.assert_called_once_with({"path": path}, client, None) - conn.api_request.assert_called_with(method="DELETE", path=path, timeout=None) + conn.api_request.assert_called_with( + method="DELETE", path=path, timeout=DEFAULT_TIMEOUT + ) def test_delete_routine(self): from google.cloud.bigquery.routine import Routine @@ -3467,7 +2576,7 @@ def test_delete_routine_w_not_found_ok_false(self): final_attributes.assert_called_once_with({"path": path}, client, None) conn.api_request.assert_called_with( - method="DELETE", path=path, timeout=None, + method="DELETE", path=path, timeout=DEFAULT_TIMEOUT, ) def test_delete_routine_w_not_found_ok_true(self): @@ -3489,7 +2598,7 @@ def test_delete_routine_w_not_found_ok_true(self): final_attributes.assert_called_once_with({"path": path}, client, None) conn.api_request.assert_called_with( - method="DELETE", path=path, timeout=None, + method="DELETE", path=path, timeout=DEFAULT_TIMEOUT, ) def test_delete_table(self): @@ -3553,7 +2662,9 @@ def test_delete_table_w_not_found_ok_false(self): final_attributes.assert_called_once_with({"path": path}, client, None) - conn.api_request.assert_called_with(method="DELETE", path=path, timeout=None) + conn.api_request.assert_called_with( + method="DELETE", path=path, timeout=DEFAULT_TIMEOUT + ) def test_delete_table_w_not_found_ok_true(self): path = "/projects/{}/datasets/{}/tables/{}".format( @@ -3575,23 +2686,32 @@ def test_delete_table_w_not_found_ok_true(self): final_attributes.assert_called_once_with({"path": path}, client, None) - conn.api_request.assert_called_with(method="DELETE", path=path, timeout=None) + conn.api_request.assert_called_with( + method="DELETE", path=path, timeout=DEFAULT_TIMEOUT + ) + + def _create_job_helper(self, job_config): + from google.cloud.bigquery import 
_helpers - def _create_job_helper(self, job_config, client_method): creds = _make_credentials() http = object() client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - client._connection = make_connection() - rf1 = mock.Mock() - get_config_patch = mock.patch( - "google.cloud.bigquery.job._JobConfig.from_api_repr", return_value=rf1, - ) - load_patch = mock.patch(client_method, autospec=True) + RESOURCE = { + "jobReference": {"projectId": self.PROJECT, "jobId": mock.ANY}, + "configuration": job_config, + } + conn = client._connection = make_connection(RESOURCE) + client.create_job(job_config=job_config) + if "query" in job_config: + _helpers._del_sub_prop(job_config, ["query", "destinationTable"]) - with load_patch as client_method, get_config_patch: - client.create_job(job_config=job_config) - client_method.assert_called_once() + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/%s/jobs" % self.PROJECT, + data=RESOURCE, + timeout=DEFAULT_TIMEOUT, + ) def test_create_job_load_config(self): configuration = { @@ -3605,9 +2725,7 @@ def test_create_job_load_config(self): } } - self._create_job_helper( - configuration, "google.cloud.bigquery.client.Client.load_table_from_uri" - ) + self._create_job_helper(configuration) def test_create_job_copy_config(self): configuration = { @@ -3627,9 +2745,7 @@ def test_create_job_copy_config(self): } } - self._create_job_helper( - configuration, "google.cloud.bigquery.client.Client.copy_table", - ) + self._create_job_helper(configuration) def test_create_job_copy_config_w_single_source(self): configuration = { @@ -3647,9 +2763,7 @@ def test_create_job_copy_config_w_single_source(self): } } - self._create_job_helper( - configuration, "google.cloud.bigquery.client.Client.copy_table", - ) + self._create_job_helper(configuration) def test_create_job_extract_config(self): configuration = { @@ -3662,9 +2776,7 @@ def test_create_job_extract_config(self): "destinationUris": 
["gs://test_bucket/dst_object*"], } } - self._create_job_helper( - configuration, "google.cloud.bigquery.client.Client.extract_table", - ) + self._create_job_helper(configuration) def test_create_job_extract_config_for_model(self): configuration = { @@ -3677,17 +2789,17 @@ def test_create_job_extract_config_for_model(self): "destinationUris": ["gs://test_bucket/dst_object*"], } } - self._create_job_helper( - configuration, "google.cloud.bigquery.client.Client.extract_table", - ) + self._create_job_helper(configuration) def test_create_job_query_config(self): configuration = { - "query": {"query": "query", "destinationTable": {"tableId": "table_id"}} + "query": { + "query": "query", + "destinationTable": {"tableId": "table_id"}, + "useLegacySql": False, + } } - self._create_job_helper( - configuration, "google.cloud.bigquery.client.Client.query", - ) + self._create_job_helper(configuration) def test_create_job_query_config_w_rateLimitExceeded_error(self): from google.cloud.exceptions import Forbidden @@ -3747,7 +2859,7 @@ def test_create_job_query_config_w_rateLimitExceeded_error(self): method="POST", path="/projects/PROJECT/jobs", data=data_without_destination, - timeout=None, + timeout=DEFAULT_TIMEOUT, ), ) @@ -3778,449 +2890,183 @@ def test_get_job_miss_w_explict_project(self): JOB_ID = "NONESUCH" creds = _make_credentials() client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection() - - with self.assertRaises(NotFound): - client.get_job(JOB_ID, project=OTHER_PROJECT, location=self.LOCATION) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/OTHER_PROJECT/jobs/NONESUCH", - query_params={"projection": "full", "location": self.LOCATION}, - timeout=None, - ) - - def test_get_job_miss_w_client_location(self): - from google.cloud.exceptions import NotFound - - OTHER_PROJECT = "OTHER_PROJECT" - JOB_ID = "NONESUCH" - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds, 
location=self.LOCATION) - conn = client._connection = make_connection() - - with self.assertRaises(NotFound): - client.get_job(JOB_ID, project=OTHER_PROJECT) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/OTHER_PROJECT/jobs/NONESUCH", - query_params={"projection": "full", "location": self.LOCATION}, - timeout=None, - ) - - def test_get_job_hit_w_timeout(self): - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import QueryJob - from google.cloud.bigquery.job import WriteDisposition - - JOB_ID = "query_job" - QUERY_DESTINATION_TABLE = "query_destination_table" - QUERY = "SELECT * from test_dataset:test_table" - ASYNC_QUERY_DATA = { - "id": "{}:{}".format(self.PROJECT, JOB_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": "query_job"}, - "state": "DONE", - "configuration": { - "query": { - "query": QUERY, - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": QUERY_DESTINATION_TABLE, - }, - "createDisposition": CreateDisposition.CREATE_IF_NEEDED, - "writeDisposition": WriteDisposition.WRITE_TRUNCATE, - } - }, - } - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(ASYNC_QUERY_DATA) - - job = client.get_job(JOB_ID, timeout=7.5) - - self.assertIsInstance(job, QueryJob) - self.assertEqual(job.job_id, JOB_ID) - self.assertEqual(job.create_disposition, CreateDisposition.CREATE_IF_NEEDED) - self.assertEqual(job.write_disposition, WriteDisposition.WRITE_TRUNCATE) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/PROJECT/jobs/query_job", - query_params={"projection": "full"}, - timeout=7.5, - ) - - def test_cancel_job_miss_w_explict_project(self): - from google.cloud.exceptions import NotFound - - OTHER_PROJECT = "OTHER_PROJECT" - JOB_ID = "NONESUCH" - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = 
make_connection() - - with self.assertRaises(NotFound): - client.cancel_job(JOB_ID, project=OTHER_PROJECT, location=self.LOCATION) - - conn.api_request.assert_called_once_with( - method="POST", - path="/projects/OTHER_PROJECT/jobs/NONESUCH/cancel", - query_params={"projection": "full", "location": self.LOCATION}, - timeout=None, - ) - - def test_cancel_job_miss_w_client_location(self): - from google.cloud.exceptions import NotFound - - OTHER_PROJECT = "OTHER_PROJECT" - JOB_ID = "NONESUCH" - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds, location=self.LOCATION) - conn = client._connection = make_connection() - - with self.assertRaises(NotFound): - client.cancel_job(JOB_ID, project=OTHER_PROJECT) - - conn.api_request.assert_called_once_with( - method="POST", - path="/projects/OTHER_PROJECT/jobs/NONESUCH/cancel", - query_params={"projection": "full", "location": self.LOCATION}, - timeout=None, - ) - - def test_cancel_job_hit(self): - from google.cloud.bigquery.job import QueryJob - - JOB_ID = "query_job" - QUERY = "SELECT * from test_dataset:test_table" - QUERY_JOB_RESOURCE = { - "id": "{}:{}".format(self.PROJECT, JOB_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": "query_job"}, - "state": "RUNNING", - "configuration": {"query": {"query": QUERY}}, - } - RESOURCE = {"job": QUERY_JOB_RESOURCE} - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(RESOURCE) - - job = client.cancel_job(JOB_ID) - - self.assertIsInstance(job, QueryJob) - self.assertEqual(job.job_id, JOB_ID) - self.assertEqual(job.query, QUERY) - - conn.api_request.assert_called_once_with( - method="POST", - path="/projects/PROJECT/jobs/query_job/cancel", - query_params={"projection": "full"}, - timeout=None, - ) - - def test_cancel_job_w_timeout(self): - JOB_ID = "query_job" - QUERY = "SELECT * from test_dataset:test_table" - QUERY_JOB_RESOURCE = { - "id": "{}:{}".format(self.PROJECT, JOB_ID), - 
"jobReference": {"projectId": self.PROJECT, "jobId": "query_job"}, - "state": "RUNNING", - "configuration": {"query": {"query": QUERY}}, - } - RESOURCE = {"job": QUERY_JOB_RESOURCE} - - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(RESOURCE) - - client.cancel_job(JOB_ID, timeout=7.5) - - conn.api_request.assert_called_once_with( - method="POST", - path="/projects/{}/jobs/query_job/cancel".format(self.PROJECT), - query_params={"projection": "full"}, - timeout=7.5, - ) - - def test_list_jobs_defaults(self): - from google.cloud.bigquery.job import CopyJob - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import ExtractJob - from google.cloud.bigquery.job import LoadJob - from google.cloud.bigquery.job import QueryJob - from google.cloud.bigquery.job import WriteDisposition - - SOURCE_TABLE = "source_table" - DESTINATION_TABLE = "destination_table" - QUERY_DESTINATION_TABLE = "query_destination_table" - SOURCE_URI = "gs://test_bucket/src_object*" - DESTINATION_URI = "gs://test_bucket/dst_object*" - JOB_TYPES = { - "load_job": LoadJob, - "copy_job": CopyJob, - "extract_job": ExtractJob, - "query_job": QueryJob, - } - PATH = "projects/%s/jobs" % self.PROJECT - TOKEN = "TOKEN" - QUERY = "SELECT * from test_dataset:test_table" - ASYNC_QUERY_DATA = { - "id": "%s:%s" % (self.PROJECT, "query_job"), - "jobReference": {"projectId": self.PROJECT, "jobId": "query_job"}, - "state": "DONE", - "configuration": { - "query": { - "query": QUERY, - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": QUERY_DESTINATION_TABLE, - }, - "createDisposition": CreateDisposition.CREATE_IF_NEEDED, - "writeDisposition": WriteDisposition.WRITE_TRUNCATE, - } - }, - } - EXTRACT_DATA = { - "id": "%s:%s" % (self.PROJECT, "extract_job"), - "jobReference": {"projectId": self.PROJECT, "jobId": "extract_job"}, - "state": "DONE", - "configuration": { - 
"extract": { - "sourceTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": SOURCE_TABLE, - }, - "destinationUris": [DESTINATION_URI], - } - }, - } - COPY_DATA = { - "id": "%s:%s" % (self.PROJECT, "copy_job"), - "jobReference": {"projectId": self.PROJECT, "jobId": "copy_job"}, - "state": "DONE", - "configuration": { - "copy": { - "sourceTables": [ - { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": SOURCE_TABLE, - } - ], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": DESTINATION_TABLE, - }, - } - }, - } - LOAD_DATA = { - "id": "%s:%s" % (self.PROJECT, "load_job"), - "jobReference": {"projectId": self.PROJECT, "jobId": "load_job"}, - "state": "DONE", - "configuration": { - "load": { - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": SOURCE_TABLE, - }, - "sourceUris": [SOURCE_URI], - } - }, - } - DATA = { - "nextPageToken": TOKEN, - "jobs": [ASYNC_QUERY_DATA, EXTRACT_DATA, COPY_DATA, LOAD_DATA], - } - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(DATA) + conn = client._connection = make_connection() - iterator = client.list_jobs() - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) + with self.assertRaises(NotFound): + client.get_job(JOB_ID, project=OTHER_PROJECT) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/OTHER_PROJECT/jobs/NONESUCH", + query_params={"projection": "full"}, + timeout=DEFAULT_TIMEOUT, + ) + + def test_get_job_miss_w_client_location(self): + from google.cloud.exceptions import NotFound - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - jobs = list(page) - token = iterator.next_page_token + JOB_ID = "NONESUCH" + creds = _make_credentials() + client = self._make_one("client-proj", 
creds, location="client-loc") + conn = client._connection = make_connection() - self.assertEqual(len(jobs), len(DATA["jobs"])) - for found, expected in zip(jobs, DATA["jobs"]): - name = expected["jobReference"]["jobId"] - self.assertIsInstance(found, JOB_TYPES[name]) - self.assertEqual(found.job_id, name) - self.assertEqual(token, TOKEN) + with self.assertRaises(NotFound): + client.get_job(JOB_ID) conn.api_request.assert_called_once_with( method="GET", - path="/%s" % PATH, - query_params={"projection": "full"}, - timeout=None, + path="/projects/client-proj/jobs/NONESUCH", + query_params={"projection": "full", "location": "client-loc"}, + timeout=DEFAULT_TIMEOUT, ) - def test_list_jobs_load_job_wo_sourceUris(self): - from google.cloud.bigquery.job import LoadJob + def test_get_job_hit_w_timeout(self): + from google.cloud.bigquery.job import CreateDisposition + from google.cloud.bigquery.job import QueryJob + from google.cloud.bigquery.job import WriteDisposition - SOURCE_TABLE = "source_table" - JOB_TYPES = {"load_job": LoadJob} - PATH = "projects/%s/jobs" % self.PROJECT - TOKEN = "TOKEN" - LOAD_DATA = { - "id": "%s:%s" % (self.PROJECT, "load_job"), - "jobReference": {"projectId": self.PROJECT, "jobId": "load_job"}, + JOB_ID = "query_job" + QUERY_DESTINATION_TABLE = "query_destination_table" + QUERY = "SELECT * from test_dataset:test_table" + ASYNC_QUERY_DATA = { + "id": "{}:{}".format(self.PROJECT, JOB_ID), + "jobReference": { + "projectId": "resource-proj", + "jobId": "query_job", + "location": "us-east1", + }, "state": "DONE", "configuration": { - "load": { + "query": { + "query": QUERY, "destinationTable": { "projectId": self.PROJECT, "datasetId": self.DS_ID, - "tableId": SOURCE_TABLE, - } + "tableId": QUERY_DESTINATION_TABLE, + }, + "createDisposition": CreateDisposition.CREATE_IF_NEEDED, + "writeDisposition": WriteDisposition.WRITE_TRUNCATE, } }, } - DATA = {"nextPageToken": TOKEN, "jobs": [LOAD_DATA]} creds = _make_credentials() client = 
self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(DATA) - - iterator = client.list_jobs() - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) + conn = client._connection = make_connection(ASYNC_QUERY_DATA) + job_from_resource = QueryJob.from_api_repr(ASYNC_QUERY_DATA, client) - final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - jobs = list(page) - token = iterator.next_page_token + job = client.get_job(job_from_resource, timeout=7.5) - self.assertEqual(len(jobs), len(DATA["jobs"])) - for found, expected in zip(jobs, DATA["jobs"]): - name = expected["jobReference"]["jobId"] - self.assertIsInstance(found, JOB_TYPES[name]) - self.assertEqual(found.job_id, name) - self.assertEqual(token, TOKEN) + self.assertIsInstance(job, QueryJob) + self.assertEqual(job.job_id, JOB_ID) + self.assertEqual(job.project, "resource-proj") + self.assertEqual(job.location, "us-east1") + self.assertEqual(job.create_disposition, CreateDisposition.CREATE_IF_NEEDED) + self.assertEqual(job.write_disposition, WriteDisposition.WRITE_TRUNCATE) conn.api_request.assert_called_once_with( method="GET", - path="/%s" % PATH, - query_params={"projection": "full"}, - timeout=None, - ) - - def test_list_jobs_explicit_missing(self): - PATH = "projects/%s/jobs" % self.PROJECT - DATA = {} - TOKEN = "TOKEN" - creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection(DATA) - - iterator = client.list_jobs( - max_results=1000, page_token=TOKEN, all_users=True, state_filter="done" + path="/projects/resource-proj/jobs/query_job", + query_params={"projection": "full", "location": "us-east1"}, + timeout=7.5, ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - page = six.next(iterator.pages) - - 
final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) - jobs = list(page) - token = iterator.next_page_token - - self.assertEqual(len(jobs), 0) - self.assertIsNone(token) - conn.api_request.assert_called_once_with( - method="GET", - path="/%s" % PATH, - query_params={ - "projection": "full", - "maxResults": 1000, - "pageToken": TOKEN, - "allUsers": True, - "stateFilter": "done", - }, - timeout=None, - ) + def test_cancel_job_miss_w_explict_project(self): + from google.cloud.exceptions import NotFound - def test_list_jobs_w_project(self): + OTHER_PROJECT = "OTHER_PROJECT" + JOB_ID = "NONESUCH" creds = _make_credentials() client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection({}) + conn = client._connection = make_connection() - list(client.list_jobs(project="other-project")) + with self.assertRaises(NotFound): + client.cancel_job(JOB_ID, project=OTHER_PROJECT, location=self.LOCATION) conn.api_request.assert_called_once_with( - method="GET", - path="/projects/other-project/jobs", - query_params={"projection": "full"}, - timeout=None, + method="POST", + path="/projects/OTHER_PROJECT/jobs/NONESUCH/cancel", + query_params={"projection": "full", "location": self.LOCATION}, + timeout=DEFAULT_TIMEOUT, ) - def test_list_jobs_w_timeout(self): + def test_cancel_job_miss_w_client_location(self): + from google.cloud.exceptions import NotFound + + OTHER_PROJECT = "OTHER_PROJECT" + JOB_ID = "NONESUCH" creds = _make_credentials() - client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection({}) + client = self._make_one(self.PROJECT, creds, location=self.LOCATION) + conn = client._connection = make_connection() - list(client.list_jobs(timeout=7.5)) + with self.assertRaises(NotFound): + client.cancel_job(JOB_ID, project=OTHER_PROJECT) conn.api_request.assert_called_once_with( - method="GET", - path="/projects/{}/jobs".format(self.PROJECT), - query_params={"projection": "full"}, - 
timeout=7.5, + method="POST", + path="/projects/OTHER_PROJECT/jobs/NONESUCH/cancel", + query_params={"projection": "full", "location": self.LOCATION}, + timeout=DEFAULT_TIMEOUT, ) - def test_list_jobs_w_time_filter(self): + def test_cancel_job_hit(self): + from google.cloud.bigquery.job import QueryJob + + JOB_ID = "query_job" + QUERY = "SELECT * from test_dataset:test_table" + QUERY_JOB_RESOURCE = { + "id": "{}:{}".format(self.PROJECT, JOB_ID), + "jobReference": { + "projectId": "job-based-proj", + "jobId": "query_job", + "location": "asia-northeast1", + }, + "state": "RUNNING", + "configuration": {"query": {"query": QUERY}}, + } + RESOURCE = {"job": QUERY_JOB_RESOURCE} creds = _make_credentials() client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection({}) + conn = client._connection = make_connection(RESOURCE) + job_from_resource = QueryJob.from_api_repr(QUERY_JOB_RESOURCE, client) - # One millisecond after the unix epoch. - start_time = datetime.datetime(1970, 1, 1, 0, 0, 0, 1000) - # One millisecond after the the 2038 31-bit signed int rollover - end_time = datetime.datetime(2038, 1, 19, 3, 14, 7, 1000) - end_time_millis = (((2 ** 31) - 1) * 1000) + 1 + job = client.cancel_job(job_from_resource) - list(client.list_jobs(min_creation_time=start_time, max_creation_time=end_time)) + self.assertIsInstance(job, QueryJob) + self.assertEqual(job.job_id, JOB_ID) + self.assertEqual(job.project, "job-based-proj") + self.assertEqual(job.location, "asia-northeast1") + self.assertEqual(job.query, QUERY) conn.api_request.assert_called_once_with( - method="GET", - path="/projects/%s/jobs" % self.PROJECT, - query_params={ - "projection": "full", - "minCreationTime": "1", - "maxCreationTime": str(end_time_millis), - }, - timeout=None, + method="POST", + path="/projects/job-based-proj/jobs/query_job/cancel", + query_params={"projection": "full", "location": "asia-northeast1"}, + timeout=DEFAULT_TIMEOUT, ) - def 
test_list_jobs_w_parent_job_filter(self): - from google.cloud.bigquery import job + def test_cancel_job_w_timeout(self): + JOB_ID = "query_job" + QUERY = "SELECT * from test_dataset:test_table" + QUERY_JOB_RESOURCE = { + "id": "{}:{}".format(self.PROJECT, JOB_ID), + "jobReference": {"projectId": self.PROJECT, "jobId": "query_job"}, + "state": "RUNNING", + "configuration": {"query": {"query": QUERY}}, + } + RESOURCE = {"job": QUERY_JOB_RESOURCE} creds = _make_credentials() client = self._make_one(self.PROJECT, creds) - conn = client._connection = make_connection({}, {}) + conn = client._connection = make_connection(RESOURCE) - parent_job_args = ["parent-job-123", job._AsyncJob("parent-job-123", client)] + client.cancel_job(JOB_ID, timeout=7.5) - for parent_job in parent_job_args: - list(client.list_jobs(parent_job=parent_job)) - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/%s/jobs" % self.PROJECT, - query_params={"projection": "full", "parentJobId": "parent-job-123"}, - timeout=None, - ) - conn.api_request.reset_mock() + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/{}/jobs/query_job/cancel".format(self.PROJECT), + query_params={"projection": "full"}, + timeout=7.5, + ) def test_load_table_from_uri(self): from google.cloud.bigquery.job import LoadJob, LoadJobConfig @@ -4270,7 +3116,7 @@ def test_load_table_from_uri(self): self.assertIs(job._client, client) self.assertEqual(job.job_id, JOB) self.assertEqual(list(job.source_uris), [SOURCE_URI]) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) conn = client._connection = make_connection(RESOURCE) @@ -4279,7 +3125,7 @@ def test_load_table_from_uri(self): self.assertIs(job._client, client) self.assertEqual(job.job_id, JOB) self.assertEqual(list(job.source_uris), [SOURCE_URI]) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) def 
test_load_table_from_uri_w_explicit_project(self): job_id = "this-is-a-job-id" @@ -4321,7 +3167,7 @@ def test_load_table_from_uri_w_explicit_project(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_load_table_from_uri_w_client_location(self): @@ -4365,7 +3211,7 @@ def test_load_table_from_uri_w_client_location(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_load_table_from_uri_w_invalid_job_config(self): @@ -4405,7 +3251,7 @@ def _mock_transport(self, status_code, headers, content=b""): fake_transport.request.return_value = fake_response return fake_transport - def _initiate_resumable_upload_helper(self, num_retries=None): + def _initiate_resumable_upload_helper(self, num_retries=None, mtls=False): from google.resumable_media.requests import ResumableUpload from google.cloud.bigquery.client import _DEFAULT_CHUNKSIZE from google.cloud.bigquery.client import _GENERIC_CONTENT_TYPE @@ -4417,9 +3263,11 @@ def _initiate_resumable_upload_helper(self, num_retries=None): # Create mocks to be checked for doing transport. resumable_url = "http://test.invalid?upload_id=hey-you" response_headers = {"location": resumable_url} - fake_transport = self._mock_transport(http_client.OK, response_headers) + fake_transport = self._mock_transport(http.client.OK, response_headers) client = self._make_one(project=self.PROJECT, _http=fake_transport) conn = client._connection = make_connection() + if mtls: + conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") # Create some mock arguments and call the method under test. 
data = b"goodbye gudbi gootbee" @@ -4429,15 +3277,16 @@ def _initiate_resumable_upload_helper(self, num_retries=None): job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) metadata = job.to_api_repr() upload, transport = client._initiate_resumable_upload( - stream, metadata, num_retries + stream, metadata, num_retries, None ) # Check the returned values. self.assertIsInstance(upload, ResumableUpload) + + host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" upload_url = ( - "https://bigquery.googleapis.com/upload/bigquery/v2/projects/" - + self.PROJECT - + "/jobs?uploadType=resumable" + f"{host_name}/upload/bigquery/v2/projects/{self.PROJECT}" + "/jobs?uploadType=resumable" ) self.assertEqual(upload.upload_url, upload_url) expected_headers = _get_upload_headers(conn.user_agent) @@ -4475,18 +3324,28 @@ def _initiate_resumable_upload_helper(self, num_retries=None): def test__initiate_resumable_upload(self): self._initiate_resumable_upload_helper() + def test__initiate_resumable_upload_mtls(self): + self._initiate_resumable_upload_helper(mtls=True) + def test__initiate_resumable_upload_with_retry(self): self._initiate_resumable_upload_helper(num_retries=11) - def _do_multipart_upload_success_helper(self, get_boundary, num_retries=None): + def _do_multipart_upload_success_helper( + self, get_boundary, num_retries=None, project=None, mtls=False + ): from google.cloud.bigquery.client import _get_upload_headers from google.cloud.bigquery.job import LoadJob from google.cloud.bigquery.job import LoadJobConfig from google.cloud.bigquery.job import SourceFormat - fake_transport = self._mock_transport(http_client.OK, {}) + fake_transport = self._mock_transport(http.client.OK, {}) client = self._make_one(project=self.PROJECT, _http=fake_transport) conn = client._connection = make_connection() + if mtls: + conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") + + if project is None: + project = self.PROJECT # Create 
some mock arguments. data = b"Bzzzz-zap \x00\x01\xf4" @@ -4496,43 +3355,55 @@ def _do_multipart_upload_success_helper(self, get_boundary, num_retries=None): job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) metadata = job.to_api_repr() size = len(data) - response = client._do_multipart_upload(stream, metadata, size, num_retries) + + response = client._do_multipart_upload( + stream, metadata, size, num_retries, None, project=project + ) # Check the mocks and the returned value. self.assertIs(response, fake_transport.request.return_value) self.assertEqual(stream.tell(), size) get_boundary.assert_called_once_with() + host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" upload_url = ( - "https://bigquery.googleapis.com/upload/bigquery/v2/projects/" - + self.PROJECT - + "/jobs?uploadType=multipart" + f"{host_name}/upload/bigquery/v2/projects/{project}" + "/jobs?uploadType=multipart" ) payload = ( b"--==0==\r\n" - + b"content-type: application/json; charset=UTF-8\r\n\r\n" - + json.dumps(metadata).encode("utf-8") - + b"\r\n" - + b"--==0==\r\n" - + b"content-type: */*\r\n\r\n" - + data - + b"\r\n" - + b"--==0==--" - ) + b"content-type: application/json; charset=UTF-8\r\n\r\n" + b"%(json_metadata)s" + b"\r\n" + b"--==0==\r\n" + b"content-type: */*\r\n\r\n" + b"%(data)s" + b"\r\n" + b"--==0==--" + ) % {b"json_metadata": json.dumps(metadata).encode("utf-8"), b"data": data} + headers = _get_upload_headers(conn.user_agent) headers["content-type"] = b'multipart/related; boundary="==0=="' fake_transport.request.assert_called_once_with( "POST", upload_url, data=payload, headers=headers, timeout=mock.ANY ) - @mock.patch(u"google.resumable_media._upload.get_boundary", return_value=b"==0==") + @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") def test__do_multipart_upload(self, get_boundary): self._do_multipart_upload_success_helper(get_boundary) - @mock.patch(u"google.resumable_media._upload.get_boundary", 
return_value=b"==0==") + @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") + def test__do_multipart_upload_mtls(self, get_boundary): + self._do_multipart_upload_success_helper(get_boundary, mtls=True) + + @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") def test__do_multipart_upload_with_retry(self, get_boundary): self._do_multipart_upload_success_helper(get_boundary, num_retries=8) + @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") + def test__do_multipart_upload_with_custom_project(self, get_boundary): + self._do_multipart_upload_success_helper(get_boundary, project="custom-project") + def test_copy_table(self): from google.cloud.bigquery.job import CopyJob @@ -4580,16 +3451,67 @@ def test_copy_table(self): self.assertIs(job._client, client) self.assertEqual(job.job_id, JOB) self.assertEqual(list(job.sources), [source]) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) - conn = client._connection = make_connection(RESOURCE) - source2 = dataset.table(SOURCE + "2") - job = client.copy_table([source, source2], destination, job_id=JOB) + def test_copy_table_w_multiple_sources(self): + from google.cloud.bigquery.job import CopyJob + from google.cloud.bigquery.table import TableReference + + job_id = "job_name" + source_id = "my-project.my_dataset.source_table" + source_id2 = "my-project.my_dataset.source_table2" + destination_id = "my-other-project.another_dataset.destination_table" + expected_resource = { + "jobReference": {"projectId": self.PROJECT, "jobId": job_id}, + "configuration": { + "copy": { + "sourceTables": [ + { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "source_table", + }, + { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "source_table2", + }, + ], + "destinationTable": { + "projectId": "my-other-project", + "datasetId": "another_dataset", + "tableId": 
"destination_table", + }, + } + }, + } + returned_resource = expected_resource.copy() + returned_resource["statistics"] = {} + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection(returned_resource) + + job = client.copy_table([source_id, source_id2], destination_id, job_id=job_id) + + # Check that copy_table actually starts the job. + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/%s/jobs" % self.PROJECT, + data=expected_resource, + timeout=DEFAULT_TIMEOUT, + ) self.assertIsInstance(job, CopyJob) self.assertIs(job._client, client) - self.assertEqual(job.job_id, JOB) - self.assertEqual(list(job.sources), [source, source2]) - self.assertIs(job.destination, destination) + self.assertEqual(job.job_id, job_id) + self.assertEqual( + list(sorted(job.sources, key=lambda tbl: tbl.table_id)), + [ + TableReference.from_string(source_id), + TableReference.from_string(source_id2), + ], + ) + self.assertEqual(job.destination, TableReference.from_string(destination_id)) def test_copy_table_w_explicit_project(self): job_id = "this-is-a-job-id" @@ -4639,7 +3561,7 @@ def test_copy_table_w_explicit_project(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_copy_table_w_client_location(self): @@ -4689,7 +3611,7 @@ def test_copy_table_w_client_location(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_copy_table_w_source_strings(self): @@ -4782,7 +3704,7 @@ def test_copy_table_w_valid_job_config(self): method="POST", path="/projects/%s/jobs" % self.PROJECT, data=RESOURCE, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) self.assertIsInstance(job._configuration, CopyJobConfig) @@ -4888,7 +3810,7 @@ def test_extract_table_w_explicit_project(self): method="POST", 
path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_extract_table_w_client_location(self): @@ -4932,7 +3854,7 @@ def test_extract_table_w_client_location(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_extract_table_generated_job_id(self): @@ -4974,8 +3896,8 @@ def test_extract_table_generated_job_id(self): _, req = conn.api_request.call_args self.assertEqual(req["method"], "POST") self.assertEqual(req["path"], "/projects/PROJECT/jobs") - self.assertIsInstance(req["data"]["jobReference"]["jobId"], six.string_types) - self.assertIsNone(req["timeout"]) + self.assertIsInstance(req["data"]["jobReference"]["jobId"], str) + self.assertEqual(req["timeout"], DEFAULT_TIMEOUT) # Check the job resource. self.assertIsInstance(job, ExtractJob) @@ -5020,7 +3942,7 @@ def test_extract_table_w_destination_uris(self): _, req = conn.api_request.call_args self.assertEqual(req["method"], "POST") self.assertEqual(req["path"], "/projects/PROJECT/jobs") - self.assertIsNone(req["timeout"]) + self.assertEqual(req["timeout"], DEFAULT_TIMEOUT) # Check the job resource. 
self.assertIsInstance(job, ExtractJob) @@ -5179,7 +4101,7 @@ def test_query_defaults(self): job = client.query(QUERY) self.assertIsInstance(job, QueryJob) - self.assertIsInstance(job.job_id, six.string_types) + self.assertIsInstance(job.job_id, str) self.assertIs(job._client, client) self.assertEqual(job.query, QUERY) self.assertEqual(job.udf_resources, []) @@ -5190,9 +4112,9 @@ def test_query_defaults(self): _, req = conn.api_request.call_args self.assertEqual(req["method"], "POST") self.assertEqual(req["path"], "/projects/PROJECT/jobs") - self.assertIsNone(req["timeout"]) + self.assertEqual(req["timeout"], DEFAULT_TIMEOUT) sent = req["data"] - self.assertIsInstance(sent["jobReference"]["jobId"], six.string_types) + self.assertIsInstance(sent["jobReference"]["jobId"], str) sent_config = sent["configuration"]["query"] self.assertEqual(sent_config["query"], QUERY) self.assertFalse(sent_config["useLegacySql"]) @@ -5243,7 +4165,7 @@ def test_query_w_explicit_project(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_query_w_explicit_job_config(self): @@ -5299,7 +4221,10 @@ def test_query_w_explicit_job_config(self): # Check that query actually starts the job. conn.api_request.assert_called_once_with( - method="POST", path="/projects/PROJECT/jobs", data=resource, timeout=None + method="POST", + path="/projects/PROJECT/jobs", + data=resource, + timeout=DEFAULT_TIMEOUT, ) # the original config object should not have been modified @@ -5343,7 +4268,10 @@ def test_query_preserving_explicit_job_config(self): # Check that query actually starts the job. 
conn.api_request.assert_called_once_with( - method="POST", path="/projects/PROJECT/jobs", data=resource, timeout=None + method="POST", + path="/projects/PROJECT/jobs", + data=resource, + timeout=DEFAULT_TIMEOUT, ) # the original config object should not have been modified @@ -5395,7 +4323,10 @@ def test_query_preserving_explicit_default_job_config(self): # Check that query actually starts the job. conn.api_request.assert_called_once_with( - method="POST", path="/projects/PROJECT/jobs", data=resource, timeout=None + method="POST", + path="/projects/PROJECT/jobs", + data=resource, + timeout=DEFAULT_TIMEOUT, ) # the original default config object should not have been modified @@ -5480,7 +4411,10 @@ def test_query_w_explicit_job_config_override(self): # Check that query actually starts the job. conn.api_request.assert_called_once_with( - method="POST", path="/projects/PROJECT/jobs", data=resource, timeout=None + method="POST", + path="/projects/PROJECT/jobs", + data=resource, + timeout=DEFAULT_TIMEOUT, ) def test_query_w_client_default_config_no_incoming(self): @@ -5521,7 +4455,10 @@ def test_query_w_client_default_config_no_incoming(self): # Check that query actually starts the job. 
conn.api_request.assert_called_once_with( - method="POST", path="/projects/PROJECT/jobs", data=resource, timeout=None + method="POST", + path="/projects/PROJECT/jobs", + data=resource, + timeout=DEFAULT_TIMEOUT, ) def test_query_w_invalid_default_job_config(self): @@ -5566,7 +4503,7 @@ def test_query_w_client_location(self): method="POST", path="/projects/other-project/jobs", data=resource, - timeout=None, + timeout=DEFAULT_TIMEOUT, ) def test_query_detect_location(self): @@ -5637,9 +4574,9 @@ def test_query_w_udf_resources(self): _, req = conn.api_request.call_args self.assertEqual(req["method"], "POST") self.assertEqual(req["path"], "/projects/PROJECT/jobs") - self.assertIsNone(req["timeout"]) + self.assertEqual(req["timeout"], DEFAULT_TIMEOUT) sent = req["data"] - self.assertIsInstance(sent["jobReference"]["jobId"], six.string_types) + self.assertIsInstance(sent["jobReference"]["jobId"], str) sent_config = sent["configuration"]["query"] self.assertEqual(sent_config["query"], QUERY) self.assertTrue(sent_config["useLegacySql"]) @@ -5693,7 +4630,7 @@ def test_query_w_query_parameters(self): _, req = conn.api_request.call_args self.assertEqual(req["method"], "POST") self.assertEqual(req["path"], "/projects/PROJECT/jobs") - self.assertIsNone(req["timeout"]) + self.assertEqual(req["timeout"], DEFAULT_TIMEOUT) sent = req["data"] self.assertEqual(sent["jobReference"]["jobId"], JOB) sent_config = sent["configuration"]["query"] @@ -5708,6 +4645,81 @@ def test_query_w_query_parameters(self): }, ) + def test_query_job_rpc_fail_w_random_error(self): + from google.api_core.exceptions import Unknown + from google.cloud.bigquery.job import QueryJob + + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + job_create_error = Unknown("Not sure what went wrong.") + job_begin_patcher = mock.patch.object( + QueryJob, "_begin", side_effect=job_create_error + ) + with job_begin_patcher: + with 
pytest.raises(Unknown, match="Not sure what went wrong."): + client.query("SELECT 1;", job_id="123") + + def test_query_job_rpc_fail_w_conflict_job_id_given(self): + from google.api_core.exceptions import Conflict + from google.cloud.bigquery.job import QueryJob + + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + job_create_error = Conflict("Job already exists.") + job_begin_patcher = mock.patch.object( + QueryJob, "_begin", side_effect=job_create_error + ) + with job_begin_patcher: + with pytest.raises(Conflict, match="Job already exists."): + client.query("SELECT 1;", job_id="123") + + def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_fails(self): + from google.api_core.exceptions import Conflict + from google.api_core.exceptions import DataLoss + from google.cloud.bigquery.job import QueryJob + + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + job_create_error = Conflict("Job already exists.") + job_begin_patcher = mock.patch.object( + QueryJob, "_begin", side_effect=job_create_error + ) + get_job_patcher = mock.patch.object( + client, "get_job", side_effect=DataLoss("we lost yor job, sorry") + ) + + with job_begin_patcher, get_job_patcher: + # If get job request fails, the original exception should be raised. 
+ with pytest.raises(Conflict, match="Job already exists."): + client.query("SELECT 1;", job_id=None) + + def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_succeeds(self): + from google.api_core.exceptions import Conflict + from google.cloud.bigquery.job import QueryJob + + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + job_create_error = Conflict("Job already exists.") + job_begin_patcher = mock.patch.object( + QueryJob, "_begin", side_effect=job_create_error + ) + get_job_patcher = mock.patch.object( + client, "get_job", return_value=mock.sentinel.query_job + ) + + with job_begin_patcher, get_job_patcher: + result = client.query("SELECT 1;", job_id=None) + + assert result is mock.sentinel.query_job + def test_insert_rows_w_timeout(self): from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -5756,7 +4768,7 @@ def test_insert_rows_w_schema(self): import datetime from google.cloud._helpers import UTC from google.cloud._helpers import _datetime_to_rfc3339 - from google.cloud._helpers import _microseconds_from_datetime + from google.cloud._helpers import _RFC3339_MICROS from google.cloud.bigquery.schema import SchemaField WHEN_TS = 1437767599.006 @@ -5786,7 +4798,7 @@ def _row_data(row): result = {"full_name": row[0], "age": str(row[1])} joined = row[2] if isinstance(joined, datetime.datetime): - joined = _microseconds_from_datetime(joined) * 1e-6 + joined = joined.strftime(_RFC3339_MICROS) if joined is not None: result["joined"] = joined return result @@ -5810,13 +4822,13 @@ def _row_data(row): self.assertEqual(req["method"], "POST") self.assertEqual(req["path"], "/%s" % PATH) self.assertEqual(req["data"], SENT) - self.assertIsNone(req["timeout"]) + self.assertEqual(req["timeout"], DEFAULT_TIMEOUT) def test_insert_rows_w_list_of_dictionaries(self): import datetime from google.cloud._helpers import UTC from google.cloud._helpers 
import _datetime_to_rfc3339 - from google.cloud._helpers import _microseconds_from_datetime + from google.cloud._helpers import _RFC3339_MICROS from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -5862,7 +4874,7 @@ def _row_data(row): row = copy.deepcopy(row) del row["joined"] elif isinstance(joined, datetime.datetime): - row["joined"] = _microseconds_from_datetime(joined) * 1e-6 + row["joined"] = joined.strftime(_RFC3339_MICROS) row["age"] = str(row["age"]) return row @@ -5878,7 +4890,7 @@ def _row_data(row): self.assertEqual(len(errors), 0) conn.api_request.assert_called_once_with( - method="POST", path="/%s" % PATH, data=SENT, timeout=None + method="POST", path="/%s" % PATH, data=SENT, timeout=DEFAULT_TIMEOUT ) def test_insert_rows_w_list_of_Rows(self): @@ -5923,7 +4935,7 @@ def _row_data(row): self.assertEqual(len(errors), 0) conn.api_request.assert_called_once_with( - method="POST", path="/%s" % PATH, data=SENT, timeout=None + method="POST", path="/%s" % PATH, data=SENT, timeout=DEFAULT_TIMEOUT ) def test_insert_rows_w_skip_invalid_and_ignore_unknown(self): @@ -6000,7 +5012,7 @@ def _row_data(row): errors[0]["errors"][0], RESPONSE["insertErrors"][0]["errors"][0] ) conn.api_request.assert_called_once_with( - method="POST", path="/%s" % PATH, data=SENT, timeout=None + method="POST", path="/%s" % PATH, data=SENT, timeout=DEFAULT_TIMEOUT ) def test_insert_rows_w_repeated_fields(self): @@ -6033,16 +5045,24 @@ def test_insert_rows_w_repeated_fields(self): ( 12, [ - datetime.datetime(2018, 12, 1, 12, 0, 0, tzinfo=pytz.utc), - datetime.datetime(2018, 12, 1, 13, 0, 0, tzinfo=pytz.utc), + datetime.datetime( + 2018, 12, 1, 12, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2018, 12, 1, 13, 0, 0, tzinfo=datetime.timezone.utc + ), ], [1.25, 2.5], ), { "score": 13, "times": [ - datetime.datetime(2018, 12, 2, 12, 0, 0, tzinfo=pytz.utc), - datetime.datetime(2018, 12, 2, 13, 0, 0, tzinfo=pytz.utc), + 
datetime.datetime( + 2018, 12, 2, 12, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2018, 12, 2, 13, 0, 0, tzinfo=datetime.timezone.utc + ), ], "distances": [-1.25, -2.5], }, @@ -6061,16 +5081,16 @@ def test_insert_rows_w_repeated_fields(self): { "score": "12", "times": [ - 1543665600.0, # 2018-12-01 12:00 UTC - 1543669200.0, # 2018-12-01 13:00 UTC + "2018-12-01T12:00:00.000000Z", + "2018-12-01T13:00:00.000000Z", ], "distances": [1.25, 2.5], }, { "score": "13", "times": [ - 1543752000.0, # 2018-12-02 12:00 UTC - 1543755600.0, # 2018-12-02 13:00 UTC + "2018-12-02T12:00:00.000000Z", + "2018-12-02T13:00:00.000000Z", ], "distances": [-1.25, -2.5], }, @@ -6093,7 +5113,7 @@ def test_insert_rows_w_repeated_fields(self): self.assertEqual(len(errors), 0) conn.api_request.assert_called_once_with( - method="POST", path="/%s" % PATH, data=SENT, timeout=None, + method="POST", path="/%s" % PATH, data=SENT, timeout=DEFAULT_TIMEOUT, ) def test_insert_rows_w_record_schema(self): @@ -6159,7 +5179,7 @@ def test_insert_rows_w_record_schema(self): self.assertEqual(len(errors), 0) conn.api_request.assert_called_once_with( - method="POST", path="/%s" % PATH, data=SENT, timeout=None + method="POST", path="/%s" % PATH, data=SENT, timeout=DEFAULT_TIMEOUT ) def test_insert_rows_w_explicit_none_insert_ids(self): @@ -6193,7 +5213,7 @@ def _row_data(row): self.assertEqual(len(errors), 0) conn.api_request.assert_called_once_with( - method="POST", path="/{}".format(PATH), data=SENT, timeout=None, + method="POST", path="/{}".format(PATH), data=SENT, timeout=DEFAULT_TIMEOUT, ) def test_insert_rows_errors(self): @@ -6242,38 +5262,43 @@ def test_insert_rows_w_numeric(self): creds = _make_credentials() http = object() client = self._make_one(project=project, credentials=creds, _http=http) - conn = client._connection = make_connection({}) table_ref = DatasetReference(project, ds_id).table(table_id) - schema = [SchemaField("account", "STRING"), SchemaField("balance", "NUMERIC")] - 
insert_table = table.Table(table_ref, schema=schema) rows = [ ("Savings", decimal.Decimal("23.47")), ("Checking", decimal.Decimal("1.98")), ("Mortgage", decimal.Decimal("-12345678909.87654321")), ] + schemas = [ + [SchemaField("account", "STRING"), SchemaField("balance", "NUMERIC")], + [SchemaField("account", "STRING"), SchemaField("balance", "BIGNUMERIC")], + ] - with mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows)))): - errors = client.insert_rows(insert_table, rows) + for schema in schemas: + conn = client._connection = make_connection({}) - self.assertEqual(len(errors), 0) - rows_json = [ - {"account": "Savings", "balance": "23.47"}, - {"account": "Checking", "balance": "1.98"}, - {"account": "Mortgage", "balance": "-12345678909.87654321"}, - ] - sent = { - "rows": [ - {"json": row, "insertId": str(i)} for i, row in enumerate(rows_json) - ] - } - conn.api_request.assert_called_once_with( - method="POST", - path="/projects/{}/datasets/{}/tables/{}/insertAll".format( - project, ds_id, table_id - ), - data=sent, - timeout=None, - ) + insert_table = table.Table(table_ref, schema=schema) + with mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows)))): + errors = client.insert_rows(insert_table, rows) + + self.assertEqual(len(errors), 0) + rows_json = [ + {"account": "Savings", "balance": "23.47"}, + {"account": "Checking", "balance": "1.98"}, + {"account": "Mortgage", "balance": "-12345678909.87654321"}, + ] + sent = { + "rows": [ + {"json": row, "insertId": str(i)} for i, row in enumerate(rows_json) + ] + } + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/{}/datasets/{}/tables/{}/insertAll".format( + project, ds_id, table_id + ), + data=sent, + timeout=DEFAULT_TIMEOUT, + ) @unittest.skipIf(pandas is None, "Requires `pandas`") def test_insert_rows_from_dataframe(self): @@ -6286,10 +5311,10 @@ def test_insert_rows_from_dataframe(self): dataframe = pandas.DataFrame( [ - {"name": u"Little One", "age": 10, "adult": 
False}, - {"name": u"Young Gun", "age": 20, "adult": True}, - {"name": u"Dad", "age": 30, "adult": True}, - {"name": u"Stranger", "age": 40, "adult": True}, + {"name": "Little One", "age": 10, "adult": False}, + {"name": "Young Gun", "age": 20, "adult": True}, + {"name": "Dad", "age": 30, "adult": True}, + {"name": "Stranger", "age": 40, "adult": True}, ] ) @@ -6345,7 +5370,7 @@ def test_insert_rows_from_dataframe(self): actual_calls = conn.api_request.call_args_list - for call, expected_data in six.moves.zip_longest( + for call, expected_data in itertools.zip_longest( actual_calls, EXPECTED_SENT_DATA ): expected_call = mock.call( @@ -6413,7 +5438,7 @@ def test_insert_rows_from_dataframe_nan(self): actual_calls = conn.api_request.call_args_list - for call, expected_data in six.moves.zip_longest( + for call, expected_data in itertools.zip_longest( actual_calls, EXPECTED_SENT_DATA ): expected_call = mock.call( @@ -6464,7 +5489,10 @@ def test_insert_rows_from_dataframe_many_columns(self): ] } expected_call = mock.call( - method="POST", path=API_PATH, data=EXPECTED_SENT_DATA, timeout=None + method="POST", + path=API_PATH, + data=EXPECTED_SENT_DATA, + timeout=DEFAULT_TIMEOUT, ) actual_calls = conn.api_request.call_args_list @@ -6482,8 +5510,8 @@ def test_insert_rows_from_dataframe_w_explicit_none_insert_ids(self): dataframe = pandas.DataFrame( [ - {"name": u"Little One", "adult": False}, - {"name": u"Young Gun", "adult": True}, + {"name": "Little One", "adult": False}, + {"name": "Young Gun", "adult": True}, ] ) @@ -6517,10 +5545,13 @@ def test_insert_rows_from_dataframe_w_explicit_none_insert_ids(self): actual_calls = conn.api_request.call_args_list assert len(actual_calls) == 1 assert actual_calls[0] == mock.call( - method="POST", path=API_PATH, data=EXPECTED_SENT_DATA, timeout=None + method="POST", + path=API_PATH, + data=EXPECTED_SENT_DATA, + timeout=DEFAULT_TIMEOUT, ) - def test_insert_rows_json(self): + def test_insert_rows_json_default_behavior(self): from 
google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -6567,8 +5598,10 @@ def test_insert_rows_json(self): method="POST", path="/%s" % PATH, data=SENT, timeout=7.5, ) - def test_insert_rows_json_with_string_id(self): - rows = [{"col1": "val1"}] + def test_insert_rows_json_w_explicitly_requested_autogenerated_insert_ids(self): + from google.cloud.bigquery import AutoRowIDs + + rows = [{"col1": "val1"}, {"col2": "val2"}] creds = _make_credentials() http = object() client = self._make_one( @@ -6576,19 +5609,115 @@ def test_insert_rows_json_with_string_id(self): ) conn = client._connection = make_connection({}) - with mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows)))): - errors = client.insert_rows_json("proj.dset.tbl", rows) + uuid_patcher = mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows)))) + with uuid_patcher: + errors = client.insert_rows_json( + "proj.dset.tbl", rows, row_ids=AutoRowIDs.GENERATE_UUID + ) self.assertEqual(len(errors), 0) - expected = { - "rows": [{"json": row, "insertId": str(i)} for i, row in enumerate(rows)] + + # Check row data sent to the backend. 
+ expected_row_data = { + "rows": [ + {"json": {"col1": "val1"}, "insertId": "0"}, + {"json": {"col2": "val2"}, "insertId": "1"}, + ] } conn.api_request.assert_called_once_with( method="POST", path="/projects/proj/datasets/dset/tables/tbl/insertAll", - data=expected, - timeout=None, + data=expected_row_data, + timeout=DEFAULT_TIMEOUT, + ) + + def test_insert_rows_json_w_explicitly_disabled_insert_ids(self): + from google.cloud.bigquery import AutoRowIDs + + rows = [{"col1": "val1"}, {"col2": "val2"}] + creds = _make_credentials() + http = object() + client = self._make_one( + project="default-project", credentials=creds, _http=http + ) + conn = client._connection = make_connection({}) + + errors = client.insert_rows_json( + "proj.dset.tbl", rows, row_ids=AutoRowIDs.DISABLED, + ) + + self.assertEqual(len(errors), 0) + + expected_row_data = { + "rows": [ + {"json": {"col1": "val1"}, "insertId": None}, + {"json": {"col2": "val2"}, "insertId": None}, + ] + } + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/proj/datasets/dset/tables/tbl/insertAll", + data=expected_row_data, + timeout=DEFAULT_TIMEOUT, + ) + + def test_insert_rows_json_with_iterator_row_ids(self): + rows = [{"col1": "val1"}, {"col2": "val2"}, {"col3": "val3"}] + creds = _make_credentials() + http = object() + client = self._make_one( + project="default-project", credentials=creds, _http=http + ) + conn = client._connection = make_connection({}) + + row_ids_iter = map(str, itertools.count(42)) + errors = client.insert_rows_json("proj.dset.tbl", rows, row_ids=row_ids_iter) + + self.assertEqual(len(errors), 0) + expected_row_data = { + "rows": [ + {"json": {"col1": "val1"}, "insertId": "42"}, + {"json": {"col2": "val2"}, "insertId": "43"}, + {"json": {"col3": "val3"}, "insertId": "44"}, + ] + } + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/proj/datasets/dset/tables/tbl/insertAll", + data=expected_row_data, + timeout=DEFAULT_TIMEOUT, + ) + + def 
test_insert_rows_json_with_non_iterable_row_ids(self): + rows = [{"col1": "val1"}] + creds = _make_credentials() + http = object() + client = self._make_one( + project="default-project", credentials=creds, _http=http ) + client._connection = make_connection({}) + + with self.assertRaises(TypeError) as exc: + client.insert_rows_json("proj.dset.tbl", rows, row_ids=object()) + + err_msg = str(exc.exception) + self.assertIn("row_ids", err_msg) + self.assertIn("iterable", err_msg) + + def test_insert_rows_json_with_too_few_row_ids(self): + rows = [{"col1": "val1"}, {"col2": "val2"}, {"col3": "val3"}] + creds = _make_credentials() + http = object() + client = self._make_one( + project="default-project", credentials=creds, _http=http + ) + client._connection = make_connection({}) + + insert_ids = ["10", "20"] + + error_msg_pattern = "row_ids did not generate enough IDs.*index 2" + with self.assertRaisesRegex(ValueError, error_msg_pattern): + client.insert_rows_json("proj.dset.tbl", rows, row_ids=insert_ids) def test_insert_rows_json_w_explicit_none_insert_ids(self): rows = [{"col1": "val1"}, {"col2": "val2"}] @@ -6609,7 +5738,46 @@ def test_insert_rows_json_w_explicit_none_insert_ids(self): method="POST", path="/projects/proj/datasets/dset/tables/tbl/insertAll", data=expected, - timeout=None, + timeout=DEFAULT_TIMEOUT, + ) + + def test_insert_rows_json_w_none_insert_ids_sequence(self): + rows = [{"col1": "val1"}, {"col2": "val2"}] + creds = _make_credentials() + http = object() + client = self._make_one( + project="default-project", credentials=creds, _http=http + ) + conn = client._connection = make_connection({}) + + uuid_patcher = mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows)))) + with warnings.catch_warnings(record=True) as warned, uuid_patcher: + errors = client.insert_rows_json("proj.dset.tbl", rows, row_ids=None) + + self.assertEqual(len(errors), 0) + + # Passing row_ids=None should have resulted in a deprecation warning. 
+ matches = [ + warning + for warning in warned + if issubclass(warning.category, DeprecationWarning) + and "row_ids" in str(warning) + and "AutoRowIDs.GENERATE_UUID" in str(warning) + ] + assert matches, "The expected deprecation warning was not raised." + + # Check row data sent to the backend. + expected_row_data = { + "rows": [ + {"json": {"col1": "val1"}, "insertId": "0"}, + {"json": {"col2": "val2"}, "insertId": "1"}, + ] + } + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/proj/datasets/dset/tables/tbl/insertAll", + data=expected_row_data, + timeout=DEFAULT_TIMEOUT, ) def test_insert_rows_w_wrong_arg(self): @@ -6691,42 +5859,21 @@ def test_list_rows(self): self.DS_ID, self.TABLE_ID, ) - WHEN_TS = 1437767599.006 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS).replace(tzinfo=UTC) - WHEN_1 = WHEN + datetime.timedelta(seconds=1) - WHEN_2 = WHEN + datetime.timedelta(seconds=2) + WHEN_TS = 1437767599006000 + + WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS / 1e6).replace(tzinfo=UTC) + WHEN_1 = WHEN + datetime.timedelta(microseconds=1) + WHEN_2 = WHEN + datetime.timedelta(microseconds=2) ROWS = 1234 TOKEN = "TOKEN" - def _bigquery_timestamp_float_repr(ts_float): - # Preserve microsecond precision for E+09 timestamps - return "%0.15E" % (ts_float,) - DATA = { "totalRows": str(ROWS), "pageToken": TOKEN, "rows": [ - { - "f": [ - {"v": "Phred Phlyntstone"}, - {"v": "32"}, - {"v": _bigquery_timestamp_float_repr(WHEN_TS)}, - ] - }, - { - "f": [ - {"v": "Bharney Rhubble"}, - {"v": "33"}, - {"v": _bigquery_timestamp_float_repr(WHEN_TS + 1)}, - ] - }, - { - "f": [ - {"v": "Wylma Phlyntstone"}, - {"v": "29"}, - {"v": _bigquery_timestamp_float_repr(WHEN_TS + 2)}, - ] - }, + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}, {"v": WHEN_TS}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}, {"v": WHEN_TS + 1}]}, + {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}, {"v": WHEN_TS + 2}]}, {"f": [{"v": "Bhettye Rhubble"}, {"v": None}, {"v": None}]}, 
], } @@ -6738,12 +5885,17 @@ def _bigquery_timestamp_float_repr(ts_float): age = SchemaField("age", "INTEGER", mode="NULLABLE") joined = SchemaField("joined", "TIMESTAMP", mode="NULLABLE") table = Table(self.TABLE_REF, schema=[full_name, age, joined]) + table._properties["numRows"] = 7 iterator = client.list_rows(table, timeout=7.5) - page = six.next(iterator.pages) + + # Check that initial total_rows is populated from the table. + self.assertEqual(iterator.total_rows, 7) + page = next(iterator.pages) rows = list(page) - total_rows = iterator.total_rows - page_token = iterator.next_page_token + + # Check that total_rows is updated based on API response. + self.assertEqual(iterator.total_rows, ROWS) f2i = {"full_name": 0, "age": 1, "joined": 2} self.assertEqual(len(rows), 4) @@ -6751,11 +5903,13 @@ def _bigquery_timestamp_float_repr(ts_float): self.assertEqual(rows[1], Row(("Bharney Rhubble", 33, WHEN_1), f2i)) self.assertEqual(rows[2], Row(("Wylma Phlyntstone", 29, WHEN_2), f2i)) self.assertEqual(rows[3], Row(("Bhettye Rhubble", None, None), f2i)) - self.assertEqual(total_rows, ROWS) - self.assertEqual(page_token, TOKEN) + self.assertEqual(iterator.next_page_token, TOKEN) conn.api_request.assert_called_once_with( - method="GET", path="/%s" % PATH, query_params={}, timeout=7.5 + method="GET", + path="/%s" % PATH, + query_params={"formatOptions.useInt64Timestamp": True}, + timeout=7.5, ) def test_list_rows_w_start_index_w_page_size(self): @@ -6792,33 +5946,43 @@ def test_list_rows_w_start_index_w_page_size(self): table = Table(self.TABLE_REF, schema=[full_name]) iterator = client.list_rows(table, max_results=4, page_size=2, start_index=1) pages = iterator.pages - rows = list(six.next(pages)) + rows = list(next(pages)) extra_params = iterator.extra_params f2i = {"full_name": 0} self.assertEqual(len(rows), 2) self.assertEqual(rows[0], Row(("Phred Phlyntstone",), f2i)) self.assertEqual(rows[1], Row(("Bharney Rhubble",), f2i)) - rows = list(six.next(pages)) + rows = 
list(next(pages)) self.assertEqual(len(rows), 2) self.assertEqual(rows[0], Row(("Wylma Phlyntstone",), f2i)) self.assertEqual(rows[1], Row(("Bhettye Rhubble",), f2i)) - self.assertEqual(extra_params, {"startIndex": 1}) + self.assertEqual( + extra_params, {"startIndex": 1, "formatOptions.useInt64Timestamp": True} + ) conn.api_request.assert_has_calls( [ mock.call( method="GET", path="/%s" % PATH, - query_params={"startIndex": 1, "maxResults": 2}, - timeout=None, + query_params={ + "startIndex": 1, + "maxResults": 2, + "formatOptions.useInt64Timestamp": True, + }, + timeout=DEFAULT_TIMEOUT, ), mock.call( method="GET", path="/%s" % PATH, - query_params={"pageToken": "some-page-token", "maxResults": 2}, - timeout=None, + query_params={ + "pageToken": "some-page-token", + "maxResults": 2, + "formatOptions.useInt64Timestamp": True, + }, + timeout=DEFAULT_TIMEOUT, ), ] ) @@ -6866,10 +6030,48 @@ def test_list_rows_query_params(self): conn = client._connection = make_connection(*len(tests) * [{}]) for i, test in enumerate(tests): iterator = client.list_rows(table, **test[0]) - six.next(iterator.pages) + next(iterator.pages) req = conn.api_request.call_args_list[i] + test[1]["formatOptions.useInt64Timestamp"] = True self.assertEqual(req[1]["query_params"], test[1], "for kwargs %s" % test[0]) + def test_list_rows_w_numeric(self): + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import Table + + resource = { + "totalRows": 3, + "rows": [ + {"f": [{"v": "-1.23456789"}, {"v": "-123456789.987654321"}]}, + {"f": [{"v": None}, {"v": "3.141592653589793238462643383279502884"}]}, + {"f": [{"v": "2718281828459045235360287471.352662497"}, {"v": None}]}, + ], + } + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + client._connection = make_connection(resource) + schema = [ + SchemaField("num", "NUMERIC"), + SchemaField("bignum", "BIGNUMERIC"), + ] + table = 
Table(self.TABLE_REF, schema=schema) + + iterator = client.list_rows(table) + rows = list(iterator) + + self.assertEqual(len(rows), 3) + self.assertEqual(rows[0]["num"], decimal.Decimal("-1.23456789")) + self.assertEqual(rows[0]["bignum"], decimal.Decimal("-123456789.987654321")) + self.assertIsNone(rows[1]["num"]) + self.assertEqual( + rows[1]["bignum"], decimal.Decimal("3.141592653589793238462643383279502884") + ) + self.assertEqual( + rows[2]["num"], decimal.Decimal("2718281828459045235360287471.352662497") + ) + self.assertIsNone(rows[2]["bignum"]) + def test_list_rows_repeated_fields(self): from google.cloud.bigquery.schema import SchemaField @@ -6913,7 +6115,7 @@ def test_list_rows_repeated_fields(self): struct = SchemaField("struct", "RECORD", mode="REPEATED", fields=[index, score]) iterator = client.list_rows(self.TABLE_REF, selected_fields=[color, struct]) - page = six.next(iterator.pages) + page = next(iterator.pages) rows = list(page) total_rows = iterator.total_rows page_token = iterator.next_page_token @@ -6927,8 +6129,11 @@ def test_list_rows_repeated_fields(self): conn.api_request.assert_called_once_with( method="GET", path="/%s" % PATH, - query_params={"selectedFields": "color,struct"}, - timeout=None, + query_params={ + "selectedFields": "color,struct", + "formatOptions.useInt64Timestamp": True, + }, + timeout=DEFAULT_TIMEOUT, ) def test_list_rows_w_record_schema(self): @@ -6975,7 +6180,7 @@ def test_list_rows_w_record_schema(self): table = Table(self.TABLE_REF, schema=[full_name, phone]) iterator = client.list_rows(table) - page = six.next(iterator.pages) + page = next(iterator.pages) rows = list(page) total_rows = iterator.total_rows page_token = iterator.next_page_token @@ -6995,7 +6200,10 @@ def test_list_rows_w_record_schema(self): self.assertEqual(page_token, TOKEN) conn.api_request.assert_called_once_with( - method="GET", path="/%s" % PATH, query_params={}, timeout=None + method="GET", + path="/%s" % PATH, + 
query_params={"formatOptions.useInt64Timestamp": True}, + timeout=DEFAULT_TIMEOUT, ) def test_list_rows_with_missing_schema(self): @@ -7050,14 +6258,17 @@ def test_list_rows_with_missing_schema(self): row_iter = client.list_rows(table) conn.api_request.assert_called_once_with( - method="GET", path=table_path, timeout=None + method="GET", path=table_path, timeout=DEFAULT_TIMEOUT ) conn.api_request.reset_mock() self.assertEqual(row_iter.total_rows, 2, msg=repr(table)) rows = list(row_iter) conn.api_request.assert_called_once_with( - method="GET", path=tabledata_path, query_params={}, timeout=None + method="GET", + path=tabledata_path, + query_params={"formatOptions.useInt64Timestamp": True}, + timeout=DEFAULT_TIMEOUT, ) self.assertEqual(row_iter.total_rows, 3, msg=repr(table)) self.assertEqual(rows[0].name, "Phred Phlyntstone", msg=repr(table)) @@ -7073,6 +6284,28 @@ def test_list_rows_error(self): with self.assertRaises(TypeError): client.list_rows(1) + def test_context_manager_enter_returns_itself(self): + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + with mock.patch.object(client, "close"), client as context_var: + pass + + self.assertIs(client, context_var) + + def test_context_manager_exit_closes_client(self): + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + + fake_close = mock.Mock() + with mock.patch.object(client, "close", fake_close): + with client: + pass + + fake_close.assert_called_once() + class Test_make_job_id(unittest.TestCase): def _call_fut(self, job_id, prefix=None): @@ -7109,17 +6342,18 @@ class TestClientUpload(object): # `pytest`-style tests rather than `unittest`-style. 
from google.cloud.bigquery.job import SourceFormat - TABLE_REF = DatasetReference("project_id", "test_dataset").table("test_table") + PROJECT = "project_id" + TABLE_REF = DatasetReference(PROJECT, "test_dataset").table("test_table") LOCATION = "us-central" - @staticmethod - def _make_client(transport=None, location=None): + @classmethod + def _make_client(cls, transport=None, location=None): from google.cloud.bigquery import _http from google.cloud.bigquery import client cl = client.Client( - project="project_id", + project=cls.PROJECT, credentials=_make_credentials(), _http=transport, location=location, @@ -7145,7 +6379,7 @@ def _make_do_upload_patch(cls, client, method, resource={}, side_effect=None): if side_effect is None: side_effect = [ cls._make_response( - http_client.OK, + http.client.OK, json.dumps(resource), {"Content-Type": "application/json"}, ) @@ -7153,12 +6387,12 @@ def _make_do_upload_patch(cls, client, method, resource={}, side_effect=None): return mock.patch.object(client, method, side_effect=side_effect, autospec=True) EXPECTED_CONFIGURATION = { - "jobReference": {"projectId": "project_id", "jobId": "job_id"}, + "jobReference": {"projectId": PROJECT, "jobId": "job_id"}, "configuration": { "load": { "sourceFormat": SourceFormat.CSV, "destinationTable": { - "projectId": "project_id", + "projectId": PROJECT, "datasetId": "test_dataset", "tableId": "test_table", }, @@ -7204,7 +6438,11 @@ def test_load_table_from_file_resumable(self): ) do_upload.assert_called_once_with( - file_obj, self.EXPECTED_CONFIGURATION, _DEFAULT_NUM_RETRIES + file_obj, + self.EXPECTED_CONFIGURATION, + _DEFAULT_NUM_RETRIES, + DEFAULT_TIMEOUT, + project=self.EXPECTED_CONFIGURATION["jobReference"]["projectId"], ) # the original config object should not have been modified @@ -7233,7 +6471,11 @@ def test_load_table_from_file_w_explicit_project(self): expected_resource["jobReference"]["location"] = self.LOCATION expected_resource["jobReference"]["projectId"] = "other-project" 
do_upload.assert_called_once_with( - file_obj, expected_resource, _DEFAULT_NUM_RETRIES + file_obj, + expected_resource, + _DEFAULT_NUM_RETRIES, + DEFAULT_TIMEOUT, + project="other-project", ) def test_load_table_from_file_w_client_location(self): @@ -7263,7 +6505,11 @@ def test_load_table_from_file_w_client_location(self): expected_resource["jobReference"]["location"] = self.LOCATION expected_resource["jobReference"]["projectId"] = "other-project" do_upload.assert_called_once_with( - file_obj, expected_resource, _DEFAULT_NUM_RETRIES + file_obj, + expected_resource, + _DEFAULT_NUM_RETRIES, + DEFAULT_TIMEOUT, + project="other-project", ) def test_load_table_from_file_resumable_metadata(self): @@ -7288,7 +6534,7 @@ def test_load_table_from_file_resumable_metadata(self): config.null_marker = r"\N" expected_config = { - "jobReference": {"projectId": "project_id", "jobId": "job_id"}, + "jobReference": {"projectId": self.PROJECT, "jobId": "job_id"}, "configuration": { "load": { "destinationTable": { @@ -7321,7 +6567,11 @@ def test_load_table_from_file_resumable_metadata(self): ) do_upload.assert_called_once_with( - file_obj, expected_config, _DEFAULT_NUM_RETRIES + file_obj, + expected_config, + _DEFAULT_NUM_RETRIES, + DEFAULT_TIMEOUT, + project=self.EXPECTED_CONFIGURATION["jobReference"]["projectId"], ) def test_load_table_from_file_multipart(self): @@ -7345,7 +6595,12 @@ def test_load_table_from_file_multipart(self): ) do_upload.assert_called_once_with( - file_obj, self.EXPECTED_CONFIGURATION, file_obj_size, _DEFAULT_NUM_RETRIES + file_obj, + self.EXPECTED_CONFIGURATION, + file_obj_size, + _DEFAULT_NUM_RETRIES, + DEFAULT_TIMEOUT, + project=self.PROJECT, ) def test_load_table_from_file_with_retries(self): @@ -7366,7 +6621,11 @@ def test_load_table_from_file_with_retries(self): ) do_upload.assert_called_once_with( - file_obj, self.EXPECTED_CONFIGURATION, num_retries + file_obj, + self.EXPECTED_CONFIGURATION, + num_retries, + DEFAULT_TIMEOUT, + 
project=self.EXPECTED_CONFIGURATION["jobReference"]["projectId"], ) def test_load_table_from_file_with_rewind(self): @@ -7399,7 +6658,11 @@ def test_load_table_from_file_with_readable_gzip(self): ) do_upload.assert_called_once_with( - gzip_file, self.EXPECTED_CONFIGURATION, _DEFAULT_NUM_RETRIES + gzip_file, + self.EXPECTED_CONFIGURATION, + _DEFAULT_NUM_RETRIES, + DEFAULT_TIMEOUT, + project=self.EXPECTED_CONFIGURATION["jobReference"]["projectId"], ) def test_load_table_from_file_with_writable_gzip(self): @@ -7422,7 +6685,7 @@ def test_load_table_from_file_failure(self): file_obj = self._make_file_obj() response = self._make_response( - content="Someone is already in this spot.", status_code=http_client.CONFLICT + content="Someone is already in this spot.", status_code=http.client.CONFLICT ) do_upload_patch = self._make_do_upload_patch( @@ -7461,18 +6724,47 @@ def test_load_table_from_file_w_invalid_job_config(self): def test_load_table_from_dataframe(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job - from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.schema import PolicyTagList, SchemaField client = self._make_client() - records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] - dataframe = pandas.DataFrame(records) + records = [ + {"id": 1, "age": 100, "accounts": [2, 3]}, + {"id": 2, "age": 60, "accounts": [5]}, + {"id": 3, "age": 40, "accounts": []}, + ] + # Mixup column order so that we can verify sent schema matches the + # serialized order, not the table column order. 
+ column_order = ["age", "accounts", "id"] + dataframe = pandas.DataFrame(records, columns=column_order) + table_fields = { + "id": SchemaField( + "id", + "INTEGER", + mode="REQUIRED", + description="integer column", + policy_tags=PolicyTagList(names=("foo", "bar")), + ), + "age": SchemaField( + "age", + "INTEGER", + mode="NULLABLE", + description="age column", + policy_tags=PolicyTagList(names=("baz",)), + ), + "accounts": SchemaField( + "accounts", "INTEGER", mode="REPEATED", description="array column", + ), + } + get_table_schema = [ + table_fields["id"], + table_fields["age"], + table_fields["accounts"], + ] get_table_patch = mock.patch( "google.cloud.bigquery.client.Client.get_table", autospec=True, - return_value=mock.Mock( - schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")] - ), + return_value=mock.Mock(schema=get_table_schema), ) load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True @@ -7486,18 +6778,33 @@ def test_load_table_from_dataframe(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=None, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_file = load_table_from_file.mock_calls[0][1][1] assert sent_file.closed - sent_config = load_table_from_file.mock_calls[0][2]["job_config"] - assert sent_config.source_format == job.SourceFormat.PARQUET + sent_config = load_table_from_file.mock_calls[0][2]["job_config"].to_api_repr()[ + "load" + ] + assert sent_config["sourceFormat"] == job.SourceFormat.PARQUET + for field_index, field in enumerate(sent_config["schema"]["fields"]): + assert field["name"] == column_order[field_index] + table_field = table_fields[field["name"]] + assert field["name"] == table_field.name + assert field["type"] == table_field.field_type + assert field["mode"] == table_field.mode + assert len(field.get("fields", [])) == len(table_field.fields) + assert 
field["policyTags"]["names"] == [] + # Omit unnecessary fields when they come from getting the table + # (not passed in via job_config) + assert "description" not in field @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @@ -7529,11 +6836,13 @@ def test_load_table_from_dataframe_w_client_location(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_file = load_table_from_file.mock_calls[0][1][1] @@ -7581,11 +6890,13 @@ def test_load_table_from_dataframe_w_custom_job_config_wihtout_source_format(sel self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7635,11 +6946,13 @@ def test_load_table_from_dataframe_w_custom_job_config_w_source_format(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7702,7 +7015,7 @@ def test_load_table_from_dataframe_w_automatic_schema(self): datetime.datetime(2012, 3, 14, 15, 16), ], dtype="datetime64[ns]", - ).dt.tz_localize(pytz.utc), + ).dt.tz_localize(datetime.timezone.utc), ), ] ) @@ -7727,11 +7040,13 @@ def test_load_table_from_dataframe_w_automatic_schema(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7786,11 +7101,13 @@ def 
test_load_table_from_dataframe_w_index_and_auto_schema(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7799,89 +7116,18 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self): sent_schema = sorted(sent_config.schema, key=operator.attrgetter("name")) expected_sent_schema = [ SchemaField("float_col", "FLOAT"), - SchemaField("int_col", "INTEGER"), - SchemaField("unique_name", "STRING"), - ] - assert sent_schema == expected_sent_schema - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_unknown_table(self): - from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES - - client = self._make_client() - records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] - dataframe = pandas.DataFrame(records) - - get_table_patch = mock.patch( - "google.cloud.bigquery.client.Client.get_table", - autospec=True, - side_effect=google.api_core.exceptions.NotFound("Table not found"), - ) - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - with load_patch as load_table_from_file, get_table_patch: - # there should be no error - client.load_table_from_dataframe(dataframe, self.TABLE_REF) - - load_table_from_file.assert_called_once_with( - client, - mock.ANY, - self.TABLE_REF, - num_retries=_DEFAULT_NUM_RETRIES, - rewind=True, - job_id=mock.ANY, - job_id_prefix=None, - location=None, - project=None, - job_config=mock.ANY, - ) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") - def test_load_table_from_dataframe_no_pyarrow_warning(self): - from google.cloud.bigquery.client import PyarrowMissingWarning - - client = 
self._make_client() - - # Pick at least one column type that translates to Pandas dtype - # "object". A string column matches that. - records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] - dataframe = pandas.DataFrame(records) - - get_table_patch = mock.patch( - "google.cloud.bigquery.client.Client.get_table", - autospec=True, - side_effect=google.api_core.exceptions.NotFound("Table not found"), - ) - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - pyarrow_patch_helpers = mock.patch( - "google.cloud.bigquery._pandas_helpers.pyarrow", None - ) - catch_warnings = warnings.catch_warnings(record=True) - - with get_table_patch, load_patch, pyarrow_patch, pyarrow_patch_helpers, catch_warnings as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, location=self.LOCATION - ) - - matches = [ - warning for warning in warned if warning.category is PyarrowMissingWarning + SchemaField("int_col", "INTEGER"), + SchemaField("unique_name", "STRING"), ] - assert matches, "A missing pyarrow deprecation warning was not raised." + assert sent_schema == expected_sent_schema @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") - def test_load_table_from_dataframe_no_schema_warning_wo_pyarrow(self): - client = self._make_client() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_load_table_from_dataframe_unknown_table(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES - # Pick at least one column type that translates to Pandas dtype - # "object". A string column matches that. 
- records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] dataframe = pandas.DataFrame(records) get_table_patch = mock.patch( @@ -7892,25 +7138,24 @@ def test_load_table_from_dataframe_no_schema_warning_wo_pyarrow(self): load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - pyarrow_patch_helpers = mock.patch( - "google.cloud.bigquery._pandas_helpers.pyarrow", None - ) - catch_warnings = warnings.catch_warnings(record=True) - - with get_table_patch, load_patch, pyarrow_patch, pyarrow_patch_helpers, catch_warnings as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, location=self.LOCATION - ) + with load_patch as load_table_from_file, get_table_patch: + # there should be no error + client.load_table_from_dataframe(dataframe, self.TABLE_REF) - matches = [ - warning - for warning in warned - if warning.category in (DeprecationWarning, PendingDeprecationWarning) - and "could not be detected" in str(warning) - and "please provide a schema" in str(warning) - ] - assert matches, "A missing schema deprecation warning was not raised." 
+ load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=None, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) @unittest.skipIf( pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIUM_VERSION, @@ -7945,11 +7190,13 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7991,11 +7238,13 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8032,49 +7281,37 @@ def test_load_table_from_dataframe_struct_fields(self): "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) - if six.PY2: - with pytest.raises(ValueError) as exc_info, load_patch: - client.load_table_from_dataframe( - dataframe, - self.TABLE_REF, - job_config=job_config, - location=self.LOCATION, - ) - - err_msg = str(exc_info.value) - assert "struct" in err_msg - assert "not support" in err_msg - - else: - get_table_patch = mock.patch( - "google.cloud.bigquery.client.Client.get_table", - autospec=True, - side_effect=google.api_core.exceptions.NotFound("Table not found"), - ) - with load_patch as load_table_from_file, get_table_patch: - client.load_table_from_dataframe( - dataframe, - self.TABLE_REF, - job_config=job_config, - location=self.LOCATION, - ) - - load_table_from_file.assert_called_once_with( - client, - mock.ANY, + 
get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, - num_retries=_DEFAULT_NUM_RETRIES, - rewind=True, - job_id=mock.ANY, - job_id_prefix=None, + job_config=job_config, location=self.LOCATION, - project=None, - job_config=mock.ANY, ) - sent_config = load_table_from_file.mock_calls[0][2]["job_config"] - assert sent_config.source_format == job.SourceFormat.PARQUET - assert sent_config.schema == schema + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=self.LOCATION, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.PARQUET + assert sent_config.schema == schema @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @@ -8110,9 +7347,9 @@ def test_load_table_from_dataframe_w_partial_schema(self): datetime.datetime(2012, 3, 14, 15, 16), ], dtype="datetime64[ns]", - ).dt.tz_localize(pytz.utc), + ).dt.tz_localize(datetime.timezone.utc), ), - ("string_col", [u"abc", None, u"def"]), + ("string_col", ["abc", None, "def"]), ("bytes_col", [b"abc", b"def", None]), ] ) @@ -8138,11 +7375,13 @@ def test_load_table_from_dataframe_w_partial_schema(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8169,7 +7408,7 @@ def 
test_load_table_from_dataframe_w_partial_schema_extra_types(self): [ ("int_col", [1, 2, 3]), ("int_as_float_col", [1.0, float("nan"), 3.0]), - ("string_col", [u"abc", None, u"def"]), + ("string_col", ["abc", None, "def"]), ] ) dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) @@ -8196,7 +7435,6 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self): assert "unknown_col" in message @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") def test_load_table_from_dataframe_w_partial_schema_missing_types(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -8205,7 +7443,7 @@ def test_load_table_from_dataframe_w_partial_schema_missing_types(self): client = self._make_client() df_data = collections.OrderedDict( [ - ("string_col", [u"abc", u"def", u"ghi"]), + ("string_col", ["abc", "def", "ghi"]), ("unknown_col", [b"jkl", None, b"mno"]), ] ) @@ -8232,11 +7470,13 @@ def test_load_table_from_dataframe_w_partial_schema_missing_types(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) assert warned # there should be at least one warning @@ -8250,55 +7490,6 @@ def test_load_table_from_dataframe_w_partial_schema_missing_types(self): assert sent_config.source_format == job.SourceFormat.PARQUET assert sent_config.schema is None - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_schema_wo_pyarrow(self): - from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES - from google.cloud.bigquery import job - from google.cloud.bigquery.schema import SchemaField - - client = self._make_client() - records = [{"name": u"Monty", "age": 100}, {"name": u"Python", "age": 60}] - 
dataframe = pandas.DataFrame(records, columns=["name", "age"]) - schema = (SchemaField("name", "STRING"), SchemaField("age", "INTEGER")) - job_config = job.LoadJobConfig(schema=schema) - - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - - with load_patch as load_table_from_file, pyarrow_patch, warnings.catch_warnings( - record=True - ) as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION - ) - - assert warned # there should be at least one warning - for warning in warned: - assert "pyarrow" in str(warning) - assert issubclass( - warning.category, (DeprecationWarning, PendingDeprecationWarning) - ) - - load_table_from_file.assert_called_once_with( - client, - mock.ANY, - self.TABLE_REF, - num_retries=_DEFAULT_NUM_RETRIES, - rewind=True, - job_id=mock.ANY, - job_id_prefix=None, - location=self.LOCATION, - project=None, - job_config=mock.ANY, - ) - - sent_config = load_table_from_file.mock_calls[0][2]["job_config"] - assert sent_config.source_format == job.SourceFormat.PARQUET - assert tuple(sent_config.schema) == schema - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): @@ -8306,7 +7497,7 @@ def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): from google.cloud.bigquery.schema import SchemaField client = self._make_client() - records = [{"name": u"Monty", "age": 100}, {"name": u"Python", "age": 60}] + records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] dataframe = pandas.DataFrame(records) schema = (SchemaField("name", "STRING"), SchemaField("age", "INTEGER")) job_config = job.LoadJobConfig(schema=schema) @@ -8334,7 +7525,7 @@ def 
test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self): + def test_load_table_from_dataframe_wo_pyarrow_raises_error(self): client = self._make_client() records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] dataframe = pandas.DataFrame(records) @@ -8352,8 +7543,8 @@ def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self): dataframe, "to_parquet", wraps=dataframe.to_parquet ) - with load_patch, get_table_patch, pyarrow_patch, to_parquet_patch as to_parquet_spy: - with warnings.catch_warnings(record=True) as warned: + with load_patch, get_table_patch, pyarrow_patch, to_parquet_patch: + with pytest.raises(ValueError): client.load_table_from_dataframe( dataframe, self.TABLE_REF, @@ -8361,15 +7552,41 @@ def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self): parquet_compression="gzip", ) - call_args = to_parquet_spy.call_args - assert call_args is not None - assert call_args.kwargs.get("compression") == "gzip" + def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self): + pytest.importorskip("pandas", reason="Requires `pandas`") + pytest.importorskip("pyarrow", reason="Requires `pyarrow`") + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + pyarrow_version_patch = mock.patch( + "google.cloud.bigquery.client._PYARROW_VERSION", + packaging.version.parse("2.0.0"), # A known bad version of pyarrow. 
+ ) + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) - assert len(warned) == 2 - warning = warned[0] - assert "Loading dataframe data without pyarrow" in str(warning) - warning = warned[1] - assert "Please install the pyarrow package" in str(warning) + with load_patch, get_table_patch, pyarrow_version_patch: + with warnings.catch_warnings(record=True) as warned: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, location=self.LOCATION, + ) + + expected_warnings = [ + warning for warning in warned if "pyarrow" in str(warning).lower() + ] + assert len(expected_warnings) == 1 + assert issubclass(expected_warnings[0].category, RuntimeWarning) + msg = str(expected_warnings[0].message) + assert "pyarrow 2.0.0" in msg + assert "data corruption" in msg @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @@ -8403,11 +7620,13 @@ def test_load_table_from_dataframe_w_nulls(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8432,6 +7651,56 @@ def test_load_table_from_dataframe_w_invaild_job_config(self): err_msg = str(exc.value) assert "Expected an instance of LoadJobConfig" in err_msg + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_with_csv_source_format(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + 
dataframe = pandas.DataFrame(records) + job_config = job.LoadJobConfig( + write_disposition=job.WriteDisposition.WRITE_TRUNCATE, + source_format=job.SourceFormat.CSV, + ) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock( + schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")] + ), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=None, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_file = load_table_from_file.mock_calls[0][1][1] + assert sent_file.closed + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.CSV + def test_load_table_from_json_basic_use(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -8454,12 +7723,14 @@ def test_load_table_from_json_basic_use(self): client, mock.ANY, self.TABLE_REF, + size=mock.ANY, num_retries=_DEFAULT_NUM_RETRIES, job_id=mock.ANY, job_id_prefix=None, location=client.location, project=client.project, job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8505,12 +7776,14 @@ def test_load_table_from_json_non_default_args(self): client, mock.ANY, self.TABLE_REF, + size=mock.ANY, num_retries=_DEFAULT_NUM_RETRIES, job_id=mock.ANY, job_id_prefix=None, location="EU", project="project-x", job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ 
-8543,6 +7816,42 @@ def test_load_table_from_json_w_invalid_job_config(self): err_msg = str(exc.value) assert "Expected an instance of LoadJobConfig" in err_msg + def test_load_table_from_json_unicode_emoji_data_case(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + + client = self._make_client() + + emoji = "\U0001F3E6" + json_row = {"emoji": emoji} + json_rows = [json_row] + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + with load_patch as load_table_from_file: + client.load_table_from_json(json_rows, self.TABLE_REF) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_data_file = load_table_from_file.mock_calls[0][1][1] + + # make sure json_row's unicode characters are only encoded one time + expected_bytes = b'{"emoji": "' + emoji.encode("utf8") + b'"}' + assert sent_data_file.getvalue() == expected_bytes + # Low-level tests @classmethod @@ -8552,7 +7861,7 @@ def _make_resumable_upload_responses(cls, size): resumable_url = "http://test.invalid?upload_id=and-then-there-was-1" initial_response = cls._make_response( - http_client.OK, "", {"location": resumable_url} + http.client.OK, "", {"location": resumable_url} ) data_response = cls._make_response( resumable_media.PERMANENT_REDIRECT, @@ -8560,7 +7869,7 @@ def _make_resumable_upload_responses(cls, size): {"range": "bytes=0-{:d}".format(size - 1)}, ) final_response = cls._make_response( - http_client.OK, + http.client.OK, json.dumps({"size": size}), {"Content-Type": "application/json"}, ) @@ -8585,7 +7894,32 @@ def test__do_resumable_upload(self): client = self._make_client(transport) result = client._do_resumable_upload( - file_obj, self.EXPECTED_CONFIGURATION, None + file_obj, 
self.EXPECTED_CONFIGURATION, None, None + ) + + content = result.content.decode("utf-8") + assert json.loads(content) == {"size": file_obj_len} + + # Verify that configuration data was passed in with the initial + # request. + transport.request.assert_any_call( + "POST", + mock.ANY, + data=json.dumps(self.EXPECTED_CONFIGURATION).encode("utf-8"), + headers=mock.ANY, + timeout=mock.ANY, + ) + + def test__do_resumable_upload_custom_project(self): + file_obj = self._make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = self._make_transport( + self._make_resumable_upload_responses(file_obj_len) + ) + client = self._make_client(transport) + + result = client._do_resumable_upload( + file_obj, self.EXPECTED_CONFIGURATION, None, None, project="custom-project", ) content = result.content.decode("utf-8") @@ -8601,14 +7935,27 @@ def test__do_resumable_upload(self): timeout=mock.ANY, ) + # Check the project ID used in the call to initiate resumable upload. + initiation_url = next( + ( + call.args[1] + for call in transport.request.call_args_list + if call.args[0] == "POST" and "uploadType=resumable" in call.args[1] + ), + None, + ) # pragma: NO COVER + + assert initiation_url is not None + assert "projects/custom-project" in initiation_url + def test__do_multipart_upload(self): - transport = self._make_transport([self._make_response(http_client.OK)]) + transport = self._make_transport([self._make_response(http.client.OK)]) client = self._make_client(transport) file_obj = self._make_file_obj() file_obj_len = len(file_obj.getvalue()) client._do_multipart_upload( - file_obj, self.EXPECTED_CONFIGURATION, file_obj_len, None + file_obj, self.EXPECTED_CONFIGURATION, file_obj_len, None, None ) # Verify that configuration data was passed in with the initial @@ -8636,7 +7983,7 @@ def test__do_multipart_upload_wrong_size(self): file_obj_len = len(file_obj.getvalue()) with pytest.raises(ValueError): - client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None) + 
client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None, None) def test_schema_from_json_with_file_path(self): from google.cloud.bigquery.schema import SchemaField @@ -8671,14 +8018,9 @@ def test_schema_from_json_with_file_path(self): client = self._make_client() mock_file_path = "/mocked/file.json" - if six.PY2: - open_patch = mock.patch( - "__builtin__.open", mock.mock_open(read_data=file_content) - ) - else: - open_patch = mock.patch( - "builtins.open", new=mock.mock_open(read_data=file_content) - ) + open_patch = mock.patch( + "builtins.open", new=mock.mock_open(read_data=file_content) + ) with open_patch as _mock_file: actual = client.schema_from_json(mock_file_path) @@ -8720,12 +8062,7 @@ def test_schema_from_json_with_file_object(self): ] client = self._make_client() - - if six.PY2: - fake_file = io.BytesIO(file_content) - else: - fake_file = io.StringIO(file_content) - + fake_file = io.StringIO(file_content) actual = client.schema_from_json(fake_file) assert expected == actual @@ -8738,18 +8075,21 @@ def test_schema_to_json_with_file_path(self): "description": "quarter", "mode": "REQUIRED", "name": "qtr", + "policyTags": {"names": []}, "type": "STRING", }, { "description": "sales representative", "mode": "NULLABLE", "name": "rep", + "policyTags": {"names": []}, "type": "STRING", }, { "description": "total sales", "mode": "NULLABLE", "name": "sales", + "policyTags": {"names": []}, "type": "FLOAT", }, ] @@ -8762,11 +8102,7 @@ def test_schema_to_json_with_file_path(self): client = self._make_client() mock_file_path = "/mocked/file.json" - - if six.PY2: - open_patch = mock.patch("__builtin__.open", mock.mock_open()) - else: - open_patch = mock.patch("builtins.open", mock.mock_open()) + open_patch = mock.patch("builtins.open", mock.mock_open()) with open_patch as mock_file, mock.patch("json.dump") as mock_dump: client.schema_to_json(schema_list, mock_file_path) @@ -8786,18 +8122,21 @@ def test_schema_to_json_with_file_object(self): "description": 
"quarter", "mode": "REQUIRED", "name": "qtr", + "policyTags": {"names": []}, "type": "STRING", }, { "description": "sales representative", "mode": "NULLABLE", "name": "rep", + "policyTags": {"names": []}, "type": "STRING", }, { "description": "total sales", "mode": "NULLABLE", "name": "sales", + "policyTags": {"names": []}, "type": "FLOAT", }, ] @@ -8808,12 +8147,29 @@ def test_schema_to_json_with_file_object(self): SchemaField("sales", "FLOAT", "NULLABLE", "total sales"), ] - if six.PY2: - fake_file = io.BytesIO() - else: - fake_file = io.StringIO() + fake_file = io.StringIO() client = self._make_client() client.schema_to_json(schema_list, fake_file) assert file_content == json.loads(fake_file.getvalue()) + + +def test_upload_chunksize(client): + with mock.patch("google.cloud.bigquery.client.ResumableUpload") as RU: + upload = RU.return_value + + upload.finished = False + + def transmit_next_chunk(transport): + upload.finished = True + result = mock.MagicMock() + result.json.return_value = {} + return result + + upload.transmit_next_chunk = transmit_next_chunk + f = io.BytesIO() + client.load_table_from_file(f, "foo.bar") + + chunk_size = RU.call_args_list[0][0][1] + assert chunk_size == 100 * (1 << 20) diff --git a/tests/unit/test_create_dataset.py b/tests/unit/test_create_dataset.py new file mode 100644 index 000000000..67b21225d --- /dev/null +++ b/tests/unit/test_create_dataset.py @@ -0,0 +1,364 @@ +# Copyright 2021 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from google.cloud.bigquery.dataset import Dataset, DatasetReference +from .helpers import make_connection, dataset_polymorphic, make_client +import google.cloud.bigquery.dataset +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +import mock +import pytest + + +@dataset_polymorphic +def test_create_dataset_minimal(make_dataset, get_reference, client, PROJECT, DS_ID): + PATH = "projects/%s/datasets" % PROJECT + RESOURCE = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + } + conn = client._connection = make_connection(RESOURCE) + + dataset = make_dataset(PROJECT, DS_ID) + after = client.create_dataset(dataset, timeout=7.5) + + assert after.dataset_id == DS_ID + assert after.project == PROJECT + assert after.etag == RESOURCE["etag"] + assert after.full_dataset_id == RESOURCE["id"] + + conn.api_request.assert_called_once_with( + method="POST", + path="/%s" % PATH, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + }, + timeout=7.5, + ) + + +def test_create_dataset_w_attrs(client, PROJECT, DS_ID): + from google.cloud.bigquery.dataset import AccessEntry + + PATH = "projects/%s/datasets" % PROJECT + DESCRIPTION = "DESC" + FRIENDLY_NAME = "FN" + LOCATION = "US" + USER_EMAIL = "phred@example.com" + LABELS = {"color": "red"} + VIEW = { + "projectId": "my-proj", + "datasetId": "starry-skies", + "tableId": "northern-hemisphere", + } + RESOURCE = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + "description": DESCRIPTION, + "friendlyName": FRIENDLY_NAME, + "location": LOCATION, + "defaultTableExpirationMs": "3600", + "labels": LABELS, + "access": [{"role": "OWNER", "userByEmail": USER_EMAIL}, {"view": VIEW}], + } + conn = client._connection = make_connection(RESOURCE) + entries 
= [ + AccessEntry("OWNER", "userByEmail", USER_EMAIL), + AccessEntry(None, "view", VIEW), + ] + + ds_ref = DatasetReference(PROJECT, DS_ID) + before = Dataset(ds_ref) + before.access_entries = entries + before.description = DESCRIPTION + before.friendly_name = FRIENDLY_NAME + before.default_table_expiration_ms = 3600 + before.location = LOCATION + before.labels = LABELS + after = client.create_dataset(before) + + assert after.dataset_id == DS_ID + assert after.project == PROJECT + assert after.etag == RESOURCE["etag"] + assert after.full_dataset_id == RESOURCE["id"] + assert after.description == DESCRIPTION + assert after.friendly_name == FRIENDLY_NAME + assert after.location == LOCATION + assert after.default_table_expiration_ms == 3600 + assert after.labels == LABELS + + conn.api_request.assert_called_once_with( + method="POST", + path="/%s" % PATH, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "description": DESCRIPTION, + "friendlyName": FRIENDLY_NAME, + "location": LOCATION, + "defaultTableExpirationMs": "3600", + "access": [{"role": "OWNER", "userByEmail": USER_EMAIL}, {"view": VIEW}], + "labels": LABELS, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_w_custom_property(client, PROJECT, DS_ID): + # The library should handle sending properties to the API that are not + # yet part of the library + + path = "/projects/%s/datasets" % PROJECT + resource = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "newAlphaProperty": "unreleased property", + } + conn = client._connection = make_connection(resource) + + ds_ref = DatasetReference(PROJECT, DS_ID) + before = Dataset(ds_ref) + before._properties["newAlphaProperty"] = "unreleased property" + after = client.create_dataset(before) + + assert after.dataset_id == DS_ID + assert after.project == PROJECT + assert after._properties["newAlphaProperty"] == "unreleased property" + + conn.api_request.assert_called_once_with( + method="POST", + path=path, + 
data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "newAlphaProperty": "unreleased property", + "labels": {}, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_w_client_location_wo_dataset_location(PROJECT, DS_ID, LOCATION): + PATH = "projects/%s/datasets" % PROJECT + RESOURCE = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + "location": LOCATION, + } + client = make_client(location=LOCATION) + conn = client._connection = make_connection(RESOURCE) + + ds_ref = DatasetReference(PROJECT, DS_ID) + before = Dataset(ds_ref) + after = client.create_dataset(before) + + assert after.dataset_id == DS_ID + assert after.project == PROJECT + assert after.etag == RESOURCE["etag"] + assert after.full_dataset_id == RESOURCE["id"] + assert after.location == LOCATION + + conn.api_request.assert_called_once_with( + method="POST", + path="/%s" % PATH, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + "location": LOCATION, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_w_client_location_w_dataset_location(PROJECT, DS_ID, LOCATION): + PATH = "projects/%s/datasets" % PROJECT + OTHER_LOCATION = "EU" + RESOURCE = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + "location": OTHER_LOCATION, + } + client = make_client(location=LOCATION) + conn = client._connection = make_connection(RESOURCE) + + ds_ref = DatasetReference(PROJECT, DS_ID) + before = Dataset(ds_ref) + before.location = OTHER_LOCATION + after = client.create_dataset(before) + + assert after.dataset_id == DS_ID + assert after.project == PROJECT + assert after.etag == RESOURCE["etag"] + assert after.full_dataset_id == RESOURCE["id"] + assert after.location == OTHER_LOCATION + + conn.api_request.assert_called_once_with( + method="POST", + path="/%s" % PATH, + data={ + 
"datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + "location": OTHER_LOCATION, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_w_reference(PROJECT, DS_ID, LOCATION): + path = "/projects/%s/datasets" % PROJECT + resource = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + "location": LOCATION, + } + client = make_client(location=LOCATION) + conn = client._connection = make_connection(resource) + dataset = client.create_dataset(DatasetReference(PROJECT, DS_ID)) + + assert dataset.dataset_id == DS_ID + assert dataset.project == PROJECT + assert dataset.etag == resource["etag"] + assert dataset.full_dataset_id == resource["id"] + assert dataset.location == LOCATION + + conn.api_request.assert_called_once_with( + method="POST", + path=path, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + "location": LOCATION, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_w_fully_qualified_string(PROJECT, DS_ID, LOCATION): + path = "/projects/%s/datasets" % PROJECT + resource = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + "location": LOCATION, + } + client = make_client(location=LOCATION) + conn = client._connection = make_connection(resource) + dataset = client.create_dataset("{}.{}".format(PROJECT, DS_ID)) + + assert dataset.dataset_id == DS_ID + assert dataset.project == PROJECT + assert dataset.etag == resource["etag"] + assert dataset.full_dataset_id == resource["id"] + assert dataset.location == LOCATION + + conn.api_request.assert_called_once_with( + method="POST", + path=path, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + "location": LOCATION, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_w_string(PROJECT, DS_ID, LOCATION): + path = 
"/projects/%s/datasets" % PROJECT + resource = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "%s:%s" % (PROJECT, DS_ID), + "location": LOCATION, + } + client = make_client(location=LOCATION) + conn = client._connection = make_connection(resource) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + dataset = client.create_dataset(DS_ID) + + final_attributes.assert_called_once_with({"path": path}, client, None) + + assert dataset.dataset_id == DS_ID + assert dataset.project == PROJECT + assert dataset.etag == resource["etag"] + assert dataset.full_dataset_id == resource["id"] + assert dataset.location == LOCATION + + conn.api_request.assert_called_once_with( + method="POST", + path=path, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + "location": LOCATION, + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_create_dataset_alreadyexists_w_exists_ok_false(PROJECT, DS_ID, LOCATION): + client = make_client(location=LOCATION) + client._connection = make_connection( + google.api_core.exceptions.AlreadyExists("dataset already exists") + ) + + with pytest.raises(google.api_core.exceptions.AlreadyExists): + client.create_dataset(DS_ID) + + +def test_create_dataset_alreadyexists_w_exists_ok_true(PROJECT, DS_ID, LOCATION): + post_path = "/projects/{}/datasets".format(PROJECT) + get_path = "/projects/{}/datasets/{}".format(PROJECT, DS_ID) + resource = { + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "etag": "etag", + "id": "{}:{}".format(PROJECT, DS_ID), + "location": LOCATION, + } + client = make_client(location=LOCATION) + conn = client._connection = make_connection( + google.api_core.exceptions.AlreadyExists("dataset already exists"), resource + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + dataset = 
client.create_dataset(DS_ID, exists_ok=True) + + final_attributes.assert_called_with({"path": get_path}, client, None) + + assert dataset.dataset_id == DS_ID + assert dataset.project == PROJECT + assert dataset.etag == resource["etag"] + assert dataset.full_dataset_id == resource["id"] + assert dataset.location == LOCATION + + conn.api_request.assert_has_calls( + [ + mock.call( + method="POST", + path=post_path, + data={ + "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID}, + "labels": {}, + "location": LOCATION, + }, + timeout=DEFAULT_TIMEOUT, + ), + mock.call(method="GET", path=get_path, timeout=DEFAULT_TIMEOUT), + ] + ) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index e4977a270..b3a53a08d 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -53,6 +53,21 @@ def test_ctor_view_success(self): self.assertEqual(entry.entity_type, entity_type) self.assertEqual(entry.entity_id, entity_id) + def test_ctor_routine_with_role(self): + role = "READER" + entity_type = "routine" + with self.assertRaises(ValueError): + self._make_one(role, entity_type, None) + + def test_ctor_routine_success(self): + role = None + entity_type = "routine" + entity_id = object() + entry = self._make_one(role, entity_type, entity_id) + self.assertEqual(entry.role, role) + self.assertEqual(entry.entity_type, entity_type) + self.assertEqual(entry.entity_id, entity_id) + def test_ctor_nonview_without_role(self): role = None entity_type = "userByEmail" @@ -115,6 +130,17 @@ def test_to_api_repr_view(self): exp_resource = {"view": view} self.assertEqual(resource, exp_resource) + def test_to_api_repr_routine(self): + routine = { + "projectId": "my-project", + "datasetId": "my_dataset", + "routineId": "my_routine", + } + entry = self._make_one(None, "routine", routine) + resource = entry.to_api_repr() + exp_resource = {"routine": routine} + self.assertEqual(resource, exp_resource) + def test_from_api_repr(self): resource = {"role": "OWNER", 
"userByEmail": "salmon@example.com"} entry = self._get_target_class().from_api_repr(resource) diff --git a/tests/unit/test_dbapi__helpers.py b/tests/unit/test_dbapi__helpers.py index 08dd6dcfa..b33203354 100644 --- a/tests/unit/test_dbapi__helpers.py +++ b/tests/unit/test_dbapi__helpers.py @@ -16,17 +16,18 @@ import decimal import math import operator as op +import re import unittest +import pytest + try: import pyarrow except ImportError: # pragma: NO COVER pyarrow = None -import six - import google.cloud._helpers -from google.cloud.bigquery import table +from google.cloud.bigquery import table, enums from google.cloud.bigquery.dbapi import _helpers from google.cloud.bigquery.dbapi import exceptions from tests.unit.helpers import _to_pyarrow @@ -40,9 +41,8 @@ def test_scalar_to_query_parameter(self): (123, "INT64"), (-123456789, "INT64"), (1.25, "FLOAT64"), - (decimal.Decimal("1.25"), "NUMERIC"), (b"I am some bytes", "BYTES"), - (u"I am a string", "STRING"), + ("I am a string", "STRING"), (datetime.date(2017, 4, 1), "DATE"), (datetime.time(12, 34, 56), "TIME"), (datetime.datetime(2012, 3, 4, 5, 6, 7), "DATETIME"), @@ -52,7 +52,18 @@ def test_scalar_to_query_parameter(self): ), "TIMESTAMP", ), + (decimal.Decimal("1.25"), "NUMERIC"), + (decimal.Decimal("9.9999999999999999999999999999999999999E+28"), "NUMERIC"), + (decimal.Decimal("1.0E+29"), "BIGNUMERIC"), # more than max NUMERIC value + (decimal.Decimal("1.123456789"), "NUMERIC"), + (decimal.Decimal("1.1234567891"), "BIGNUMERIC"), # scale > 9 + (decimal.Decimal("12345678901234567890123456789.012345678"), "NUMERIC"), + ( + decimal.Decimal("12345678901234567890123456789012345678"), + "BIGNUMERIC", # larger than max NUMERIC value, despite precision <=38 + ), ] + for value, expected_type in expected_types: msg = "value: {} expected_type: {}".format(value, expected_type) parameter = _helpers.scalar_to_query_parameter(value) @@ -82,8 +93,9 @@ def test_array_to_query_parameter_valid_argument(self): ([123, -456, 0], 
"INT64"), ([1.25, 2.50], "FLOAT64"), ([decimal.Decimal("1.25")], "NUMERIC"), + ([decimal.Decimal("{d38}.{d38}".format(d38="9" * 38))], "BIGNUMERIC"), ([b"foo", b"bar"], "BYTES"), - ([u"foo", u"bar"], "STRING"), + (["foo", "bar"], "STRING"), ([datetime.date(2017, 4, 1), datetime.date(2018, 4, 1)], "DATE"), ([datetime.time(12, 34, 56), datetime.time(10, 20, 30)], "TIME"), ( @@ -122,7 +134,7 @@ def test_array_to_query_parameter_empty_argument(self): _helpers.array_to_query_parameter([]) def test_array_to_query_parameter_unsupported_sequence(self): - unsupported_iterables = [{10, 20, 30}, u"foo", b"bar", bytearray([65, 75, 85])] + unsupported_iterables = [{10, 20, 30}, "foo", b"bar", bytearray([65, 75, 85])] for iterable in unsupported_iterables: with self.assertRaises(exceptions.ProgrammingError): _helpers.array_to_query_parameter(iterable) @@ -132,8 +144,8 @@ def test_array_to_query_parameter_sequence_w_invalid_elements(self): _helpers.array_to_query_parameter([object(), 2, 7]) def test_to_query_parameters_w_dict(self): - parameters = {"somebool": True, "somestring": u"a-string-value"} - query_parameters = _helpers.to_query_parameters(parameters) + parameters = {"somebool": True, "somestring": "a-string-value"} + query_parameters = _helpers.to_query_parameters(parameters, {}) query_parameter_tuples = [] for param in query_parameters: query_parameter_tuples.append((param.name, param.type_, param.value)) @@ -142,14 +154,14 @@ def test_to_query_parameters_w_dict(self): sorted( [ ("somebool", "BOOL", True), - ("somestring", "STRING", u"a-string-value"), + ("somestring", "STRING", "a-string-value"), ] ), ) def test_to_query_parameters_w_dict_array_param(self): parameters = {"somelist": [10, 20]} - query_parameters = _helpers.to_query_parameters(parameters) + query_parameters = _helpers.to_query_parameters(parameters, {}) self.assertEqual(len(query_parameters), 1) param = query_parameters[0] @@ -162,22 +174,22 @@ def test_to_query_parameters_w_dict_dict_param(self): 
parameters = {"my_param": {"foo": "bar"}} with self.assertRaises(NotImplementedError): - _helpers.to_query_parameters(parameters) + _helpers.to_query_parameters(parameters, {}) def test_to_query_parameters_w_list(self): - parameters = [True, u"a-string-value"] - query_parameters = _helpers.to_query_parameters(parameters) + parameters = [True, "a-string-value"] + query_parameters = _helpers.to_query_parameters(parameters, [None, None]) query_parameter_tuples = [] for param in query_parameters: query_parameter_tuples.append((param.name, param.type_, param.value)) self.assertSequenceEqual( sorted(query_parameter_tuples), - sorted([(None, "BOOL", True), (None, "STRING", u"a-string-value")]), + sorted([(None, "BOOL", True), (None, "STRING", "a-string-value")]), ) def test_to_query_parameters_w_list_array_param(self): parameters = [[10, 20]] - query_parameters = _helpers.to_query_parameters(parameters) + query_parameters = _helpers.to_query_parameters(parameters, [None]) self.assertEqual(len(query_parameters), 1) param = query_parameters[0] @@ -190,10 +202,10 @@ def test_to_query_parameters_w_list_dict_param(self): parameters = [{"foo": "bar"}] with self.assertRaises(NotImplementedError): - _helpers.to_query_parameters(parameters) + _helpers.to_query_parameters(parameters, [None]) def test_to_query_parameters_none_argument(self): - query_parameters = _helpers.to_query_parameters(None) + query_parameters = _helpers.to_query_parameters(None, None) self.assertEqual(query_parameters, []) @@ -293,7 +305,7 @@ def test_public_instance_methods_on_closed_instance(self): instance = decorated_class() instance._closed = True - with six.assertRaisesRegex(self, exceptions.ProgrammingError, "I'm closed!"): + with self.assertRaisesRegex(exceptions.ProgrammingError, "I'm closed!"): instance.instance_method() def test_methods_wo_public_instance_methods_on_closed_instance(self): @@ -316,7 +328,7 @@ def test_custom_class_closed_attribute(self): instance._closed = False 
instance._really_closed = True - with six.assertRaisesRegex(self, exceptions.ProgrammingError, "I'm closed!"): + with self.assertRaisesRegex(exceptions.ProgrammingError, "I'm closed!"): instance.instance_method() def test_custom_on_closed_error_type(self): @@ -327,5 +339,358 @@ def test_custom_on_closed_error_type(self): instance = decorated_class() instance._closed = True - with six.assertRaisesRegex(self, RuntimeError, "I'm closed!"): + with self.assertRaisesRegex(RuntimeError, "I'm closed!"): instance.instance_method() + + +VALID_BQ_TYPES = [ + (name, getattr(enums.SqlParameterScalarTypes, name)._type) + for name in dir(enums.SqlParameterScalarTypes) + if not name.startswith("_") +] + + +@pytest.mark.parametrize("alias, type_", VALID_BQ_TYPES) +def test_scalar_to_query_parameter_honors_given_type(alias, type_): + from google.cloud import bigquery + + assert _helpers.scalar_to_query_parameter(1.23, None, alias) == ( + bigquery.ScalarQueryParameter(None, type_, 1.23) + ) + assert _helpers.scalar_to_query_parameter(None, "foo", alias) == ( + bigquery.ScalarQueryParameter("foo", type_, None) + ) + + +def test_scalar_to_query_parameter_honors_given_type_errors_on_invalid(): + with pytest.raises( + google.cloud.bigquery.dbapi.exceptions.ProgrammingError, + match="The given parameter type, INT, for foo is not a valid BigQuery scalar type.", + ): + _helpers.scalar_to_query_parameter(None, "foo", "INT") + + +@pytest.mark.parametrize("alias, type_", VALID_BQ_TYPES) +def test_array_to_query_parameter_honors_given_type(alias, type_): + from google.cloud import bigquery + + assert _helpers.array_to_query_parameter([1.23], None, alias) == ( + bigquery.ArrayQueryParameter(None, type_, [1.23]) + ) + assert _helpers.array_to_query_parameter((), "foo", alias) == ( + bigquery.ArrayQueryParameter("foo", type_, ()) + ) + + +def test_array_to_query_parameter_honors_given_type_errors_on_invalid(): + with pytest.raises( + google.cloud.bigquery.dbapi.exceptions.ProgrammingError, + 
match="The given parameter type, INT, for foo is not a valid BigQuery scalar type.", + ): + _helpers.array_to_query_parameter((), "foo", "INT") + + +def test_to_query_parameters_dict_w_types(): + from google.cloud import bigquery + + assert sorted( + _helpers.to_query_parameters( + dict(i=1, x=1.2, y=None, q="hi", z=[]), + dict(x="numeric", y="string", q="string(9)", z="float64"), + ), + key=lambda p: p.name, + ) == [ + bigquery.ScalarQueryParameter("i", "INT64", 1), + bigquery.ScalarQueryParameter("q", "STRING", "hi"), + bigquery.ScalarQueryParameter("x", "NUMERIC", 1.2), + bigquery.ScalarQueryParameter("y", "STRING", None), + bigquery.ArrayQueryParameter("z", "FLOAT64", []), + ] + + +def test_to_query_parameters_list_w_types(): + from google.cloud import bigquery + + assert _helpers.to_query_parameters( + [1, 1.2, None, "hi", []], [None, "numeric", "string", "string(9)", "float64"] + ) == [ + bigquery.ScalarQueryParameter(None, "INT64", 1), + bigquery.ScalarQueryParameter(None, "NUMERIC", 1.2), + bigquery.ScalarQueryParameter(None, "STRING", None), + bigquery.ScalarQueryParameter(None, "STRING", "hi"), + bigquery.ArrayQueryParameter(None, "FLOAT64", []), + ] + + +@pytest.mark.parametrize( + "value,type_,expect", + [ + ( + [], + "ARRAY", + { + "parameterType": {"type": "ARRAY", "arrayType": {"type": "INT64"}}, + "parameterValue": {"arrayValues": []}, + }, + ), + ( + [1, 2], + "ARRAY", + { + "parameterType": {"type": "ARRAY", "arrayType": {"type": "INT64"}}, + "parameterValue": {"arrayValues": [{"value": "1"}, {"value": "2"}]}, + }, + ), + ( + dict( + name="par", + children=[ + dict(name="ch1", bdate=datetime.date(2021, 1, 1)), + dict(name="ch2", bdate=datetime.date(2021, 1, 2)), + ], + ), + "struct>>", + { + "parameterType": { + "structTypes": [ + {"name": "name", "type": {"type": "STRING"}}, + { + "name": "children", + "type": { + "arrayType": { + "structTypes": [ + {"name": "name", "type": {"type": "STRING"}}, + {"name": "bdate", "type": {"type": "DATE"}}, + ], 
+ "type": "STRUCT", + }, + "type": "ARRAY", + }, + }, + ], + "type": "STRUCT", + }, + "parameterValue": { + "structValues": { + "children": { + "arrayValues": [ + { + "structValues": { + "bdate": {"value": "2021-01-01"}, + "name": {"value": "ch1"}, + } + }, + { + "structValues": { + "bdate": {"value": "2021-01-02"}, + "name": {"value": "ch2"}, + } + }, + ] + }, + "name": {"value": "par"}, + } + }, + }, + ), + ( + dict( + name="par", + children=[ + dict(name="ch1", bdate=datetime.date(2021, 1, 1)), + dict(name="ch2", bdate=datetime.date(2021, 1, 2)), + ], + ), + "struct>>", + { + "parameterType": { + "structTypes": [ + {"name": "name", "type": {"type": "STRING"}}, + { + "name": "children", + "type": { + "arrayType": { + "structTypes": [ + {"name": "name", "type": {"type": "STRING"}}, + {"name": "bdate", "type": {"type": "DATE"}}, + ], + "type": "STRUCT", + }, + "type": "ARRAY", + }, + }, + ], + "type": "STRUCT", + }, + "parameterValue": { + "structValues": { + "children": { + "arrayValues": [ + { + "structValues": { + "bdate": {"value": "2021-01-01"}, + "name": {"value": "ch1"}, + } + }, + { + "structValues": { + "bdate": {"value": "2021-01-02"}, + "name": {"value": "ch2"}, + } + }, + ] + }, + "name": {"value": "par"}, + } + }, + }, + ), + ( + ["1", "hi"], + "ARRAY", + { + "parameterType": {"type": "ARRAY", "arrayType": {"type": "STRING"}}, + "parameterValue": {"arrayValues": [{"value": "1"}, {"value": "hi"}]}, + }, + ), + ], +) +def test_complex_query_parameter_type(type_, value, expect): + from google.cloud.bigquery.dbapi._helpers import complex_query_parameter + + param = complex_query_parameter("test", value, type_).to_api_repr() + assert param.pop("name") == "test" + assert param == expect + + +def _expected_error_match(expect): + return "^" + re.escape(expect) + "$" + + +@pytest.mark.parametrize( + "value,type_,expect", + [ + ( + [], + "ARRAY", + "The given parameter type, INT," + " is not a valid BigQuery scalar type, in ARRAY.", + ), + ([], "x", "Invalid 
parameter type, x"), + ({}, "struct", "Invalid struct field, int, in struct"), + ( + {"x": 1}, + "struct", + "The given parameter type, int," + " for x is not a valid BigQuery scalar type, in struct.", + ), + ([], "x<", "Invalid parameter type, x<"), + (0, "ARRAY", "Array type with non-array-like value with type int"), + ( + [], + "ARRAY>", + "Array can't contain an array in ARRAY>", + ), + ([], "struct", "Non-mapping value for type struct"), + ({}, "struct", "No field value for x in struct"), + ({"x": 1, "y": 1}, "struct", "Extra data keys for struct"), + ([], "array>", "Invalid struct field, xxx, in array>"), + ([], "array<<>>", "Invalid parameter type, <>"), + ], +) +def test_complex_query_parameter_type_errors(type_, value, expect): + from google.cloud.bigquery.dbapi._helpers import complex_query_parameter + from google.cloud.bigquery.dbapi import exceptions + + with pytest.raises( + exceptions.ProgrammingError, match=_expected_error_match(expect), + ): + complex_query_parameter("test", value, type_) + + +@pytest.mark.parametrize( + "parameters,parameter_types,expect", + [ + ( + [[], dict(name="ch1", bdate=datetime.date(2021, 1, 1))], + ["ARRAY", "struct"], + [ + { + "parameterType": {"arrayType": {"type": "INT64"}, "type": "ARRAY"}, + "parameterValue": {"arrayValues": []}, + }, + { + "parameterType": { + "structTypes": [ + {"name": "name", "type": {"type": "STRING"}}, + {"name": "bdate", "type": {"type": "DATE"}}, + ], + "type": "STRUCT", + }, + "parameterValue": { + "structValues": { + "bdate": {"value": "2021-01-01"}, + "name": {"value": "ch1"}, + } + }, + }, + ], + ), + ( + dict(ids=[], child=dict(name="ch1", bdate=datetime.date(2021, 1, 1))), + dict(ids="ARRAY", child="struct"), + [ + { + "name": "ids", + "parameterType": {"arrayType": {"type": "INT64"}, "type": "ARRAY"}, + "parameterValue": {"arrayValues": []}, + }, + { + "name": "child", + "parameterType": { + "structTypes": [ + {"name": "name", "type": {"type": "STRING"}}, + {"name": "bdate", "type": 
{"type": "DATE"}}, + ], + "type": "STRUCT", + }, + "parameterValue": { + "structValues": { + "bdate": {"value": "2021-01-01"}, + "name": {"value": "ch1"}, + } + }, + }, + ], + ), + ], +) +def test_to_query_parameters_complex_types(parameters, parameter_types, expect): + from google.cloud.bigquery.dbapi._helpers import to_query_parameters + + result = [p.to_api_repr() for p in to_query_parameters(parameters, parameter_types)] + assert result == expect + + +def test_to_query_parameters_struct_error(): + from google.cloud.bigquery.dbapi._helpers import to_query_parameters + + with pytest.raises( + NotImplementedError, + match=_expected_error_match( + "STRUCT-like parameter values are not supported, " + "unless an explicit type is give in the parameter placeholder " + "(e.g. '%(:struct<...>)s')." + ), + ): + to_query_parameters([dict(x=1)], [None]) + + with pytest.raises( + NotImplementedError, + match=_expected_error_match( + "STRUCT-like parameter values are not supported (parameter foo), " + "unless an explicit type is give in the parameter placeholder " + "(e.g. '%(foo:struct<...>)s')." 
+ ), + ): + to_query_parameters(dict(foo=dict(x=1)), {}) diff --git a/tests/unit/test_dbapi_connection.py b/tests/unit/test_dbapi_connection.py index 0f1be45ee..0576cad38 100644 --- a/tests/unit/test_dbapi_connection.py +++ b/tests/unit/test_dbapi_connection.py @@ -16,12 +16,11 @@ import unittest import mock -import six try: - from google.cloud import bigquery_storage_v1 + from google.cloud import bigquery_storage except ImportError: # pragma: NO COVER - bigquery_storage_v1 = None + bigquery_storage = None class TestConnection(unittest.TestCase): @@ -41,38 +40,41 @@ def _mock_client(self): return mock_client def _mock_bqstorage_client(self): - if bigquery_storage_v1 is None: - return None - mock_client = mock.create_autospec( - bigquery_storage_v1.client.BigQueryReadClient - ) - mock_client.transport = mock.Mock(spec=["channel"]) - mock_client.transport.channel = mock.Mock(spec=["close"]) + # Assumption: bigquery_storage exists. It's the test's responisbility to + # not use this helper or skip itself if bqstroage is not installed. 
+ mock_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + mock_client._transport = mock.Mock(spec=["channel"]) + mock_client._transport.grpc_channel = mock.Mock(spec=["close"]) return mock_client def test_ctor_wo_bqstorage_client(self): from google.cloud.bigquery.dbapi import Connection mock_client = self._mock_client() - mock_bqstorage_client = self._mock_bqstorage_client() - mock_client._create_bqstorage_client.return_value = mock_bqstorage_client + mock_client._ensure_bqstorage_client.return_value = None connection = self._make_one(client=mock_client) self.assertIsInstance(connection, Connection) self.assertIs(connection._client, mock_client) - self.assertIs(connection._bqstorage_client, mock_bqstorage_client) + self.assertIs(connection._bqstorage_client, None) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_ctor_w_bqstorage_client(self): from google.cloud.bigquery.dbapi import Connection mock_client = self._mock_client() mock_bqstorage_client = self._mock_bqstorage_client() + mock_client._ensure_bqstorage_client.return_value = mock_bqstorage_client + connection = self._make_one( client=mock_client, bqstorage_client=mock_bqstorage_client, ) + + mock_client._ensure_bqstorage_client.assert_called_once_with( + mock_bqstorage_client + ) self.assertIsInstance(connection, Connection) self.assertIs(connection._client, mock_client) self.assertIs(connection._bqstorage_client, mock_bqstorage_client) @@ -87,21 +89,26 @@ def test_connect_wo_client(self, mock_client): self.assertIsNotNone(connection._client) self.assertIsNotNone(connection._bqstorage_client) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_connect_w_client(self): from google.cloud.bigquery.dbapi import connect from google.cloud.bigquery.dbapi import Connection mock_client = self._mock_client() 
mock_bqstorage_client = self._mock_bqstorage_client() - mock_client._create_bqstorage_client.return_value = mock_bqstorage_client + mock_client._ensure_bqstorage_client.return_value = mock_bqstorage_client connection = connect(client=mock_client) + + mock_client._ensure_bqstorage_client.assert_called_once_with() self.assertIsInstance(connection, Connection) self.assertIs(connection._client, mock_client) self.assertIs(connection._bqstorage_client, mock_bqstorage_client) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_connect_w_both_clients(self): from google.cloud.bigquery.dbapi import connect @@ -109,9 +116,15 @@ def test_connect_w_both_clients(self): mock_client = self._mock_client() mock_bqstorage_client = self._mock_bqstorage_client() + mock_client._ensure_bqstorage_client.return_value = mock_bqstorage_client + connection = connect( client=mock_client, bqstorage_client=mock_bqstorage_client, ) + + mock_client._ensure_bqstorage_client.assert_called_once_with( + mock_bqstorage_client + ) self.assertIsInstance(connection, Connection) self.assertIs(connection._client, mock_client) self.assertIs(connection._bqstorage_client, mock_bqstorage_client) @@ -124,13 +137,13 @@ def test_raises_error_if_closed(self): connection.close() for method in ("close", "commit", "cursor"): - with six.assertRaisesRegex( - self, ProgrammingError, r"Operating on a closed connection\." + with self.assertRaisesRegex( + ProgrammingError, r"Operating on a closed connection\." 
): getattr(connection, method)() @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_close_closes_all_created_bigquery_clients(self): client = self._mock_client() @@ -141,7 +154,7 @@ def test_close_closes_all_created_bigquery_clients(self): return_value=client, ) bqstorage_client_patcher = mock.patch.object( - client, "_create_bqstorage_client", return_value=bqstorage_client, + client, "_ensure_bqstorage_client", return_value=bqstorage_client, ) with client_patcher, bqstorage_client_patcher: @@ -150,10 +163,10 @@ def test_close_closes_all_created_bigquery_clients(self): connection.close() self.assertTrue(client.close.called) - self.assertTrue(bqstorage_client.transport.channel.close.called) + self.assertTrue(bqstorage_client._transport.grpc_channel.close.called) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_close_does_not_close_bigquery_clients_passed_to_it(self): client = self._mock_client() @@ -163,7 +176,7 @@ def test_close_does_not_close_bigquery_clients_passed_to_it(self): connection.close() self.assertFalse(client.close.called) - self.assertFalse(bqstorage_client.transport.channel.called) + self.assertFalse(bqstorage_client._transport.grpc_channel.close.called) def test_close_closes_all_created_cursors(self): connection = self._make_one(client=self._mock_client()) @@ -177,6 +190,22 @@ def test_close_closes_all_created_cursors(self): self.assertTrue(cursor_1._closed) self.assertTrue(cursor_2._closed) + def test_close_closes_only_open_created_cursors(self): + connection = self._make_one(client=self._mock_client()) + cursor_1 = connection.cursor() + cursor_2 = connection.cursor() + self.assertFalse(cursor_1._closed) + self.assertFalse(cursor_2._closed) + + cursor_1.close() + self.assertTrue(cursor_1._closed) + 
cursor_1.close = mock.MagicMock() + + connection.close() + + self.assertFalse(cursor_1.close.called) + self.assertTrue(cursor_2._closed) + def test_does_not_keep_cursor_instances_alive(self): from google.cloud.bigquery.dbapi import Cursor diff --git a/tests/unit/test_dbapi_cursor.py b/tests/unit/test_dbapi_cursor.py index bd1d9dc0a..026810aaf 100644 --- a/tests/unit/test_dbapi_cursor.py +++ b/tests/unit/test_dbapi_cursor.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import mock import operator as op import unittest -import warnings -import mock -import six +import pytest + try: import pyarrow @@ -27,11 +27,9 @@ from google.api_core import exceptions try: - from google.cloud import bigquery_storage_v1 - from google.cloud import bigquery_storage_v1beta1 + from google.cloud import bigquery_storage except ImportError: # pragma: NO COVER - bigquery_storage_v1 = None - bigquery_storage_v1beta1 = None + bigquery_storage = None from tests.unit.helpers import _to_pyarrow @@ -69,41 +67,26 @@ def _mock_client( num_dml_affected_rows=num_dml_affected_rows, dry_run=dry_run_job, total_bytes_processed=total_bytes_processed, + rows=rows, ) - mock_client.list_rows.return_value = rows mock_client._default_query_job_config = default_query_job_config # Assure that the REST client gets used, not the BQ Storage client. 
- mock_client._create_bqstorage_client.return_value = None + mock_client._ensure_bqstorage_client.return_value = None return mock_client - def _mock_bqstorage_client(self, rows=None, stream_count=0, v1beta1=False): - from google.cloud.bigquery_storage_v1 import client - from google.cloud.bigquery_storage_v1 import types - from google.cloud.bigquery_storage_v1beta1 import types as types_v1beta1 - + def _mock_bqstorage_client(self, rows=None, stream_count=0): if rows is None: rows = [] - if v1beta1: - mock_client = mock.create_autospec( - bigquery_storage_v1beta1.BigQueryStorageClient - ) - mock_read_session = mock.MagicMock( - streams=[ - types_v1beta1.Stream(name="streams/stream_{}".format(i)) - for i in range(stream_count) - ] - ) - else: - mock_client = mock.create_autospec(client.BigQueryReadClient) - mock_read_session = mock.MagicMock( - streams=[ - types.ReadStream(name="streams/stream_{}".format(i)) - for i in range(stream_count) - ] - ) + mock_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + mock_read_session = mock.MagicMock( + streams=[ + bigquery_storage.types.ReadStream(name="streams/stream_{}".format(i)) + for i in range(stream_count) + ] + ) mock_client.create_read_session.return_value = mock_read_session @@ -120,9 +103,13 @@ def _mock_job( num_dml_affected_rows=None, dry_run=False, total_bytes_processed=0, + rows=None, ): from google.cloud.bigquery import job + if rows is None: + rows = [] + mock_job = mock.create_autospec(job.QueryJob) mock_job.error_result = None mock_job.state = "DONE" @@ -132,12 +119,13 @@ def _mock_job( mock_job.result.side_effect = exceptions.NotFound mock_job.total_bytes_processed = total_bytes_processed else: - mock_job.result.return_value = mock_job + mock_job.result.return_value = rows mock_job._query_results = self._mock_results( total_rows=total_rows, schema=schema, num_dml_affected_rows=num_dml_affected_rows, ) + mock_job.destination.project = "P" mock_job.destination.to_bqstorage.return_value = ( 
"projects/P/datasets/DS/tables/T" ) @@ -192,11 +180,12 @@ def test_raises_error_if_closed(self): "fetchone", "setinputsizes", "setoutputsize", + "__iter__", ) for method in method_names: - with six.assertRaisesRegex( - self, ProgrammingError, r"Operating on a closed cursor\." + with self.assertRaisesRegex( + ProgrammingError, r"Operating on a closed cursor\." ): getattr(cursor, method)() @@ -291,7 +280,7 @@ def test_fetchall_w_row(self): self.assertEqual(rows[0], (1,)) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_fetch_success(self): @@ -322,6 +311,7 @@ def test_fetchall_w_bqstorage_client_fetch_success(self): mock_bqstorage_client = self._mock_bqstorage_client( stream_count=1, rows=bqstorage_streamed_rows, ) + mock_client._ensure_bqstorage_client.return_value = mock_bqstorage_client connection = dbapi.connect( client=mock_client, bqstorage_client=mock_bqstorage_client, @@ -345,77 +335,14 @@ def test_fetchall_w_bqstorage_client_fetch_success(self): self.assertEqual(sorted_row_data, expected_row_data) @unittest.skipIf( - bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`" - ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_fetchall_w_bqstorage_client_v1beta1_fetch_success(self): - from google.cloud.bigquery import dbapi - from google.cloud.bigquery import table - - # use unordered data to also test any non-determenistic key order in dicts - row_data = [ - table.Row([1.4, 1.1, 1.3, 1.2], {"bar": 3, "baz": 2, "foo": 1, "quux": 0}), - table.Row([2.4, 2.1, 2.3, 2.2], {"bar": 3, "baz": 2, "foo": 1, "quux": 0}), - ] - bqstorage_streamed_rows = [ - { - "bar": _to_pyarrow(1.2), - "foo": _to_pyarrow(1.1), - "quux": _to_pyarrow(1.4), - "baz": _to_pyarrow(1.3), - }, - { - "bar": _to_pyarrow(2.2), - "foo": _to_pyarrow(2.1), 
- "quux": _to_pyarrow(2.4), - "baz": _to_pyarrow(2.3), - }, - ] - - mock_client = self._mock_client(rows=row_data) - mock_bqstorage_client = self._mock_bqstorage_client( - stream_count=1, rows=bqstorage_streamed_rows, v1beta1=True - ) - - connection = dbapi.connect( - client=mock_client, bqstorage_client=mock_bqstorage_client, - ) - cursor = connection.cursor() - cursor.execute("SELECT foo, bar FROM some_table") - - with warnings.catch_warnings(record=True) as warned: - rows = cursor.fetchall() - - # a deprecation warning should have been emitted - expected_warnings = [ - warning - for warning in warned - if issubclass(warning.category, DeprecationWarning) - and "v1beta1" in str(warning) - ] - self.assertEqual(len(expected_warnings), 1, "Deprecation warning not raised.") - - # the default client was not used - mock_client.list_rows.assert_not_called() - - # check the data returned - field_value = op.itemgetter(1) - sorted_row_data = [sorted(row.items(), key=field_value) for row in rows] - expected_row_data = [ - [("foo", 1.1), ("bar", 1.2), ("baz", 1.3), ("quux", 1.4)], - [("foo", 2.1), ("bar", 2.2), ("baz", 2.3), ("quux", 2.4)], - ] - - self.assertEqual(sorted_row_data, expected_row_data) - - @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_fetchall_w_bqstorage_client_fetch_no_rows(self): from google.cloud.bigquery import dbapi mock_client = self._mock_client(rows=[]) mock_bqstorage_client = self._mock_bqstorage_client(stream_count=0) + mock_client._ensure_bqstorage_client.return_value = mock_bqstorage_client connection = dbapi.connect( client=mock_client, bqstorage_client=mock_bqstorage_client, @@ -432,7 +359,7 @@ def test_fetchall_w_bqstorage_client_fetch_no_rows(self): self.assertEqual(rows, []) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires 
`google-cloud-bigquery-storage`" ) def test_fetchall_w_bqstorage_client_fetch_error_no_fallback(self): from google.cloud.bigquery import dbapi @@ -440,7 +367,11 @@ def test_fetchall_w_bqstorage_client_fetch_error_no_fallback(self): row_data = [table.Row([1.1, 1.2], {"foo": 0, "bar": 1})] + def fake_ensure_bqstorage_client(bqstorage_client=None, **kwargs): + return bqstorage_client + mock_client = self._mock_client(rows=row_data) + mock_client._ensure_bqstorage_client.side_effect = fake_ensure_bqstorage_client mock_bqstorage_client = self._mock_bqstorage_client( stream_count=1, rows=row_data, ) @@ -453,12 +384,62 @@ def test_fetchall_w_bqstorage_client_fetch_error_no_fallback(self): cursor = connection.cursor() cursor.execute("SELECT foo, bar FROM some_table") - with six.assertRaisesRegex(self, exceptions.Forbidden, "invalid credentials"): + with self.assertRaisesRegex(exceptions.Forbidden, "invalid credentials"): cursor.fetchall() # the default client was not used mock_client.list_rows.assert_not_called() + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_fetchall_w_bqstorage_client_no_arrow_compression(self): + from google.cloud.bigquery import dbapi + from google.cloud.bigquery import table + + # Use unordered data to also test any non-determenistic key order in dicts. 
+ row_data = [table.Row([1.2, 1.1], {"bar": 1, "foo": 0})] + bqstorage_streamed_rows = [{"bar": _to_pyarrow(1.2), "foo": _to_pyarrow(1.1)}] + + def fake_ensure_bqstorage_client(bqstorage_client=None, **kwargs): + return bqstorage_client + + mock_client = self._mock_client(rows=row_data) + mock_client._ensure_bqstorage_client.side_effect = fake_ensure_bqstorage_client + mock_bqstorage_client = self._mock_bqstorage_client( + stream_count=1, rows=bqstorage_streamed_rows, + ) + + connection = dbapi.connect( + client=mock_client, bqstorage_client=mock_bqstorage_client, + ) + cursor = connection.cursor() + cursor.execute("SELECT foo, bar FROM some_table") + + with mock.patch( + "google.cloud.bigquery.dbapi.cursor._ARROW_COMPRESSION_SUPPORT", new=False + ): + rows = cursor.fetchall() + + mock_client.list_rows.assert_not_called() # The default client was not used. + + # Check the BQ Storage session config. + expected_session = bigquery_storage.ReadSession( + table="projects/P/datasets/DS/tables/T", + data_format=bigquery_storage.DataFormat.ARROW, + ) + mock_bqstorage_client.create_read_session.assert_called_once_with( + parent="projects/P", read_session=expected_session, max_stream_count=1 + ) + + # Check the data returned. + field_value = op.itemgetter(1) + sorted_row_data = [sorted(row.items(), key=field_value) for row in rows] + expected_row_data = [[("foo", 1.1), ("bar", 1.2)]] + + self.assertEqual(sorted_row_data, expected_row_data) + def test_execute_custom_job_id(self): from google.cloud.bigquery.dbapi import connect @@ -641,18 +622,52 @@ def test_executemany_w_dml(self): (("test",), ("anothertest",)), ) self.assertIsNone(cursor.description) - self.assertEqual(cursor.rowcount, 12) + self.assertEqual(cursor.rowcount, 24) # 24 because 2 * 12 because cumulatve. 
+ + def test_executemany_empty(self): + from google.cloud.bigquery.dbapi import connect + + connection = connect(self._mock_client(rows=[], num_dml_affected_rows=12)) + cursor = connection.cursor() + cursor.executemany((), ()) + self.assertIsNone(cursor.description) + self.assertEqual(cursor.rowcount, -1) + + def test_is_iterable(self): + from google.cloud.bigquery import dbapi + + connection = dbapi.connect( + self._mock_client(rows=[("hello", "there", 7), ("good", "bye", -3)]) + ) + cursor = connection.cursor() + cursor.execute("SELECT foo, bar, baz FROM hello_world WHERE baz < 42;") + + rows_iter = iter(cursor) + + row = next(rows_iter) + self.assertEqual(row, ("hello", "there", 7)) + row = next(rows_iter) + self.assertEqual(row, ("good", "bye", -3)) + self.assertRaises(StopIteration, next, rows_iter) + + self.assertEqual( + list(cursor), + [], + "Iterating again over the same results should produce no rows.", + ) def test__format_operation_w_dict(self): from google.cloud.bigquery.dbapi import cursor - formatted_operation = cursor._format_operation( - "SELECT %(somevalue)s, %(a `weird` one)s;", + parameter_types = {} + formatted_operation, parameter_types = cursor._format_operation( + "SELECT %(somevalue)s, %(a `weird` one:STRING)s;", {"somevalue": "hi", "a `weird` one": "world"}, ) self.assertEqual( formatted_operation, "SELECT @`somevalue`, @`a \\`weird\\` one`;" ) + self.assertEqual(parameter_types, {"a `weird` one": "STRING"}) def test__format_operation_w_wrong_dict(self): from google.cloud.bigquery import dbapi @@ -665,10 +680,18 @@ def test__format_operation_w_wrong_dict(self): {"somevalue-not-here": "hi", "othervalue": "world"}, ) + def test__format_operation_w_redundant_dict_key(self): + from google.cloud.bigquery.dbapi import cursor + + formatted_operation, _ = cursor._format_operation( + "SELECT %(somevalue)s;", {"somevalue": "foo", "value-not-used": "bar"} + ) + self.assertEqual(formatted_operation, "SELECT @`somevalue`;") + def 
test__format_operation_w_sequence(self): from google.cloud.bigquery.dbapi import cursor - formatted_operation = cursor._format_operation( + formatted_operation, _ = cursor._format_operation( "SELECT %s, %s;", ("hello", "world") ) self.assertEqual(formatted_operation, "SELECT ?, ?;") @@ -683,3 +706,157 @@ def test__format_operation_w_too_short_sequence(self): "SELECT %s, %s;", ("hello",), ) + + def test__format_operation_w_too_long_sequence(self): + from google.cloud.bigquery import dbapi + from google.cloud.bigquery.dbapi import cursor + + self.assertRaises( + dbapi.ProgrammingError, + cursor._format_operation, + "SELECT %s, %s;", + ("hello", "world", "everyone"), + ) + + def test__format_operation_w_empty_dict(self): + from google.cloud.bigquery.dbapi import cursor + + formatted_operation, _ = cursor._format_operation("SELECT '%f'", {}) + self.assertEqual(formatted_operation, "SELECT '%f'") + + def test__format_operation_wo_params_single_percent(self): + from google.cloud.bigquery.dbapi import cursor + + formatted_operation, _ = cursor._format_operation("SELECT '%'", {}) + self.assertEqual(formatted_operation, "SELECT '%'") + + def test__format_operation_wo_params_double_percents(self): + from google.cloud.bigquery.dbapi import cursor + + formatted_operation, _ = cursor._format_operation("SELECT '%%'", {}) + self.assertEqual(formatted_operation, "SELECT '%'") + + def test__format_operation_unescaped_percent_w_dict_param(self): + from google.cloud.bigquery import dbapi + from google.cloud.bigquery.dbapi import cursor + + self.assertRaises( + dbapi.ProgrammingError, + cursor._format_operation, + "SELECT %(foo)s, '100 %';", + {"foo": "bar"}, + ) + + def test__format_operation_unescaped_percent_w_list_param(self): + from google.cloud.bigquery import dbapi + from google.cloud.bigquery.dbapi import cursor + + self.assertRaises( + dbapi.ProgrammingError, + cursor._format_operation, + "SELECT %s, %s, '100 %';", + ["foo", "bar"], + ) + + def 
test__format_operation_no_placeholders(self): + from google.cloud.bigquery import dbapi + from google.cloud.bigquery.dbapi import cursor + + self.assertRaises( + dbapi.ProgrammingError, + cursor._format_operation, + "SELECT 42", + ["foo", "bar"], + ) + + +@pytest.mark.parametrize( + "inp,expect", + [ + ("", ("", None)), + ("values(%(foo)s, %(bar)s)", ("values(%(foo)s, %(bar)s)", {})), + ( + "values('%%(oof:INT64)s', %(foo)s, %(bar)s)", + ("values('%%(oof:INT64)s', %(foo)s, %(bar)s)", {}), + ), + ( + "values(%(foo:INT64)s, %(bar)s)", + ("values(%(foo)s, %(bar)s)", dict(foo="INT64")), + ), + ( + "values('%%(oof:INT64)s, %(foo:INT64)s, %(foo)s)", + ("values('%%(oof:INT64)s, %(foo)s, %(foo)s)", dict(foo="INT64")), + ), + ( + "values(%(foo:INT64)s, %(foo:INT64)s)", + ("values(%(foo)s, %(foo)s)", dict(foo="INT64")), + ), + ( + "values(%(foo:INT64)s, %(bar:NUMERIC)s) 100 %", + ("values(%(foo)s, %(bar)s) 100 %", dict(foo="INT64", bar="NUMERIC")), + ), + (" %s %()s %(:int64)s ", (" %s %s %s ", [None, None, "int64"])), + (" %%s %s %()s %(:int64)s ", (" %%s %s %s %s ", [None, None, "int64"])), + ( + "values(%%%(foo:INT64)s, %(bar)s)", + ("values(%%%(foo)s, %(bar)s)", dict(foo="INT64")), + ), + ( + "values(%%%%(foo:INT64)s, %(bar)s)", + ("values(%%%%(foo:INT64)s, %(bar)s)", dict()), + ), + ( + "values(%%%%%(foo:INT64)s, %(bar)s)", + ("values(%%%%%(foo)s, %(bar)s)", dict(foo="INT64")), + ), + ( + "values(%%%%%(foo:struct)s, %(bar)s)", + ("values(%%%%%(foo)s, %(bar)s)", dict(foo="struct")), + ), + ( + "values(%%%%%(foo:struct)s, %(bar)s)", + ("values(%%%%%(foo)s, %(bar)s)", dict(foo="struct")), + ), + ( + "values(%(foo:struct)s, %(bar)s)", + ( + "values(%(foo)s, %(bar)s)", + dict(foo="struct"), + ), + ), + ( + "values(%(foo:struct)s, %(bar)s)", + ( + "values(%(foo)s, %(bar)s)", + dict(foo="struct"), + ), + ), + ( + "values(%(foo:string(10))s, %(bar)s)", + ("values(%(foo)s, %(bar)s)", dict(foo="string(10)")), + ), + ], +) +def test__extract_types(inp, expect): + from 
google.cloud.bigquery.dbapi.cursor import _extract_types as et + + assert et(inp) == expect + + +@pytest.mark.parametrize( + "match,inp", + [ + ( + "Conflicting types for foo: numeric and int64.", + " %(foo:numeric)s %(foo:int64)s ", + ), + (r"' %s %\(foo\)s ' mixes named and unamed parameters.", " %s %(foo)s "), + (r"' %\(foo\)s %s ' mixes named and unamed parameters.", " %(foo)s %s "), + ], +) +def test__extract_types_fail(match, inp): + from google.cloud.bigquery.dbapi.cursor import _extract_types as et + from google.cloud.bigquery.dbapi import exceptions + + with pytest.raises(exceptions.ProgrammingError, match=match): + et(inp) diff --git a/tests/unit/test_dbapi_types.py b/tests/unit/test_dbapi_types.py index e05660ffe..cf282c68b 100644 --- a/tests/unit/test_dbapi_types.py +++ b/tests/unit/test_dbapi_types.py @@ -15,6 +15,8 @@ import datetime import unittest +import pytest + import google.cloud._helpers from google.cloud.bigquery.dbapi import types @@ -26,10 +28,6 @@ def test_binary_type(self): self.assertEqual("STRUCT", types.BINARY) self.assertNotEqual("STRING", types.BINARY) - def test_binary_constructor(self): - self.assertEqual(types.Binary(u"hello"), b"hello") - self.assertEqual(types.Binary(u"\u1f60"), u"\u1f60".encode("utf-8")) - def test_timefromticks(self): somedatetime = datetime.datetime( 2017, 2, 18, 12, 47, 26, tzinfo=google.cloud._helpers.UTC @@ -40,3 +38,29 @@ def test_timefromticks(self): types.TimeFromTicks(ticks, google.cloud._helpers.UTC), datetime.time(12, 47, 26, tzinfo=google.cloud._helpers.UTC), ) + + +class CustomBinary: + def __bytes__(self): + return b"Google" + + +@pytest.mark.parametrize( + "raw,expected", + [ + (u"hello", b"hello"), + (u"\u1f60", u"\u1f60".encode("utf-8")), + (b"hello", b"hello"), + (bytearray(b"hello"), b"hello"), + (memoryview(b"hello"), b"hello"), + (CustomBinary(), b"Google"), + ], +) +def test_binary_constructor(raw, expected): + assert types.Binary(raw) == expected + + +@pytest.mark.parametrize("bad", (42, 
42.0, None)) +def test_invalid_binary_constructor(bad): + with pytest.raises(TypeError): + types.Binary(bad) diff --git a/tests/unit/test_delete_dataset.py b/tests/unit/test_delete_dataset.py new file mode 100644 index 000000000..b48beb147 --- /dev/null +++ b/tests/unit/test_delete_dataset.py @@ -0,0 +1,79 @@ +# Copyright 2021 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .helpers import make_connection, make_client, dataset_polymorphic +import google.api_core.exceptions +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +import pytest + + +@dataset_polymorphic +def test_delete_dataset(make_dataset, get_reference, client, PROJECT, DS_ID): + dataset = make_dataset(PROJECT, DS_ID) + PATH = "projects/%s/datasets/%s" % (PROJECT, DS_ID) + conn = client._connection = make_connection({}) + client.delete_dataset(dataset, timeout=7.5) + conn.api_request.assert_called_with( + method="DELETE", path="/%s" % PATH, query_params={}, timeout=7.5 + ) + + +@dataset_polymorphic +def test_delete_dataset_delete_contents( + make_dataset, get_reference, client, PROJECT, DS_ID +): + PATH = "projects/%s/datasets/%s" % (PROJECT, DS_ID) + conn = client._connection = make_connection({}) + dataset = make_dataset(PROJECT, DS_ID) + client.delete_dataset(dataset, delete_contents=True) + conn.api_request.assert_called_with( + method="DELETE", + path="/%s" % PATH, + query_params={"deleteContents": "true"}, + timeout=DEFAULT_TIMEOUT, + ) + + +def 
test_delete_dataset_wrong_type(client): + with pytest.raises(TypeError): + client.delete_dataset(42) + + +def test_delete_dataset_w_not_found_ok_false(PROJECT, DS_ID): + path = "/projects/{}/datasets/{}".format(PROJECT, DS_ID) + http = object() + client = make_client(_http=http) + conn = client._connection = make_connection( + google.api_core.exceptions.NotFound("dataset not found") + ) + + with pytest.raises(google.api_core.exceptions.NotFound): + client.delete_dataset(DS_ID) + + conn.api_request.assert_called_with( + method="DELETE", path=path, query_params={}, timeout=DEFAULT_TIMEOUT + ) + + +def test_delete_dataset_w_not_found_ok_true(PROJECT, DS_ID): + path = "/projects/{}/datasets/{}".format(PROJECT, DS_ID) + http = object() + client = make_client(_http=http) + conn = client._connection = make_connection( + google.api_core.exceptions.NotFound("dataset not found") + ) + client.delete_dataset(DS_ID, not_found_ok=True) + conn.api_request.assert_called_with( + method="DELETE", path=path, query_params={}, timeout=DEFAULT_TIMEOUT + ) diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py index 4b6ef5118..1f49dba5d 100644 --- a/tests/unit/test_external_config.py +++ b/tests/unit/test_external_config.py @@ -74,6 +74,7 @@ def test_to_api_repr_base(self): ec.autodetect = True ec.ignore_unknown_values = False ec.compression = "compression" + ec.connection_id = "path/to/connection" ec.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")] exp_schema = { @@ -82,7 +83,7 @@ def test_to_api_repr_base(self): "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, } ] } @@ -94,10 +95,17 @@ def test_to_api_repr_base(self): "autodetect": True, "ignoreUnknownValues": False, "compression": "compression", + "connectionId": "path/to/connection", "schema": exp_schema, } self.assertEqual(got_resource, exp_resource) + def test_connection_id(self): + ec = 
external_config.ExternalConfig("") + self.assertIsNone(ec.connection_id) + ec.connection_id = "path/to/connection" + self.assertEqual(ec.connection_id, "path/to/connection") + def test_schema_None(self): ec = external_config.ExternalConfig("") ec.schema = None @@ -424,6 +432,164 @@ def test_to_api_repr_bigtable(self): self.assertEqual(got_resource, exp_resource) + def test_parquet_options_getter(self): + from google.cloud.bigquery.format_options import ParquetOptions + + parquet_options = ParquetOptions.from_api_repr( + {"enumAsString": True, "enableListInference": False} + ) + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.PARQUET + ) + + self.assertIsNone(ec.parquet_options.enum_as_string) + self.assertIsNone(ec.parquet_options.enable_list_inference) + + ec._options = parquet_options + + self.assertTrue(ec.parquet_options.enum_as_string) + self.assertFalse(ec.parquet_options.enable_list_inference) + + self.assertIs(ec.parquet_options, ec.options) + + def test_parquet_options_getter_non_parquet_format(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + self.assertIsNone(ec.parquet_options) + + def test_parquet_options_setter(self): + from google.cloud.bigquery.format_options import ParquetOptions + + parquet_options = ParquetOptions.from_api_repr( + {"enumAsString": False, "enableListInference": True} + ) + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.PARQUET + ) + + ec.parquet_options = parquet_options + + # Setting Parquet options should be reflected in the generic options attribute. 
+ self.assertFalse(ec.options.enum_as_string) + self.assertTrue(ec.options.enable_list_inference) + + def test_parquet_options_setter_non_parquet_format(self): + from google.cloud.bigquery.format_options import ParquetOptions + + parquet_options = ParquetOptions.from_api_repr( + {"enumAsString": False, "enableListInference": True} + ) + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + + with self.assertRaisesRegex(TypeError, "Cannot set.*source format is CSV"): + ec.parquet_options = parquet_options + + def test_from_api_repr_parquet(self): + from google.cloud.bigquery.format_options import ParquetOptions + + resource = _copy_and_update( + self.BASE_RESOURCE, + { + "sourceFormat": "PARQUET", + "parquetOptions": {"enumAsString": True, "enableListInference": False}, + }, + ) + + ec = external_config.ExternalConfig.from_api_repr(resource) + + self._verify_base(ec) + self.assertEqual(ec.source_format, external_config.ExternalSourceFormat.PARQUET) + self.assertIsInstance(ec.options, ParquetOptions) + self.assertTrue(ec.parquet_options.enum_as_string) + self.assertFalse(ec.parquet_options.enable_list_inference) + + got_resource = ec.to_api_repr() + + self.assertEqual(got_resource, resource) + + del resource["parquetOptions"]["enableListInference"] + ec = external_config.ExternalConfig.from_api_repr(resource) + self.assertIsNone(ec.options.enable_list_inference) + got_resource = ec.to_api_repr() + self.assertEqual(got_resource, resource) + + def test_to_api_repr_parquet(self): + from google.cloud.bigquery.format_options import ParquetOptions + + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.PARQUET + ) + options = ParquetOptions.from_api_repr( + dict(enumAsString=False, enableListInference=True) + ) + ec._options = options + + exp_resource = { + "sourceFormat": external_config.ExternalSourceFormat.PARQUET, + "parquetOptions": {"enumAsString": False, "enableListInference": True}, + } + + got_resource = 
ec.to_api_repr() + + self.assertEqual(got_resource, exp_resource) + + def test_from_api_repr_decimal_target_types(self): + from google.cloud.bigquery.enums import DecimalTargetType + + resource = _copy_and_update( + self.BASE_RESOURCE, + { + "sourceFormat": "FORMAT_FOO", + "decimalTargetTypes": [DecimalTargetType.NUMERIC], + }, + ) + + ec = external_config.ExternalConfig.from_api_repr(resource) + + self._verify_base(ec) + self.assertEqual(ec.source_format, "FORMAT_FOO") + self.assertEqual( + ec.decimal_target_types, frozenset([DecimalTargetType.NUMERIC]) + ) + + # converting back to API representation should yield the same result + got_resource = ec.to_api_repr() + self.assertEqual(got_resource, resource) + + del resource["decimalTargetTypes"] + ec = external_config.ExternalConfig.from_api_repr(resource) + self.assertIsNone(ec.decimal_target_types) + + got_resource = ec.to_api_repr() + self.assertEqual(got_resource, resource) + + def test_to_api_repr_decimal_target_types(self): + from google.cloud.bigquery.enums import DecimalTargetType + + ec = external_config.ExternalConfig("FORMAT_FOO") + ec.decimal_target_types = [DecimalTargetType.NUMERIC, DecimalTargetType.STRING] + + got_resource = ec.to_api_repr() + + expected_resource = { + "sourceFormat": "FORMAT_FOO", + "decimalTargetTypes": [DecimalTargetType.NUMERIC, DecimalTargetType.STRING], + } + self.assertEqual(got_resource, expected_resource) + + def test_to_api_repr_decimal_target_types_unset(self): + from google.cloud.bigquery.enums import DecimalTargetType + + ec = external_config.ExternalConfig("FORMAT_FOO") + ec._properties["decimalTargetTypes"] = [DecimalTargetType.NUMERIC] + ec.decimal_target_types = None + + got_resource = ec.to_api_repr() + + expected_resource = {"sourceFormat": "FORMAT_FOO"} + self.assertEqual(got_resource, expected_resource) + + ec.decimal_target_types = None # No error if unsetting when already unset. 
+ def _copy_and_update(d, u): d = copy.deepcopy(d) diff --git a/tests/unit/test_format_options.py b/tests/unit/test_format_options.py new file mode 100644 index 000000000..ab5f9e05c --- /dev/null +++ b/tests/unit/test_format_options.py @@ -0,0 +1,41 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class TestParquetOptions: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.format_options import ParquetOptions + + return ParquetOptions + + def test_ctor(self): + config = self._get_target_class()() + assert config.enum_as_string is None + assert config.enable_list_inference is None + + def test_from_api_repr(self): + config = self._get_target_class().from_api_repr( + {"enumAsString": False, "enableListInference": True} + ) + assert not config.enum_as_string + assert config.enable_list_inference + + def test_to_api_repr(self): + config = self._get_target_class()() + config.enum_as_string = True + config.enable_list_inference = False + + result = config.to_api_repr() + assert result == {"enumAsString": True, "enableListInference": False} diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py deleted file mode 100644 index fb6a46bd6..000000000 --- a/tests/unit/test_job.py +++ /dev/null @@ -1,6324 +0,0 @@ -# Copyright 2015 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import concurrent -import copy -import json -import textwrap -import unittest -import warnings - -import freezegun -import mock -import pytest -import requests -from six.moves import http_client - -try: - import pandas -except (ImportError, AttributeError): # pragma: NO COVER - pandas = None - -try: - import pyarrow -except ImportError: # pragma: NO COVER - pyarrow = None -try: - from google.cloud import bigquery_storage_v1 -except (ImportError, AttributeError): # pragma: NO COVER - bigquery_storage_v1 = None -try: - from tqdm import tqdm -except (ImportError, AttributeError): # pragma: NO COVER - tqdm = None - - -def _make_credentials(): - import google.auth.credentials - - return mock.Mock(spec=google.auth.credentials.Credentials) - - -def _make_client(project="test-project", connection=None): - from google.cloud.bigquery.client import Client - - if connection is None: - connection = _make_connection() - - client = Client(project=project, credentials=_make_credentials(), _http=object()) - client._connection = connection - return client - - -def _make_connection(*responses): - import google.cloud.bigquery._http - from google.cloud.exceptions import NotFound - - mock_conn = mock.create_autospec(google.cloud.bigquery._http.Connection) - mock_conn.api_request.side_effect = list(responses) + [NotFound("miss")] - return mock_conn - - -def _make_job_resource( - creation_time_ms=1437767599006, - started_time_ms=1437767600007, - ended_time_ms=1437767601008, - started=False, - ended=False, - etag="abc-def-hjk", - endpoint="https://bigquery.googleapis.com", - 
job_type="load", - job_id="a-random-id", - project_id="some-project", - user_email="bq-user@example.com", -): - resource = { - "configuration": {job_type: {}}, - "statistics": {"creationTime": creation_time_ms, job_type: {}}, - "etag": etag, - "id": "{}:{}".format(project_id, job_id), - "jobReference": {"projectId": project_id, "jobId": job_id}, - "selfLink": "{}/bigquery/v2/projects/{}/jobs/{}".format( - endpoint, project_id, job_id - ), - "user_email": user_email, - } - - if started or ended: - resource["statistics"]["startTime"] = started_time_ms - - if ended: - resource["statistics"]["endTime"] = ended_time_ms - - if job_type == "query": - resource["configuration"]["query"]["destinationTable"] = { - "projectId": project_id, - "datasetId": "_temp_dataset", - "tableId": "_temp_table", - } - - return resource - - -class Test__error_result_to_exception(unittest.TestCase): - def _call_fut(self, *args, **kwargs): - from google.cloud.bigquery import job - - return job._error_result_to_exception(*args, **kwargs) - - def test_simple(self): - error_result = {"reason": "invalid", "message": "bad request"} - exception = self._call_fut(error_result) - self.assertEqual(exception.code, http_client.BAD_REQUEST) - self.assertTrue(exception.message.startswith("bad request")) - self.assertIn(error_result, exception.errors) - - def test_missing_reason(self): - error_result = {} - exception = self._call_fut(error_result) - self.assertEqual(exception.code, http_client.INTERNAL_SERVER_ERROR) - - -class Test_JobReference(unittest.TestCase): - JOB_ID = "job-id" - PROJECT = "test-project-123" - LOCATION = "us-central" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery import job - - return job._JobReference - - def _make_one(self, job_id, project, location): - return self._get_target_class()(job_id, project, location) - - def test_ctor(self): - job_ref = self._make_one(self.JOB_ID, self.PROJECT, self.LOCATION) - - self.assertEqual(job_ref.job_id, self.JOB_ID) - 
self.assertEqual(job_ref.project, self.PROJECT) - self.assertEqual(job_ref.location, self.LOCATION) - - def test__to_api_repr(self): - job_ref = self._make_one(self.JOB_ID, self.PROJECT, self.LOCATION) - - self.assertEqual( - job_ref._to_api_repr(), - { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": self.LOCATION, - }, - ) - - def test_from_api_repr(self): - api_repr = { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": self.LOCATION, - } - - job_ref = self._get_target_class()._from_api_repr(api_repr) - - self.assertEqual(job_ref.job_id, self.JOB_ID) - self.assertEqual(job_ref.project, self.PROJECT) - self.assertEqual(job_ref.location, self.LOCATION) - - -class Test_AsyncJob(unittest.TestCase): - JOB_ID = "job-id" - PROJECT = "test-project-123" - LOCATION = "us-central" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery import job - - return job._AsyncJob - - def _make_one(self, job_id, client): - return self._get_target_class()(job_id, client) - - def _make_derived_class(self): - class Derived(self._get_target_class()): - _JOB_TYPE = "derived" - - return Derived - - def _make_derived(self, job_id, client): - return self._make_derived_class()(job_id, client) - - @staticmethod - def _job_reference(job_id, project, location): - from google.cloud.bigquery import job - - return job._JobReference(job_id, project, location) - - def test_ctor_w_bare_job_id(self): - import threading - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - - self.assertEqual(job.job_id, self.JOB_ID) - self.assertEqual(job.project, self.PROJECT) - self.assertIsNone(job.location) - self.assertIs(job._client, client) - self.assertEqual( - job._properties, - {"jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}}, - ) - self.assertIsInstance(job._completion_lock, type(threading.Lock())) - self.assertEqual( - job.path, "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - ) - - def 
test_ctor_w_job_ref(self): - import threading - - other_project = "other-project-234" - client = _make_client(project=other_project) - job_ref = self._job_reference(self.JOB_ID, self.PROJECT, self.LOCATION) - job = self._make_one(job_ref, client) - - self.assertEqual(job.job_id, self.JOB_ID) - self.assertEqual(job.project, self.PROJECT) - self.assertEqual(job.location, self.LOCATION) - self.assertIs(job._client, client) - self.assertEqual( - job._properties, - { - "jobReference": { - "projectId": self.PROJECT, - "location": self.LOCATION, - "jobId": self.JOB_ID, - } - }, - ) - self.assertFalse(job._result_set) - self.assertIsInstance(job._completion_lock, type(threading.Lock())) - self.assertEqual( - job.path, "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - ) - - def test__require_client_w_none(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - - self.assertIs(job._require_client(None), client) - - def test__require_client_w_other(self): - client = _make_client(project=self.PROJECT) - other = object() - job = self._make_one(self.JOB_ID, client) - - self.assertIs(job._require_client(other), other) - - def test_job_type(self): - client = _make_client(project=self.PROJECT) - derived = self._make_derived(self.JOB_ID, client) - - self.assertEqual(derived.job_type, "derived") - - def test_parent_job_id(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - - self.assertIsNone(job.parent_job_id) - job._properties["statistics"] = {"parentJobId": "parent-job-123"} - self.assertEqual(job.parent_job_id, "parent-job-123") - - def test_script_statistics(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - - self.assertIsNone(job.script_statistics) - job._properties["statistics"] = { - "scriptStatistics": { - "evaluationKind": "EXPRESSION", - "stackFrames": [ - { - "startLine": 5, - "startColumn": 29, - "endLine": 9, - "endColumn": 
14, - "text": "QUERY TEXT", - } - ], - } - } - script_stats = job.script_statistics - self.assertEqual(script_stats.evaluation_kind, "EXPRESSION") - stack_frames = script_stats.stack_frames - self.assertEqual(len(stack_frames), 1) - stack_frame = stack_frames[0] - self.assertIsNone(stack_frame.procedure_id) - self.assertEqual(stack_frame.start_line, 5) - self.assertEqual(stack_frame.start_column, 29) - self.assertEqual(stack_frame.end_line, 9) - self.assertEqual(stack_frame.end_column, 14) - self.assertEqual(stack_frame.text, "QUERY TEXT") - - def test_num_child_jobs(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - - self.assertEqual(job.num_child_jobs, 0) - job._properties["statistics"] = {"numChildJobs": "17"} - self.assertEqual(job.num_child_jobs, 17) - - def test_labels_miss(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertEqual(job.labels, {}) - - def test_labels_update_in_place(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - labels = job.labels - labels["foo"] = "bar" # update in place - self.assertEqual(job.labels, {"foo": "bar"}) - - def test_labels_hit(self): - labels = {"foo": "bar"} - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["labels"] = labels - self.assertEqual(job.labels, labels) - - def test_etag(self): - etag = "ETAG-123" - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.etag) - job._properties["etag"] = etag - self.assertEqual(job.etag, etag) - - def test_self_link(self): - self_link = "https://api.example.com/123" - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.self_link) - job._properties["selfLink"] = self_link - self.assertEqual(job.self_link, self_link) - - def test_user_email(self): - 
user_email = "user@example.com" - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.user_email) - job._properties["user_email"] = user_email - self.assertEqual(job.user_email, user_email) - - @staticmethod - def _datetime_and_millis(): - import datetime - import pytz - from google.cloud._helpers import _millis - - now = datetime.datetime.utcnow().replace( - microsecond=123000, tzinfo=pytz.UTC # stats timestamps have ms precision - ) - return now, _millis(now) - - def test_created(self): - now, millis = self._datetime_and_millis() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.created) - stats = job._properties["statistics"] = {} - self.assertIsNone(job.created) - stats["creationTime"] = millis - self.assertEqual(job.created, now) - - def test_started(self): - now, millis = self._datetime_and_millis() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.started) - stats = job._properties["statistics"] = {} - self.assertIsNone(job.started) - stats["startTime"] = millis - self.assertEqual(job.started, now) - - def test_ended(self): - now, millis = self._datetime_and_millis() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.ended) - stats = job._properties["statistics"] = {} - self.assertIsNone(job.ended) - stats["endTime"] = millis - self.assertEqual(job.ended, now) - - def test__job_statistics(self): - statistics = {"foo": "bar"} - client = _make_client(project=self.PROJECT) - derived = self._make_derived(self.JOB_ID, client) - self.assertEqual(derived._job_statistics(), {}) - stats = derived._properties["statistics"] = {} - self.assertEqual(derived._job_statistics(), {}) - stats["derived"] = statistics - self.assertEqual(derived._job_statistics(), statistics) - - def test_error_result(self): - error_result = { - 
"debugInfo": "DEBUG INFO", - "location": "LOCATION", - "message": "MESSAGE", - "reason": "REASON", - } - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.error_result) - status = job._properties["status"] = {} - self.assertIsNone(job.error_result) - status["errorResult"] = error_result - self.assertEqual(job.error_result, error_result) - - def test_errors(self): - errors = [ - { - "debugInfo": "DEBUG INFO", - "location": "LOCATION", - "message": "MESSAGE", - "reason": "REASON", - } - ] - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.errors) - status = job._properties["status"] = {} - self.assertIsNone(job.errors) - status["errors"] = errors - self.assertEqual(job.errors, errors) - - def test_state(self): - state = "STATE" - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - self.assertIsNone(job.state) - status = job._properties["status"] = {} - self.assertIsNone(job.state) - status["state"] = state - self.assertEqual(job.state, state) - - def test__scrub_local_properties(self): - before = {"foo": "bar"} - resource = before.copy() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._scrub_local_properties(resource) # no raise - self.assertEqual(resource, before) - - def test__copy_configuration_properties(self): - before = {"foo": "bar"} - resource = before.copy() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - with self.assertRaises(NotImplementedError): - job._copy_configuration_properties(resource) - self.assertEqual(resource, before) - - def _set_properties_job(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._scrub_local_properties = mock.Mock() - job._copy_configuration_properties = mock.Mock() - job._set_future_result = mock.Mock() - job._properties = { 
- "jobReference": job._properties["jobReference"], - "foo": "bar", - } - return job - - def test__set_properties_no_stats(self): - config = {"test": True} - resource = {"configuration": config} - job = self._set_properties_job() - - job._set_properties(resource) - - self.assertEqual(job._properties, resource) - - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - - def test__set_properties_w_creation_time(self): - now, millis = self._datetime_and_millis() - config = {"test": True} - stats = {"creationTime": str(millis)} - resource = {"configuration": config, "statistics": stats} - job = self._set_properties_job() - - job._set_properties(resource) - - cleaned = copy.deepcopy(resource) - cleaned["statistics"]["creationTime"] = float(millis) - self.assertEqual(job._properties, cleaned) - - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - - def test__set_properties_w_start_time(self): - now, millis = self._datetime_and_millis() - config = {"test": True} - stats = {"startTime": str(millis)} - resource = {"configuration": config, "statistics": stats} - job = self._set_properties_job() - - job._set_properties(resource) - - cleaned = copy.deepcopy(resource) - cleaned["statistics"]["startTime"] = float(millis) - self.assertEqual(job._properties, cleaned) - - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - - def test__set_properties_w_end_time(self): - now, millis = self._datetime_and_millis() - config = {"test": True} - stats = {"endTime": str(millis)} - resource = {"configuration": config, "statistics": stats} - job = self._set_properties_job() - - job._set_properties(resource) - - cleaned = copy.deepcopy(resource) - cleaned["statistics"]["endTime"] = float(millis) - self.assertEqual(job._properties, cleaned) - - 
job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - - def test__get_resource_config_missing_job_ref(self): - resource = {} - klass = self._make_derived_class() - - with self.assertRaises(KeyError): - klass._get_resource_config(resource) - - def test__get_resource_config_missing_job_id(self): - resource = {"jobReference": {}} - klass = self._make_derived_class() - - with self.assertRaises(KeyError): - klass._get_resource_config(resource) - - def test__get_resource_config_missing_configuration(self): - resource = {"jobReference": {"jobId": self.JOB_ID}} - klass = self._make_derived_class() - - with self.assertRaises(KeyError): - klass._get_resource_config(resource) - - def test__get_resource_config_missing_config_type(self): - resource = {"jobReference": {"jobId": self.JOB_ID}, "configuration": {}} - klass = self._make_derived_class() - - with self.assertRaises(KeyError): - klass._get_resource_config(resource) - - def test__get_resource_config_ok(self): - derived_config = {"foo": "bar"} - resource = { - "jobReference": {"jobId": self.JOB_ID}, - "configuration": {"derived": derived_config}, - } - klass = self._make_derived_class() - - job_id, config = klass._get_resource_config(resource) - - self.assertEqual(job_id, self.JOB_ID) - self.assertEqual(config, {"derived": derived_config}) - - def test__build_resource(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - with self.assertRaises(NotImplementedError): - job._build_resource() - - def test_to_api_repr(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - with self.assertRaises(NotImplementedError): - job.to_api_repr() - - def test__begin_already(self): - job = self._set_properties_job() - job._properties["status"] = {"state": "WHATEVER"} - - with self.assertRaises(ValueError): - job._begin() - - def test__begin_defaults(self): - from 
google.cloud.bigquery.retry import DEFAULT_RETRY - - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - job = self._set_properties_job() - builder = job.to_api_repr = mock.Mock() - builder.return_value = resource - call_api = job._client._call_api = mock.Mock() - call_api.return_value = resource - path = "/projects/{}/jobs".format(self.PROJECT) - job._begin() - - call_api.assert_called_once_with( - DEFAULT_RETRY, - span_name="BigQuery.job.begin", - span_attributes={"path": path}, - job_ref=job, - method="POST", - path=path, - data=resource, - timeout=None, - ) - self.assertEqual(job._properties, resource) - - def test__begin_explicit(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - other_project = "other-project-234" - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - job = self._set_properties_job() - builder = job.to_api_repr = mock.Mock() - builder.return_value = resource - client = _make_client(project=other_project) - call_api = client._call_api = mock.Mock() - call_api.return_value = resource - retry = DEFAULT_RETRY.with_deadline(1) - path = "/projects/{}/jobs".format(self.PROJECT) - job._begin(client=client, retry=retry, timeout=7.5) - - call_api.assert_called_once_with( - retry, - span_name="BigQuery.job.begin", - span_attributes={"path": path}, - job_ref=job, - method="POST", - path=path, - data=resource, - timeout=7.5, - ) - self.assertEqual(job._properties, resource) - - def test_exists_defaults_miss(self): - from google.cloud.exceptions import NotFound - from google.cloud.bigquery.retry import DEFAULT_RETRY - - job = self._set_properties_job() - job._properties["jobReference"]["location"] = self.LOCATION - call_api = job._client._call_api = mock.Mock() - call_api.side_effect = NotFound("testing") - self.assertFalse(job.exists()) - - 
call_api.assert_called_once_with( - DEFAULT_RETRY, - span_name="BigQuery.job.exists", - span_attributes={ - "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - }, - job_ref=job, - method="GET", - path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), - query_params={"fields": "id", "location": self.LOCATION}, - timeout=None, - ) - - def test_exists_explicit_hit(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - other_project = "other-project-234" - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - job = self._set_properties_job() - client = _make_client(project=other_project) - call_api = client._call_api = mock.Mock() - call_api.return_value = resource - retry = DEFAULT_RETRY.with_deadline(1) - self.assertTrue(job.exists(client=client, retry=retry)) - - call_api.assert_called_once_with( - retry, - span_name="BigQuery.job.exists", - span_attributes={ - "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - }, - job_ref=job, - method="GET", - path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), - query_params={"fields": "id"}, - timeout=None, - ) - - def test_exists_w_timeout(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - PATH = "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - job = self._set_properties_job() - call_api = job._client._call_api = mock.Mock() - job.exists(timeout=7.5) - - call_api.assert_called_once_with( - DEFAULT_RETRY, - span_name="BigQuery.job.exists", - span_attributes={"path": PATH}, - job_ref=job, - method="GET", - path=PATH, - query_params={"fields": "id"}, - timeout=7.5, - ) - - def test_reload_defaults(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - job = self._set_properties_job() - 
job._properties["jobReference"]["location"] = self.LOCATION - call_api = job._client._call_api = mock.Mock() - call_api.return_value = resource - job.reload() - - call_api.assert_called_once_with( - DEFAULT_RETRY, - span_name="BigQuery.job.reload", - span_attributes={ - "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - }, - job_ref=job, - method="GET", - path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), - query_params={"location": self.LOCATION}, - timeout=None, - ) - self.assertEqual(job._properties, resource) - - def test_reload_explicit(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - other_project = "other-project-234" - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - job = self._set_properties_job() - client = _make_client(project=other_project) - call_api = client._call_api = mock.Mock() - call_api.return_value = resource - retry = DEFAULT_RETRY.with_deadline(1) - job.reload(client=client, retry=retry, timeout=4.2) - - call_api.assert_called_once_with( - retry, - span_name="BigQuery.job.reload", - span_attributes={ - "path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID) - }, - job_ref=job, - method="GET", - path="/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID), - query_params={}, - timeout=4.2, - ) - self.assertEqual(job._properties, resource) - - def test_cancel_defaults(self): - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - response = {"job": resource} - job = self._set_properties_job() - job._properties["jobReference"]["location"] = self.LOCATION - connection = job._client._connection = _make_connection(response) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertTrue(job.cancel()) - - 
final_attributes.assert_called() - - connection.api_request.assert_called_once_with( - method="POST", - path="/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID), - query_params={"location": self.LOCATION}, - timeout=None, - ) - self.assertEqual(job._properties, resource) - - def test_cancel_explicit(self): - other_project = "other-project-234" - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - response = {"job": resource} - job = self._set_properties_job() - client = _make_client(project=other_project) - connection = client._connection = _make_connection(response) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertTrue(job.cancel(client=client, timeout=7.5)) - - final_attributes.assert_called_with( - {"path": "/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID)}, - client, - job, - ) - - connection.api_request.assert_called_once_with( - method="POST", - path="/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID), - query_params={}, - timeout=7.5, - ) - self.assertEqual(job._properties, resource) - - def test_cancel_w_custom_retry(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - api_path = "/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID) - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - response = {"job": resource} - job = self._set_properties_job() - - api_request_patcher = mock.patch.object( - job._client._connection, "api_request", side_effect=[ValueError, response], - ) - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, ValueError) - ) - - with api_request_patcher as fake_api_request: - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) 
as final_attributes: - result = job.cancel(retry=retry, timeout=7.5) - - final_attributes.assert_called() - - self.assertTrue(result) - self.assertEqual(job._properties, resource) - self.assertEqual( - fake_api_request.call_args_list, - [ - mock.call(method="POST", path=api_path, query_params={}, timeout=7.5), - mock.call( - method="POST", path=api_path, query_params={}, timeout=7.5, - ), # was retried once - ], - ) - - def test__set_future_result_wo_done(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - set_exception = job.set_exception = mock.Mock() - set_result = job.set_result = mock.Mock() - - job._set_future_result() - - set_exception.assert_not_called() - set_result.assert_not_called() - - def test__set_future_result_w_result_set(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["status"] = {"state": "DONE"} - job._result_set = True - set_exception = job.set_exception = mock.Mock() - set_result = job.set_result = mock.Mock() - - job._set_future_result() - - set_exception.assert_not_called() - set_result.assert_not_called() - - def test__set_future_result_w_done_wo_result_set_w_error(self): - from google.cloud.exceptions import NotFound - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["status"] = { - "state": "DONE", - "errorResult": {"reason": "notFound", "message": "testing"}, - } - set_exception = job.set_exception = mock.Mock() - set_result = job.set_result = mock.Mock() - - job._set_future_result() - - set_exception.assert_called_once() - args, kw = set_exception.call_args - (exception,) = args - self.assertIsInstance(exception, NotFound) - self.assertEqual(exception.message, "testing") - self.assertEqual(kw, {}) - set_result.assert_not_called() - - def test__set_future_result_w_done_wo_result_set_wo_error(self): - client = _make_client(project=self.PROJECT) - job = 
self._make_one(self.JOB_ID, client) - job._properties["status"] = {"state": "DONE"} - set_exception = job.set_exception = mock.Mock() - set_result = job.set_result = mock.Mock() - - job._set_future_result() - - set_exception.assert_not_called() - set_result.assert_called_once_with(job) - - def test_done_defaults_wo_state(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - reload_ = job.reload = mock.Mock() - - self.assertFalse(job.done()) - - reload_.assert_called_once_with(retry=DEFAULT_RETRY, timeout=None) - - def test_done_explicit_wo_state(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - reload_ = job.reload = mock.Mock() - retry = DEFAULT_RETRY.with_deadline(1) - - self.assertFalse(job.done(retry=retry, timeout=7.5)) - - reload_.assert_called_once_with(retry=retry, timeout=7.5) - - def test_done_already(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["status"] = {"state": "DONE"} - - self.assertTrue(job.done()) - - @mock.patch("google.api_core.future.polling.PollingFuture.result") - def test_result_default_wo_state(self, result): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - begin = job._begin = mock.Mock() - - self.assertIs(job.result(), result.return_value) - - begin.assert_called_once_with(retry=DEFAULT_RETRY, timeout=None) - result.assert_called_once_with(timeout=None) - - @mock.patch("google.api_core.future.polling.PollingFuture.result") - def test_result_w_retry_wo_state(self, result): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - begin = job._begin = mock.Mock() - retry = mock.Mock() - - self.assertIs(job.result(retry=retry), 
result.return_value) - - begin.assert_called_once_with(retry=retry, timeout=None) - result.assert_called_once_with(timeout=None) - - @mock.patch("google.api_core.future.polling.PollingFuture.result") - def test_result_explicit_w_state(self, result): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["status"] = {"state": "DONE"} - begin = job._begin = mock.Mock() - timeout = 1 - - self.assertIs(job.result(timeout=timeout), result.return_value) - - begin.assert_not_called() - result.assert_called_once_with(timeout=timeout) - - def test_cancelled_wo_error_result(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - - self.assertFalse(job.cancelled()) - - def test_cancelled_w_error_result_not_stopped(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["status"] = {"errorResult": {"reason": "other"}} - - self.assertFalse(job.cancelled()) - - def test_cancelled_w_error_result_w_stopped(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._properties["status"] = {"errorResult": {"reason": "stopped"}} - - self.assertTrue(job.cancelled()) - - -class Test_JobConfig(unittest.TestCase): - JOB_TYPE = "testing" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery import job - - return job._JobConfig - - def _make_one(self, job_type=JOB_TYPE): - return self._get_target_class()(job_type) - - def test_ctor(self): - job_config = self._make_one() - self.assertEqual(job_config._job_type, self.JOB_TYPE) - self.assertEqual(job_config._properties, {self.JOB_TYPE: {}}) - - def test_fill_from_default(self): - from google.cloud.bigquery import QueryJobConfig - - job_config = QueryJobConfig() - job_config.dry_run = True - job_config.maximum_bytes_billed = 1000 - - default_job_config = QueryJobConfig() - default_job_config.use_query_cache = True - 
default_job_config.maximum_bytes_billed = 2000 - - final_job_config = job_config._fill_from_default(default_job_config) - self.assertTrue(final_job_config.dry_run) - self.assertTrue(final_job_config.use_query_cache) - self.assertEqual(final_job_config.maximum_bytes_billed, 1000) - - def test_fill_from_default_conflict(self): - from google.cloud.bigquery import QueryJobConfig - - basic_job_config = QueryJobConfig() - conflicting_job_config = self._make_one("conflicting_job_type") - self.assertNotEqual( - basic_job_config._job_type, conflicting_job_config._job_type - ) - - with self.assertRaises(TypeError): - basic_job_config._fill_from_default(conflicting_job_config) - - @mock.patch("google.cloud.bigquery._helpers._get_sub_prop") - def test__get_sub_prop_wo_default(self, _get_sub_prop): - job_config = self._make_one() - key = "key" - self.assertIs(job_config._get_sub_prop(key), _get_sub_prop.return_value) - _get_sub_prop.assert_called_once_with( - job_config._properties, [self.JOB_TYPE, key], default=None - ) - - @mock.patch("google.cloud.bigquery._helpers._get_sub_prop") - def test__get_sub_prop_w_default(self, _get_sub_prop): - job_config = self._make_one() - key = "key" - default = "default" - self.assertIs( - job_config._get_sub_prop(key, default=default), _get_sub_prop.return_value - ) - _get_sub_prop.assert_called_once_with( - job_config._properties, [self.JOB_TYPE, key], default=default - ) - - @mock.patch("google.cloud.bigquery._helpers._set_sub_prop") - def test__set_sub_prop(self, _set_sub_prop): - job_config = self._make_one() - key = "key" - value = "value" - job_config._set_sub_prop(key, value) - _set_sub_prop.assert_called_once_with( - job_config._properties, [self.JOB_TYPE, key], value - ) - - def test_to_api_repr(self): - job_config = self._make_one() - expected = job_config._properties = {self.JOB_TYPE: {"foo": "bar"}} - found = job_config.to_api_repr() - self.assertEqual(found, expected) - self.assertIsNot(found, expected) # copied - - # 
'from_api_repr' cannot be tested on '_JobConfig', because it presumes - # the ctor can be called w/o arguments - - def test_labels_miss(self): - job_config = self._make_one() - self.assertEqual(job_config.labels, {}) - - def test_labels_update_in_place(self): - job_config = self._make_one() - labels = job_config.labels - labels["foo"] = "bar" # update in place - self.assertEqual(job_config.labels, {"foo": "bar"}) - - def test_labels_hit(self): - labels = {"foo": "bar"} - job_config = self._make_one() - job_config._properties["labels"] = labels - self.assertEqual(job_config.labels, labels) - - def test_labels_setter_invalid(self): - labels = object() - job_config = self._make_one() - with self.assertRaises(ValueError): - job_config.labels = labels - - def test_labels_setter(self): - labels = {"foo": "bar"} - job_config = self._make_one() - job_config.labels = labels - self.assertEqual(job_config._properties["labels"], labels) - - -class _Base(object): - from google.cloud.bigquery.dataset import DatasetReference - from google.cloud.bigquery.table import TableReference - - ENDPOINT = "https://bigquery.googleapis.com" - PROJECT = "project" - SOURCE1 = "http://example.com/source1.csv" - DS_ID = "dataset_id" - DS_REF = DatasetReference(PROJECT, DS_ID) - TABLE_ID = "table_id" - TABLE_REF = TableReference(DS_REF, TABLE_ID) - JOB_ID = "JOB_ID" - KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1" - - def _make_one(self, *args, **kw): - return self._get_target_class()(*args, **kw) - - def _setUpConstants(self): - import datetime - from google.cloud._helpers import UTC - - self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) - self.ETAG = "ETAG" - self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID) - self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format( - self.ENDPOINT, self.PROJECT, self.JOB_ID - ) - self.USER_EMAIL = "phred@example.com" - - def _table_ref(self, table_id): - from 
google.cloud.bigquery.table import TableReference - - return TableReference(self.DS_REF, table_id) - - def _make_resource(self, started=False, ended=False): - self._setUpConstants() - return _make_job_resource( - creation_time_ms=int(self.WHEN_TS * 1000), - started_time_ms=int(self.WHEN_TS * 1000), - ended_time_ms=int(self.WHEN_TS * 1000) + 1000000, - started=started, - ended=ended, - etag=self.ETAG, - endpoint=self.ENDPOINT, - job_type=self.JOB_TYPE, - job_id=self.JOB_ID, - project_id=self.PROJECT, - user_email=self.USER_EMAIL, - ) - - def _verifyInitialReadonlyProperties(self, job): - # root elements of resource - self.assertIsNone(job.etag) - self.assertIsNone(job.self_link) - self.assertIsNone(job.user_email) - - # derived from resource['statistics'] - self.assertIsNone(job.created) - self.assertIsNone(job.started) - self.assertIsNone(job.ended) - - # derived from resource['status'] - self.assertIsNone(job.error_result) - self.assertIsNone(job.errors) - self.assertIsNone(job.state) - - def _verifyReadonlyResourceProperties(self, job, resource): - from datetime import timedelta - - statistics = resource.get("statistics", {}) - - if "creationTime" in statistics: - self.assertEqual(job.created, self.WHEN) - else: - self.assertIsNone(job.created) - - if "startTime" in statistics: - self.assertEqual(job.started, self.WHEN) - else: - self.assertIsNone(job.started) - - if "endTime" in statistics: - self.assertEqual(job.ended, self.WHEN + timedelta(seconds=1000)) - else: - self.assertIsNone(job.ended) - - if "etag" in resource: - self.assertEqual(job.etag, self.ETAG) - else: - self.assertIsNone(job.etag) - - if "selfLink" in resource: - self.assertEqual(job.self_link, self.RESOURCE_URL) - else: - self.assertIsNone(job.self_link) - - if "user_email" in resource: - self.assertEqual(job.user_email, self.USER_EMAIL) - else: - self.assertIsNone(job.user_email) - - -class TestLoadJobConfig(unittest.TestCase, _Base): - JOB_TYPE = "load" - - @staticmethod - def 
_get_target_class(): - from google.cloud.bigquery.job import LoadJobConfig - - return LoadJobConfig - - def test_ctor_w_properties(self): - config = self._get_target_class()( - allow_jagged_rows=True, allow_quoted_newlines=True - ) - - self.assertTrue(config.allow_jagged_rows) - self.assertTrue(config.allow_quoted_newlines) - - def test_allow_jagged_rows_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.allow_jagged_rows) - - def test_allow_jagged_rows_hit(self): - config = self._get_target_class()() - config._properties["load"]["allowJaggedRows"] = True - self.assertTrue(config.allow_jagged_rows) - - def test_allow_jagged_rows_setter(self): - config = self._get_target_class()() - config.allow_jagged_rows = True - self.assertTrue(config._properties["load"]["allowJaggedRows"]) - - def test_allow_quoted_newlines_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.allow_quoted_newlines) - - def test_allow_quoted_newlines_hit(self): - config = self._get_target_class()() - config._properties["load"]["allowQuotedNewlines"] = True - self.assertTrue(config.allow_quoted_newlines) - - def test_allow_quoted_newlines_setter(self): - config = self._get_target_class()() - config.allow_quoted_newlines = True - self.assertTrue(config._properties["load"]["allowQuotedNewlines"]) - - def test_autodetect_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.autodetect) - - def test_autodetect_hit(self): - config = self._get_target_class()() - config._properties["load"]["autodetect"] = True - self.assertTrue(config.autodetect) - - def test_autodetect_setter(self): - config = self._get_target_class()() - config.autodetect = True - self.assertTrue(config._properties["load"]["autodetect"]) - - def test_clustering_fields_miss(self): - config = self._get_target_class()() - self.assertIsNone(config.clustering_fields) - - def test_clustering_fields_hit(self): - config = self._get_target_class()() - fields = 
["email", "postal_code"] - config._properties["load"]["clustering"] = {"fields": fields} - self.assertEqual(config.clustering_fields, fields) - - def test_clustering_fields_setter(self): - fields = ["email", "postal_code"] - config = self._get_target_class()() - config.clustering_fields = fields - self.assertEqual(config._properties["load"]["clustering"], {"fields": fields}) - - def test_clustering_fields_setter_w_none(self): - config = self._get_target_class()() - fields = ["email", "postal_code"] - config._properties["load"]["clustering"] = {"fields": fields} - config.clustering_fields = None - self.assertIsNone(config.clustering_fields) - self.assertNotIn("clustering", config._properties["load"]) - - def test_create_disposition_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.create_disposition) - - def test_create_disposition_hit(self): - from google.cloud.bigquery.job import CreateDisposition - - disposition = CreateDisposition.CREATE_IF_NEEDED - config = self._get_target_class()() - config._properties["load"]["createDisposition"] = disposition - self.assertEqual(config.create_disposition, disposition) - - def test_create_disposition_setter(self): - from google.cloud.bigquery.job import CreateDisposition - - disposition = CreateDisposition.CREATE_IF_NEEDED - config = self._get_target_class()() - config.create_disposition = disposition - self.assertEqual(config._properties["load"]["createDisposition"], disposition) - - def test_destination_encryption_configuration_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.destination_encryption_configuration) - - def test_destination_encryption_configuration_hit(self): - from google.cloud.bigquery.encryption_configuration import ( - EncryptionConfiguration, - ) - - kms_key_name = "kms-key-name" - encryption_configuration = EncryptionConfiguration(kms_key_name) - config = self._get_target_class()() - 
config._properties["load"]["destinationEncryptionConfiguration"] = { - "kmsKeyName": kms_key_name - } - self.assertEqual( - config.destination_encryption_configuration, encryption_configuration - ) - - def test_destination_encryption_configuration_setter(self): - from google.cloud.bigquery.encryption_configuration import ( - EncryptionConfiguration, - ) - - kms_key_name = "kms-key-name" - encryption_configuration = EncryptionConfiguration(kms_key_name) - config = self._get_target_class()() - config.destination_encryption_configuration = encryption_configuration - expected = {"kmsKeyName": kms_key_name} - self.assertEqual( - config._properties["load"]["destinationEncryptionConfiguration"], expected - ) - - def test_destination_encryption_configuration_setter_w_none(self): - kms_key_name = "kms-key-name" - config = self._get_target_class()() - config._properties["load"]["destinationEncryptionConfiguration"] = { - "kmsKeyName": kms_key_name - } - config.destination_encryption_configuration = None - self.assertIsNone(config.destination_encryption_configuration) - self.assertNotIn( - "destinationEncryptionConfiguration", config._properties["load"] - ) - - def test_destination_table_description_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.destination_table_description) - - def test_destination_table_description_hit(self): - description = "Description" - config = self._get_target_class()() - config._properties["load"]["destinationTableProperties"] = { - "description": description - } - self.assertEqual(config.destination_table_description, description) - - def test_destination_table_description_setter(self): - description = "Description" - config = self._get_target_class()() - config.destination_table_description = description - expected = {"description": description} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - - def test_destination_table_description_setter_w_fn_already(self): - 
description = "Description" - friendly_name = "Friendly Name" - config = self._get_target_class()() - config._properties["load"]["destinationTableProperties"] = { - "friendlyName": friendly_name - } - config.destination_table_description = description - expected = {"friendlyName": friendly_name, "description": description} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - - def test_destination_table_description_w_none(self): - description = "Description" - friendly_name = "Friendly Name" - config = self._get_target_class()() - config._properties["load"]["destinationTableProperties"] = { - "description": description, - "friendlyName": friendly_name, - } - config.destination_table_description = None - expected = {"friendlyName": friendly_name} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - - def test_destination_table_friendly_name_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.destination_table_friendly_name) - - def test_destination_table_friendly_name_hit(self): - friendly_name = "Friendly Name" - config = self._get_target_class()() - config._properties["load"]["destinationTableProperties"] = { - "friendlyName": friendly_name - } - self.assertEqual(config.destination_table_friendly_name, friendly_name) - - def test_destination_table_friendly_name_setter(self): - friendly_name = "Friendly Name" - config = self._get_target_class()() - config.destination_table_friendly_name = friendly_name - expected = {"friendlyName": friendly_name} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - - def test_destination_table_friendly_name_setter_w_descr_already(self): - friendly_name = "Friendly Name" - description = "Description" - config = self._get_target_class()() - config._properties["load"]["destinationTableProperties"] = { - "description": description - } - config.destination_table_friendly_name = 
friendly_name - expected = {"friendlyName": friendly_name, "description": description} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - - def test_destination_table_friendly_name_w_none(self): - friendly_name = "Friendly Name" - description = "Description" - config = self._get_target_class()() - config._properties["load"]["destinationTableProperties"] = { - "description": description, - "friendlyName": friendly_name, - } - config.destination_table_friendly_name = None - expected = {"description": description} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - - def test_encoding_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.encoding) - - def test_encoding_hit(self): - from google.cloud.bigquery.job import Encoding - - encoding = Encoding.UTF_8 - config = self._get_target_class()() - config._properties["load"]["encoding"] = encoding - self.assertEqual(config.encoding, encoding) - - def test_encoding_setter(self): - from google.cloud.bigquery.job import Encoding - - encoding = Encoding.UTF_8 - config = self._get_target_class()() - config.encoding = encoding - self.assertEqual(config._properties["load"]["encoding"], encoding) - - def test_field_delimiter_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.field_delimiter) - - def test_field_delimiter_hit(self): - field_delimiter = "|" - config = self._get_target_class()() - config._properties["load"]["fieldDelimiter"] = field_delimiter - self.assertEqual(config.field_delimiter, field_delimiter) - - def test_field_delimiter_setter(self): - field_delimiter = "|" - config = self._get_target_class()() - config.field_delimiter = field_delimiter - self.assertEqual(config._properties["load"]["fieldDelimiter"], field_delimiter) - - def test_hive_partitioning_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.hive_partitioning) - - def 
test_hive_partitioning_hit(self): - from google.cloud.bigquery.external_config import HivePartitioningOptions - - config = self._get_target_class()() - config._properties["load"]["hivePartitioningOptions"] = { - "sourceUriPrefix": "http://foo/bar", - "mode": "STRINGS", - } - result = config.hive_partitioning - self.assertIsInstance(result, HivePartitioningOptions) - self.assertEqual(result.source_uri_prefix, "http://foo/bar") - self.assertEqual(result.mode, "STRINGS") - - def test_hive_partitioning_setter(self): - from google.cloud.bigquery.external_config import HivePartitioningOptions - - hive_partitioning = HivePartitioningOptions() - hive_partitioning.source_uri_prefix = "http://foo/bar" - hive_partitioning.mode = "AUTO" - - config = self._get_target_class()() - config.hive_partitioning = hive_partitioning - self.assertEqual( - config._properties["load"]["hivePartitioningOptions"], - {"sourceUriPrefix": "http://foo/bar", "mode": "AUTO"}, - ) - - config.hive_partitioning = None - self.assertIsNone(config._properties["load"]["hivePartitioningOptions"]) - - def test_hive_partitioning_invalid_type(self): - config = self._get_target_class()() - - with self.assertRaises(TypeError): - config.hive_partitioning = {"mode": "AUTO"} - - def test_ignore_unknown_values_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.ignore_unknown_values) - - def test_ignore_unknown_values_hit(self): - config = self._get_target_class()() - config._properties["load"]["ignoreUnknownValues"] = True - self.assertTrue(config.ignore_unknown_values) - - def test_ignore_unknown_values_setter(self): - config = self._get_target_class()() - config.ignore_unknown_values = True - self.assertTrue(config._properties["load"]["ignoreUnknownValues"]) - - def test_max_bad_records_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.max_bad_records) - - def test_max_bad_records_hit(self): - max_bad_records = 13 - config = self._get_target_class()() - 
config._properties["load"]["maxBadRecords"] = max_bad_records - self.assertEqual(config.max_bad_records, max_bad_records) - - def test_max_bad_records_setter(self): - max_bad_records = 13 - config = self._get_target_class()() - config.max_bad_records = max_bad_records - self.assertEqual(config._properties["load"]["maxBadRecords"], max_bad_records) - - def test_null_marker_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.null_marker) - - def test_null_marker_hit(self): - null_marker = "XXX" - config = self._get_target_class()() - config._properties["load"]["nullMarker"] = null_marker - self.assertEqual(config.null_marker, null_marker) - - def test_null_marker_setter(self): - null_marker = "XXX" - config = self._get_target_class()() - config.null_marker = null_marker - self.assertEqual(config._properties["load"]["nullMarker"], null_marker) - - def test_quote_character_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.quote_character) - - def test_quote_character_hit(self): - quote_character = "'" - config = self._get_target_class()() - config._properties["load"]["quote"] = quote_character - self.assertEqual(config.quote_character, quote_character) - - def test_quote_character_setter(self): - quote_character = "'" - config = self._get_target_class()() - config.quote_character = quote_character - self.assertEqual(config._properties["load"]["quote"], quote_character) - - def test_schema_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.schema) - - def test_schema_hit(self): - from google.cloud.bigquery.schema import SchemaField - - config = self._get_target_class()() - all_props_repr = { - "mode": "REQUIRED", - "name": "foo", - "type": "INTEGER", - "description": "Foo", - } - minimal_repr = {"name": "bar", "type": "STRING"} - config._properties["load"]["schema"] = { - "fields": [all_props_repr, minimal_repr] - } - all_props, minimal = config.schema - self.assertEqual(all_props, 
SchemaField.from_api_repr(all_props_repr)) - self.assertEqual(minimal, SchemaField.from_api_repr(minimal_repr)) - - def test_schema_setter_fields(self): - from google.cloud.bigquery.schema import SchemaField - - config = self._get_target_class()() - full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - age = SchemaField("age", "INTEGER", mode="REQUIRED") - config.schema = [full_name, age] - full_name_repr = { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - } - age_repr = { - "name": "age", - "type": "INTEGER", - "mode": "REQUIRED", - "description": None, - } - self.assertEqual( - config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} - ) - - def test_schema_setter_valid_mappings_list(self): - config = self._get_target_class()() - - schema = [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, - ] - config.schema = schema - - full_name_repr = { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - } - age_repr = { - "name": "age", - "type": "INTEGER", - "mode": "REQUIRED", - "description": None, - } - self.assertEqual( - config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} - ) - - def test_schema_setter_invalid_mappings_list(self): - config = self._get_target_class()() - - schema = [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "age", "typeoo": "INTEGER", "mode": "REQUIRED"}, - ] - - with self.assertRaises(Exception): - config.schema = schema - - def test_schema_setter_unsetting_schema(self): - from google.cloud.bigquery.schema import SchemaField - - config = self._get_target_class()() - config._properties["load"]["schema"] = [ - SchemaField("full_name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - - config.schema = None - self.assertNotIn("schema", config._properties["load"]) - config.schema = 
None # no error, idempotent operation - - def test_schema_update_options_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.schema_update_options) - - def test_schema_update_options_hit(self): - from google.cloud.bigquery.job import SchemaUpdateOption - - options = [ - SchemaUpdateOption.ALLOW_FIELD_ADDITION, - SchemaUpdateOption.ALLOW_FIELD_RELAXATION, - ] - config = self._get_target_class()() - config._properties["load"]["schemaUpdateOptions"] = options - self.assertEqual(config.schema_update_options, options) - - def test_schema_update_options_setter(self): - from google.cloud.bigquery.job import SchemaUpdateOption - - options = [ - SchemaUpdateOption.ALLOW_FIELD_ADDITION, - SchemaUpdateOption.ALLOW_FIELD_RELAXATION, - ] - config = self._get_target_class()() - config.schema_update_options = options - self.assertEqual(config._properties["load"]["schemaUpdateOptions"], options) - - def test_skip_leading_rows_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.skip_leading_rows) - - def test_skip_leading_rows_hit_w_str(self): - skip_leading_rows = 1 - config = self._get_target_class()() - config._properties["load"]["skipLeadingRows"] = str(skip_leading_rows) - self.assertEqual(config.skip_leading_rows, skip_leading_rows) - - def test_skip_leading_rows_hit_w_integer(self): - skip_leading_rows = 1 - config = self._get_target_class()() - config._properties["load"]["skipLeadingRows"] = skip_leading_rows - self.assertEqual(config.skip_leading_rows, skip_leading_rows) - - def test_skip_leading_rows_setter(self): - skip_leading_rows = 1 - config = self._get_target_class()() - config.skip_leading_rows = skip_leading_rows - self.assertEqual( - config._properties["load"]["skipLeadingRows"], str(skip_leading_rows) - ) - - def test_source_format_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.source_format) - - def test_source_format_hit(self): - from google.cloud.bigquery.job import 
SourceFormat - - source_format = SourceFormat.CSV - config = self._get_target_class()() - config._properties["load"]["sourceFormat"] = source_format - self.assertEqual(config.source_format, source_format) - - def test_source_format_setter(self): - from google.cloud.bigquery.job import SourceFormat - - source_format = SourceFormat.CSV - config = self._get_target_class()() - config.source_format = source_format - self.assertEqual(config._properties["load"]["sourceFormat"], source_format) - - def test_range_partitioning_w_none(self): - object_under_test = self._get_target_class()() - assert object_under_test.range_partitioning is None - - def test_range_partitioning_w_value(self): - object_under_test = self._get_target_class()() - object_under_test._properties["load"]["rangePartitioning"] = { - "field": "column_one", - "range": {"start": 1, "end": 1000, "interval": 10}, - } - object_under_test.range_partitioning.field == "column_one" - object_under_test.range_partitioning.range_.start == 1 - object_under_test.range_partitioning.range_.end == 1000 - object_under_test.range_partitioning.range_.interval == 10 - - def test_range_partitioning_setter(self): - from google.cloud.bigquery.table import PartitionRange - from google.cloud.bigquery.table import RangePartitioning - - object_under_test = self._get_target_class()() - object_under_test.range_partitioning = RangePartitioning( - field="column_one", range_=PartitionRange(start=1, end=1000, interval=10) - ) - object_under_test.range_partitioning.field == "column_one" - object_under_test.range_partitioning.range_.start == 1 - object_under_test.range_partitioning.range_.end == 1000 - object_under_test.range_partitioning.range_.interval == 10 - - def test_range_partitioning_setter_w_none(self): - object_under_test = self._get_target_class()() - object_under_test.range_partitioning = None - assert object_under_test.range_partitioning is None - - def test_range_partitioning_setter_w_wrong_type(self): - object_under_test = 
self._get_target_class()() - with pytest.raises(ValueError, match="RangePartitioning"): - object_under_test.range_partitioning = object() - - def test_time_partitioning_miss(self): - config = self._get_target_class()() - self.assertIsNone(config.time_partitioning) - - def test_time_partitioning_hit(self): - from google.cloud.bigquery.table import TimePartitioning - from google.cloud.bigquery.table import TimePartitioningType - - field = "creation_date" - year_ms = 86400 * 1000 * 365 - config = self._get_target_class()() - config._properties["load"]["timePartitioning"] = { - "type": TimePartitioningType.DAY, - "field": field, - "expirationMs": str(year_ms), - "requirePartitionFilter": False, - } - with warnings.catch_warnings(record=True) as warned: - expected = TimePartitioning( - type_=TimePartitioningType.DAY, - field=field, - expiration_ms=year_ms, - require_partition_filter=False, - ) - self.assertEqual(config.time_partitioning, expected) - - assert len(warned) == 1 - warning = warned[0] - assert "TimePartitioning.require_partition_filter" in str(warning) - - def test_time_partitioning_setter(self): - from google.cloud.bigquery.table import TimePartitioning - from google.cloud.bigquery.table import TimePartitioningType - - field = "creation_date" - year_ms = 86400 * 1000 * 365 - - with warnings.catch_warnings(record=True) as warned: - time_partitioning = TimePartitioning( - type_=TimePartitioningType.DAY, - field=field, - expiration_ms=year_ms, - require_partition_filter=False, - ) - - config = self._get_target_class()() - config.time_partitioning = time_partitioning - expected = { - "type": TimePartitioningType.DAY, - "field": field, - "expirationMs": str(year_ms), - "requirePartitionFilter": False, - } - self.assertEqual(config._properties["load"]["timePartitioning"], expected) - - assert len(warned) == 1 - warning = warned[0] - assert "TimePartitioning.require_partition_filter" in str(warning) - - def test_time_partitioning_setter_w_none(self): - from 
google.cloud.bigquery.table import TimePartitioningType - - field = "creation_date" - year_ms = 86400 * 1000 * 365 - config = self._get_target_class()() - config._properties["load"]["timePartitioning"] = { - "type": TimePartitioningType.DAY, - "field": field, - "expirationMs": str(year_ms), - "requirePartitionFilter": False, - } - config.time_partitioning = None - self.assertIsNone(config.time_partitioning) - self.assertNotIn("timePartitioning", config._properties["load"]) - - def test_use_avro_logical_types(self): - config = self._get_target_class()() - self.assertIsNone(config.use_avro_logical_types) - - def test_use_avro_logical_types_setter(self): - config = self._get_target_class()() - config.use_avro_logical_types = True - self.assertTrue(config._properties["load"]["useAvroLogicalTypes"]) - - def test_write_disposition_missing(self): - config = self._get_target_class()() - self.assertIsNone(config.write_disposition) - - def test_write_disposition_hit(self): - from google.cloud.bigquery.job import WriteDisposition - - write_disposition = WriteDisposition.WRITE_TRUNCATE - config = self._get_target_class()() - config._properties["load"]["writeDisposition"] = write_disposition - self.assertEqual(config.write_disposition, write_disposition) - - def test_write_disposition_setter(self): - from google.cloud.bigquery.job import WriteDisposition - - write_disposition = WriteDisposition.WRITE_TRUNCATE - config = self._get_target_class()() - config.write_disposition = write_disposition - self.assertEqual( - config._properties["load"]["writeDisposition"], write_disposition - ) - - -class TestLoadJob(unittest.TestCase, _Base): - JOB_TYPE = "load" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import LoadJob - - return LoadJob - - def _setUpConstants(self): - super(TestLoadJob, self)._setUpConstants() - self.INPUT_FILES = 2 - self.INPUT_BYTES = 12345 - self.OUTPUT_BYTES = 23456 - self.OUTPUT_ROWS = 345 - - def _make_resource(self, 
started=False, ended=False): - resource = super(TestLoadJob, self)._make_resource(started, ended) - config = resource["configuration"]["load"] - config["sourceUris"] = [self.SOURCE1] - config["destinationTable"] = { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.TABLE_ID, - } - - if ended: - resource["status"] = {"state": "DONE"} - resource["statistics"]["load"]["inputFiles"] = self.INPUT_FILES - resource["statistics"]["load"]["inputFileBytes"] = self.INPUT_BYTES - resource["statistics"]["load"]["outputBytes"] = self.OUTPUT_BYTES - resource["statistics"]["load"]["outputRows"] = self.OUTPUT_ROWS - - return resource - - def _verifyBooleanConfigProperties(self, job, config): - if "allowJaggedRows" in config: - self.assertEqual(job.allow_jagged_rows, config["allowJaggedRows"]) - else: - self.assertIsNone(job.allow_jagged_rows) - if "allowQuotedNewlines" in config: - self.assertEqual(job.allow_quoted_newlines, config["allowQuotedNewlines"]) - else: - self.assertIsNone(job.allow_quoted_newlines) - if "autodetect" in config: - self.assertEqual(job.autodetect, config["autodetect"]) - else: - self.assertIsNone(job.autodetect) - if "ignoreUnknownValues" in config: - self.assertEqual(job.ignore_unknown_values, config["ignoreUnknownValues"]) - else: - self.assertIsNone(job.ignore_unknown_values) - if "useAvroLogicalTypes" in config: - self.assertEqual(job.use_avro_logical_types, config["useAvroLogicalTypes"]) - else: - self.assertIsNone(job.use_avro_logical_types) - - def _verifyEnumConfigProperties(self, job, config): - if "createDisposition" in config: - self.assertEqual(job.create_disposition, config["createDisposition"]) - else: - self.assertIsNone(job.create_disposition) - if "encoding" in config: - self.assertEqual(job.encoding, config["encoding"]) - else: - self.assertIsNone(job.encoding) - if "sourceFormat" in config: - self.assertEqual(job.source_format, config["sourceFormat"]) - else: - self.assertIsNone(job.source_format) - if 
"writeDisposition" in config: - self.assertEqual(job.write_disposition, config["writeDisposition"]) - else: - self.assertIsNone(job.write_disposition) - if "schemaUpdateOptions" in config: - self.assertEqual(job.schema_update_options, config["schemaUpdateOptions"]) - else: - self.assertIsNone(job.schema_update_options) - - def _verifyResourceProperties(self, job, resource): - self._verifyReadonlyResourceProperties(job, resource) - - config = resource.get("configuration", {}).get("load") - - self._verifyBooleanConfigProperties(job, config) - self._verifyEnumConfigProperties(job, config) - - self.assertEqual(job.source_uris, config["sourceUris"]) - - table_ref = config["destinationTable"] - self.assertEqual(job.destination.project, table_ref["projectId"]) - self.assertEqual(job.destination.dataset_id, table_ref["datasetId"]) - self.assertEqual(job.destination.table_id, table_ref["tableId"]) - - if "fieldDelimiter" in config: - self.assertEqual(job.field_delimiter, config["fieldDelimiter"]) - else: - self.assertIsNone(job.field_delimiter) - if "maxBadRecords" in config: - self.assertEqual(job.max_bad_records, config["maxBadRecords"]) - else: - self.assertIsNone(job.max_bad_records) - if "nullMarker" in config: - self.assertEqual(job.null_marker, config["nullMarker"]) - else: - self.assertIsNone(job.null_marker) - if "quote" in config: - self.assertEqual(job.quote_character, config["quote"]) - else: - self.assertIsNone(job.quote_character) - if "skipLeadingRows" in config: - self.assertEqual(str(job.skip_leading_rows), config["skipLeadingRows"]) - else: - self.assertIsNone(job.skip_leading_rows) - - if "destinationEncryptionConfiguration" in config: - self.assertIsNotNone(job.destination_encryption_configuration) - self.assertEqual( - job.destination_encryption_configuration.kms_key_name, - config["destinationEncryptionConfiguration"]["kmsKeyName"], - ) - else: - self.assertIsNone(job.destination_encryption_configuration) - - def test_ctor(self): - client = 
_make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - self.assertIs(job.destination, self.TABLE_REF) - self.assertEqual(list(job.source_uris), [self.SOURCE1]) - self.assertIs(job._client, client) - self.assertEqual(job.job_type, self.JOB_TYPE) - self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) - - self._verifyInitialReadonlyProperties(job) - - # derived from resource['statistics']['load'] - self.assertIsNone(job.input_file_bytes) - self.assertIsNone(job.input_files) - self.assertIsNone(job.output_bytes) - self.assertIsNone(job.output_rows) - - # set/read from resource['configuration']['load'] - self.assertIsNone(job.schema) - self.assertIsNone(job.allow_jagged_rows) - self.assertIsNone(job.allow_quoted_newlines) - self.assertIsNone(job.autodetect) - self.assertIsNone(job.create_disposition) - self.assertIsNone(job.encoding) - self.assertIsNone(job.field_delimiter) - self.assertIsNone(job.ignore_unknown_values) - self.assertIsNone(job.max_bad_records) - self.assertIsNone(job.null_marker) - self.assertIsNone(job.quote_character) - self.assertIsNone(job.skip_leading_rows) - self.assertIsNone(job.source_format) - self.assertIsNone(job.write_disposition) - self.assertIsNone(job.destination_encryption_configuration) - self.assertIsNone(job.destination_table_description) - self.assertIsNone(job.destination_table_friendly_name) - self.assertIsNone(job.range_partitioning) - self.assertIsNone(job.time_partitioning) - self.assertIsNone(job.use_avro_logical_types) - self.assertIsNone(job.clustering_fields) - self.assertIsNone(job.schema_update_options) - - def test_ctor_w_config(self): - from google.cloud.bigquery.schema import SchemaField - from google.cloud.bigquery.job import LoadJobConfig - - client = _make_client(project=self.PROJECT) - full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - age = SchemaField("age", "INTEGER", mode="REQUIRED") - config = LoadJobConfig() - 
config.schema = [full_name, age] - job = self._make_one( - self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client, config - ) - self.assertEqual(job.schema, [full_name, age]) - config.destination_table_description = "Description" - expected = {"description": "Description"} - self.assertEqual( - config._properties["load"]["destinationTableProperties"], expected - ) - friendly_name = "Friendly Name" - config._properties["load"]["destinationTableProperties"] = { - "friendlyName": friendly_name - } - self.assertEqual(config.destination_table_friendly_name, friendly_name) - - def test_ctor_w_job_reference(self): - from google.cloud.bigquery import job - - client = _make_client(project=self.PROJECT) - job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") - load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) - self.assertEqual(load_job.project, "alternative-project") - self.assertEqual(load_job.location, "US") - - def test_done(self): - client = _make_client(project=self.PROJECT) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - self.assertTrue(job.done()) - - def test_result(self): - client = _make_client(project=self.PROJECT) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - - result = job.result() - - self.assertIs(result, job) - - def test_result_invokes_begin(self): - begun_resource = self._make_resource() - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection(begun_resource, done_resource) - client = _make_client(self.PROJECT) - client._connection = connection - - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - job.result() - - self.assertEqual(len(connection.api_request.call_args_list), 2) - begin_request, reload_request = connection.api_request.call_args_list - self.assertEqual(begin_request[1]["method"], 
"POST") - self.assertEqual(reload_request[1]["method"], "GET") - - def test_schema_setter_non_list(self): - from google.cloud.bigquery.job import LoadJobConfig - - config = LoadJobConfig() - with self.assertRaises(TypeError): - config.schema = object() - - def test_schema_setter_invalid_field(self): - from google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.schema import SchemaField - - config = LoadJobConfig() - full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - with self.assertRaises(ValueError): - config.schema = [full_name, object()] - - def test_schema_setter(self): - from google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.schema import SchemaField - - config = LoadJobConfig() - full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - age = SchemaField("age", "INTEGER", mode="REQUIRED") - config.schema = [full_name, age] - self.assertEqual(config.schema, [full_name, age]) - - def test_props_set_by_server(self): - import datetime - from google.cloud._helpers import UTC - from google.cloud._helpers import _millis - - CREATED = datetime.datetime(2015, 8, 11, 12, 13, 22, tzinfo=UTC) - STARTED = datetime.datetime(2015, 8, 11, 13, 47, 15, tzinfo=UTC) - ENDED = datetime.datetime(2015, 8, 11, 14, 47, 15, tzinfo=UTC) - FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID) - URL = "http://example.com/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - EMAIL = "phred@example.com" - ERROR_RESULT = { - "debugInfo": "DEBUG", - "location": "LOCATION", - "message": "MESSAGE", - "reason": "REASON", - } - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - job._properties["etag"] = "ETAG" - job._properties["id"] = FULL_JOB_ID - job._properties["selfLink"] = URL - job._properties["user_email"] = EMAIL - - statistics = job._properties["statistics"] = {} - statistics["creationTime"] = _millis(CREATED) - statistics["startTime"] = 
_millis(STARTED) - statistics["endTime"] = _millis(ENDED) - - self.assertEqual(job.etag, "ETAG") - self.assertEqual(job.self_link, URL) - self.assertEqual(job.user_email, EMAIL) - - self.assertEqual(job.created, CREATED) - self.assertEqual(job.started, STARTED) - self.assertEqual(job.ended, ENDED) - - # running jobs have no load stats not yet set. - self.assertIsNone(job.output_bytes) - - load_stats = statistics["load"] = {} - load_stats["inputFileBytes"] = 12345 - load_stats["inputFiles"] = 1 - load_stats["outputBytes"] = 23456 - load_stats["outputRows"] = 345 - - self.assertEqual(job.input_file_bytes, 12345) - self.assertEqual(job.input_files, 1) - self.assertEqual(job.output_bytes, 23456) - self.assertEqual(job.output_rows, 345) - - status = job._properties["status"] = {} - - self.assertIsNone(job.error_result) - self.assertIsNone(job.errors) - self.assertIsNone(job.state) - - status["errorResult"] = ERROR_RESULT - status["errors"] = [ERROR_RESULT] - status["state"] = "STATE" - - self.assertEqual(job.error_result, ERROR_RESULT) - self.assertEqual(job.errors, [ERROR_RESULT]) - self.assertEqual(job.state, "STATE") - - def test_from_api_repr_missing_identity(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = {} - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_missing_config(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": "%s:%s" % (self.PROJECT, self.JOB_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_bare(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.FULL_JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - 
"configuration": { - "load": { - "sourceUris": [self.SOURCE1], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.TABLE_ID, - }, - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_with_encryption(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.FULL_JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "load": { - "sourceUris": [self.SOURCE1], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.TABLE_ID, - }, - "destinationEncryptionConfiguration": { - "kmsKeyName": self.KMS_KEY_NAME - }, - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_w_properties(self): - from google.cloud.bigquery.job import CreateDisposition - - client = _make_client(project=self.PROJECT) - RESOURCE = self._make_resource() - load_config = RESOURCE["configuration"]["load"] - load_config["createDisposition"] = CreateDisposition.CREATE_IF_NEEDED - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_already_running(self): - conn = _make_connection() - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - job._properties["status"] = {"state": "RUNNING"} - - with self.assertRaises(ValueError): - job._begin() - - def test_begin_w_bound_client(self): - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] 
- del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - path = "/projects/{}/jobs".format(self.PROJECT) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": path}, client, job) - - conn.api_request.assert_called_once_with( - method="POST", - path=path, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "load": { - "sourceUris": [self.SOURCE1], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.TABLE_ID, - }, - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_autodetect(self): - from google.cloud.bigquery.job import LoadJobConfig - - path = "/projects/{}/jobs".format(self.PROJECT) - resource = self._make_resource() - resource["configuration"]["load"]["autodetect"] = True - # Ensure None for missing server-set props - del resource["statistics"]["creationTime"] - del resource["etag"] - del resource["selfLink"] - del resource["user_email"] - conn = _make_connection(resource) - client = _make_client(project=self.PROJECT, connection=conn) - config = LoadJobConfig() - config.autodetect = True - job = self._make_one( - self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client, config - ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": path}, client, job) - - sent = { - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "load": { - "sourceUris": [self.SOURCE1], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, 
- "tableId": self.TABLE_ID, - }, - "autodetect": True, - } - }, - } - conn.api_request.assert_called_once_with( - method="POST", path=path, data=sent, timeout=None - ) - self._verifyResourceProperties(job, resource) - - def test_begin_w_alternate_client(self): - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.job import SchemaUpdateOption - from google.cloud.bigquery.job import WriteDisposition - from google.cloud.bigquery.schema import SchemaField - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource(ended=True) - LOAD_CONFIGURATION = { - "sourceUris": [self.SOURCE1], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.TABLE_ID, - }, - "allowJaggedRows": True, - "allowQuotedNewlines": True, - "createDisposition": CreateDisposition.CREATE_NEVER, - "encoding": "ISO-8559-1", - "fieldDelimiter": "|", - "ignoreUnknownValues": True, - "maxBadRecords": 100, - "nullMarker": r"\N", - "quote": "'", - "skipLeadingRows": "1", - "sourceFormat": "CSV", - "useAvroLogicalTypes": True, - "writeDisposition": WriteDisposition.WRITE_TRUNCATE, - "schema": { - "fields": [ - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, - { - "name": "age", - "type": "INTEGER", - "mode": "REQUIRED", - "description": None, - }, - ] - }, - "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION], - } - RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - full_name = SchemaField("full_name", "STRING", mode="REQUIRED") - age = SchemaField("age", "INTEGER", mode="REQUIRED") - config = LoadJobConfig() - config.schema = [full_name, age] - job = self._make_one( - self.JOB_ID, [self.SOURCE1], 
self.TABLE_REF, client1, config - ) - config.allow_jagged_rows = True - config.allow_quoted_newlines = True - config.create_disposition = CreateDisposition.CREATE_NEVER - config.encoding = "ISO-8559-1" - config.field_delimiter = "|" - config.ignore_unknown_values = True - config.max_bad_records = 100 - config.null_marker = r"\N" - config.quote_character = "'" - config.skip_leading_rows = 1 - config.source_format = "CSV" - config.use_avro_logical_types = True - config.write_disposition = WriteDisposition.WRITE_TRUNCATE - config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION] - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - self.assertEqual(len(conn2.api_request.call_args_list), 1) - req = conn2.api_request.call_args_list[0] - self.assertEqual(req[1]["method"], "POST") - self.assertEqual(req[1]["path"], PATH) - SENT = { - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": {"load": LOAD_CONFIGURATION}, - } - self.maxDiff = None - self.assertEqual(req[1]["data"], SENT) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_job_reference(self): - from google.cloud.bigquery import job - - resource = self._make_resource() - resource["jobReference"]["projectId"] = "alternative-project" - resource["jobReference"]["location"] = "US" - job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") - conn = _make_connection(resource) - client = _make_client(project=self.PROJECT, connection=conn) - load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - load_job._begin() - final_attributes.assert_called_with( - {"path": "/projects/alternative-project/jobs"}, 
client, load_job - ) - - conn.api_request.assert_called_once() - _, request = conn.api_request.call_args - self.assertEqual(request["method"], "POST") - self.assertEqual(request["path"], "/projects/alternative-project/jobs") - self.assertEqual( - request["data"]["jobReference"]["projectId"], "alternative-project" - ) - self.assertEqual(request["data"]["jobReference"]["location"], "US") - self.assertEqual(request["data"]["jobReference"]["jobId"], self.JOB_ID) - - def test_exists_miss_w_bound_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn = _make_connection() - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertFalse(job.exists()) - - final_attributes.assert_called_with( - {"path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID)}, - client, - job, - ) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None - ) - - def test_exists_hit_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection({}) - client2 = _make_client(project=self.PROJECT, connection=conn2) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertTrue(job.exists(client=client2)) - - final_attributes.assert_called_with( - {"path": "/projects/{}/jobs/{}".format(self.PROJECT, self.JOB_ID)}, - client2, - job, - ) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None - ) - - def 
test_exists_miss_w_job_reference(self): - from google.cloud.bigquery import job - - job_ref = job._JobReference("my-job-id", "other-project", "US") - conn = _make_connection() - client = _make_client(project=self.PROJECT, connection=conn) - load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertFalse(load_job.exists()) - - final_attributes.assert_called_with( - {"path": "/projects/other-project/jobs/my-job-id"}, client, load_job - ) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/other-project/jobs/my-job-id", - query_params={"fields": "id", "location": "US"}, - timeout=None, - ) - - def test_reload_w_bound_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource() - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_reload_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource() - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - 
job.reload(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_reload_w_job_reference(self): - from google.cloud.bigquery import job - - resource = self._make_resource(ended=True) - resource["jobReference"]["projectId"] = "alternative-project" - resource["jobReference"]["location"] = "US" - job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") - conn = _make_connection(resource) - client = _make_client(project=self.PROJECT, connection=conn) - load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - load_job.reload() - - final_attributes.assert_called_with( - {"path": "/projects/alternative-project/jobs/{}".format(self.JOB_ID)}, - client, - load_job, - ) - - conn.api_request.assert_called_once_with( - method="GET", - path="/projects/alternative-project/jobs/{}".format(self.JOB_ID), - query_params={"location": "US"}, - timeout=None, - ) - - def test_cancel_w_bound_client(self): - PATH = "/projects/%s/jobs/%s/cancel" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource(ended=True) - RESPONSE = {"job": RESOURCE} - conn = _make_connection(RESPONSE) - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.cancel() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="POST", path=PATH, query_params={}, timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def 
test_cancel_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s/cancel" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource(ended=True) - RESPONSE = {"job": RESOURCE} - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESPONSE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.cancel(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="POST", path=PATH, query_params={}, timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_cancel_w_job_reference(self): - from google.cloud.bigquery import job - - resource = self._make_resource(ended=True) - resource["jobReference"]["projectId"] = "alternative-project" - resource["jobReference"]["location"] = "US" - job_ref = job._JobReference(self.JOB_ID, "alternative-project", "US") - conn = _make_connection({"job": resource}) - client = _make_client(project=self.PROJECT, connection=conn) - load_job = self._make_one(job_ref, [self.SOURCE1], self.TABLE_REF, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - load_job.cancel() - - final_attributes.assert_called_with( - { - "path": "/projects/alternative-project/jobs/{}/cancel".format( - self.JOB_ID - ) - }, - client, - load_job, - ) - conn.api_request.assert_called_once_with( - method="POST", - path="/projects/alternative-project/jobs/{}/cancel".format(self.JOB_ID), - query_params={"location": "US"}, - timeout=None, - ) - - -class TestCopyJobConfig(unittest.TestCase, _Base): - JOB_TYPE = "copy" - - @staticmethod - def _get_target_class(): - from 
google.cloud.bigquery.job import CopyJobConfig - - return CopyJobConfig - - def test_ctor_w_properties(self): - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import WriteDisposition - - create_disposition = CreateDisposition.CREATE_NEVER - write_disposition = WriteDisposition.WRITE_TRUNCATE - config = self._get_target_class()( - create_disposition=create_disposition, write_disposition=write_disposition - ) - - self.assertEqual(config.create_disposition, create_disposition) - self.assertEqual(config.write_disposition, write_disposition) - - def test_to_api_repr_with_encryption(self): - from google.cloud.bigquery.encryption_configuration import ( - EncryptionConfiguration, - ) - - config = self._make_one() - config.destination_encryption_configuration = EncryptionConfiguration( - kms_key_name=self.KMS_KEY_NAME - ) - resource = config.to_api_repr() - self.assertEqual( - resource, - { - "copy": { - "destinationEncryptionConfiguration": { - "kmsKeyName": self.KMS_KEY_NAME - } - } - }, - ) - - def test_to_api_repr_with_encryption_none(self): - config = self._make_one() - config.destination_encryption_configuration = None - resource = config.to_api_repr() - self.assertEqual( - resource, {"copy": {"destinationEncryptionConfiguration": None}} - ) - - -class TestCopyJob(unittest.TestCase, _Base): - JOB_TYPE = "copy" - SOURCE_TABLE = "source_table" - DESTINATION_TABLE = "destination_table" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import CopyJob - - return CopyJob - - def _make_resource(self, started=False, ended=False): - resource = super(TestCopyJob, self)._make_resource(started, ended) - config = resource["configuration"]["copy"] - config["sourceTables"] = [ - { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - } - ] - config["destinationTable"] = { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - } - - return 
resource - - def _verifyResourceProperties(self, job, resource): - self._verifyReadonlyResourceProperties(job, resource) - - config = resource.get("configuration", {}).get("copy") - - table_ref = config["destinationTable"] - self.assertEqual(job.destination.project, table_ref["projectId"]) - self.assertEqual(job.destination.dataset_id, table_ref["datasetId"]) - self.assertEqual(job.destination.table_id, table_ref["tableId"]) - - sources = config.get("sourceTables") - if sources is None: - sources = [config["sourceTable"]] - self.assertEqual(len(sources), len(job.sources)) - for table_ref, table in zip(sources, job.sources): - self.assertEqual(table.project, table_ref["projectId"]) - self.assertEqual(table.dataset_id, table_ref["datasetId"]) - self.assertEqual(table.table_id, table_ref["tableId"]) - - if "createDisposition" in config: - self.assertEqual(job.create_disposition, config["createDisposition"]) - else: - self.assertIsNone(job.create_disposition) - - if "writeDisposition" in config: - self.assertEqual(job.write_disposition, config["writeDisposition"]) - else: - self.assertIsNone(job.write_disposition) - - if "destinationEncryptionConfiguration" in config: - self.assertIsNotNone(job.destination_encryption_configuration) - self.assertEqual( - job.destination_encryption_configuration.kms_key_name, - config["destinationEncryptionConfiguration"]["kmsKeyName"], - ) - else: - self.assertIsNone(job.destination_encryption_configuration) - - def test_ctor(self): - client = _make_client(project=self.PROJECT) - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - job = self._make_one(self.JOB_ID, [source], destination, client) - self.assertIs(job.destination, destination) - self.assertEqual(job.sources, [source]) - self.assertIs(job._client, client) - self.assertEqual(job.job_type, self.JOB_TYPE) - self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) - - 
self._verifyInitialReadonlyProperties(job) - - # set/read from resource['configuration']['copy'] - self.assertIsNone(job.create_disposition) - self.assertIsNone(job.write_disposition) - self.assertIsNone(job.destination_encryption_configuration) - - def test_from_api_repr_missing_identity(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = {} - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_missing_config(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_bare(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "copy": { - "sourceTables": [ - { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - } - ], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - }, - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_with_encryption(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "copy": { - "sourceTables": [ - { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - } - ], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": 
self.DS_ID, - "tableId": self.DESTINATION_TABLE, - }, - "destinationEncryptionConfiguration": { - "kmsKeyName": self.KMS_KEY_NAME - }, - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_w_sourcetable(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "copy": { - "sourceTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - }, - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - }, - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_wo_sources(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "copy": { - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - } - } - }, - } - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_w_properties(self): - from google.cloud.bigquery.job import CreateDisposition - - client = _make_client(project=self.PROJECT) - RESOURCE = self._make_resource() - copy_config = RESOURCE["configuration"]["copy"] - copy_config["createDisposition"] = CreateDisposition.CREATE_IF_NEEDED - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def 
test_begin_w_bound_client(self): - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - job = self._make_one(self.JOB_ID, [source], destination, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "copy": { - "sourceTables": [ - { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - } - ], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - }, - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_alternate_client(self): - from google.cloud.bigquery.job import CopyJobConfig - - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import WriteDisposition - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource(ended=True) - COPY_CONFIGURATION = { - "sourceTables": [ - { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - } - ], - "destinationTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - }, - "createDisposition": CreateDisposition.CREATE_NEVER, - "writeDisposition": WriteDisposition.WRITE_TRUNCATE, - } - RESOURCE["configuration"]["copy"] = 
COPY_CONFIGURATION - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - config = CopyJobConfig() - config.create_disposition = CreateDisposition.CREATE_NEVER - config.write_disposition = WriteDisposition.WRITE_TRUNCATE - job = self._make_one(self.JOB_ID, [source], destination, client1, config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": {"copy": COPY_CONFIGURATION}, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_exists_miss_w_bound_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn = _make_connection() - client = _make_client(project=self.PROJECT, connection=conn) - - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - job = self._make_one(self.JOB_ID, [source], destination, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertFalse(job.exists()) - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None, - ) - - def test_exists_hit_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - 
conn2 = _make_connection({}) - client2 = _make_client(project=self.PROJECT, connection=conn2) - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - job = self._make_one(self.JOB_ID, [source], destination, client1) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertTrue(job.exists(client=client2)) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None - ) - - def test_reload_w_bound_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource() - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - job = self._make_one(self.JOB_ID, [source], destination, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_reload_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource() - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - source = self._table_ref(self.SOURCE_TABLE) - destination = self._table_ref(self.DESTINATION_TABLE) - job = self._make_one(self.JOB_ID, [source], destination, client1) - with mock.patch( - 
"google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - -class TestExtractJobConfig(unittest.TestCase, _Base): - JOB_TYPE = "extract" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import ExtractJobConfig - - return ExtractJobConfig - - def test_ctor_w_properties(self): - config = self._get_target_class()(field_delimiter="\t", print_header=True) - - self.assertEqual(config.field_delimiter, "\t") - self.assertTrue(config.print_header) - - def test_to_api_repr(self): - from google.cloud.bigquery import job - - config = self._make_one() - config.compression = job.Compression.SNAPPY - config.destination_format = job.DestinationFormat.AVRO - config.field_delimiter = "ignored for avro" - config.print_header = False - config._properties["extract"]["someNewField"] = "some-value" - config.use_avro_logical_types = True - resource = config.to_api_repr() - self.assertEqual( - resource, - { - "extract": { - "compression": "SNAPPY", - "destinationFormat": "AVRO", - "fieldDelimiter": "ignored for avro", - "printHeader": False, - "someNewField": "some-value", - "useAvroLogicalTypes": True, - } - }, - ) - - def test_from_api_repr(self): - cls = self._get_target_class() - config = cls.from_api_repr( - { - "extract": { - "compression": "NONE", - "destinationFormat": "CSV", - "fieldDelimiter": "\t", - "printHeader": True, - "someNewField": "some-value", - "useAvroLogicalTypes": False, - } - } - ) - self.assertEqual(config.compression, "NONE") - self.assertEqual(config.destination_format, "CSV") - self.assertEqual(config.field_delimiter, "\t") - self.assertEqual(config.print_header, True) - 
self.assertEqual(config._properties["extract"]["someNewField"], "some-value") - self.assertEqual(config.use_avro_logical_types, False) - - -class TestExtractJob(unittest.TestCase, _Base): - JOB_TYPE = "extract" - SOURCE_TABLE = "source_table" - DESTINATION_URI = "gs://bucket_name/object_name" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import ExtractJob - - return ExtractJob - - def _make_resource(self, started=False, ended=False): - resource = super(TestExtractJob, self)._make_resource(started, ended) - config = resource["configuration"]["extract"] - config["sourceTable"] = { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - } - config["destinationUris"] = [self.DESTINATION_URI] - return resource - - def _verifyResourceProperties(self, job, resource): - self._verifyReadonlyResourceProperties(job, resource) - - config = resource.get("configuration", {}).get("extract") - - self.assertEqual(job.destination_uris, config["destinationUris"]) - - if "sourceTable" in config: - table_ref = config["sourceTable"] - self.assertEqual(job.source.project, table_ref["projectId"]) - self.assertEqual(job.source.dataset_id, table_ref["datasetId"]) - self.assertEqual(job.source.table_id, table_ref["tableId"]) - else: - model_ref = config["sourceModel"] - self.assertEqual(job.source.project, model_ref["projectId"]) - self.assertEqual(job.source.dataset_id, model_ref["datasetId"]) - self.assertEqual(job.source.model_id, model_ref["modelId"]) - - if "compression" in config: - self.assertEqual(job.compression, config["compression"]) - else: - self.assertIsNone(job.compression) - - if "destinationFormat" in config: - self.assertEqual(job.destination_format, config["destinationFormat"]) - else: - self.assertIsNone(job.destination_format) - - if "fieldDelimiter" in config: - self.assertEqual(job.field_delimiter, config["fieldDelimiter"]) - else: - self.assertIsNone(job.field_delimiter) - - if "printHeader" in config: 
- self.assertEqual(job.print_header, config["printHeader"]) - else: - self.assertIsNone(job.print_header) - - def test_ctor(self): - from google.cloud.bigquery.table import Table - - client = _make_client(project=self.PROJECT) - source = Table(self.TABLE_REF) - job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client) - self.assertEqual(job.source.project, self.PROJECT) - self.assertEqual(job.source.dataset_id, self.DS_ID) - self.assertEqual(job.source.table_id, self.TABLE_ID) - self.assertEqual(job.destination_uris, [self.DESTINATION_URI]) - self.assertIs(job._client, client) - self.assertEqual(job.job_type, self.JOB_TYPE) - self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) - - self._verifyInitialReadonlyProperties(job) - - # set/read from resource['configuration']['extract'] - self.assertIsNone(job.compression) - self.assertIsNone(job.destination_format) - self.assertIsNone(job.field_delimiter) - self.assertIsNone(job.print_header) - - def test_destination_uri_file_counts(self): - file_counts = 23 - client = _make_client(project=self.PROJECT) - job = self._make_one( - self.JOB_ID, self.TABLE_REF, [self.DESTINATION_URI], client - ) - self.assertIsNone(job.destination_uri_file_counts) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.destination_uri_file_counts) - - extract_stats = statistics["extract"] = {} - self.assertIsNone(job.destination_uri_file_counts) - - extract_stats["destinationUriFileCounts"] = [str(file_counts)] - self.assertEqual(job.destination_uri_file_counts, [file_counts]) - - def test_from_api_repr_missing_identity(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = {} - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_missing_config(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": "%s:%s" % 
(self.PROJECT, self.DS_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_bare(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "extract": { - "sourceTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - }, - "destinationUris": [self.DESTINATION_URI], - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_for_model(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "extract": { - "sourceModel": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "modelId": "model_id", - }, - "destinationUris": [self.DESTINATION_URI], - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_w_properties(self): - from google.cloud.bigquery.job import Compression - - client = _make_client(project=self.PROJECT) - RESOURCE = self._make_resource() - extract_config = RESOURCE["configuration"]["extract"] - extract_config["compression"] = Compression.GZIP - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_bound_client(self): - from google.cloud.bigquery.dataset import DatasetReference - - PATH = "/projects/%s/jobs" 
% (self.PROJECT,) - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - source_dataset = DatasetReference(self.PROJECT, self.DS_ID) - source = source_dataset.table(self.SOURCE_TABLE) - job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "extract": { - "sourceTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - }, - "destinationUris": [self.DESTINATION_URI], - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_alternate_client(self): - from google.cloud.bigquery.dataset import DatasetReference - from google.cloud.bigquery.job import Compression - from google.cloud.bigquery.job import DestinationFormat - from google.cloud.bigquery.job import ExtractJobConfig - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource(ended=True) - EXTRACT_CONFIGURATION = { - "sourceTable": { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.SOURCE_TABLE, - }, - "destinationUris": [self.DESTINATION_URI], - "compression": Compression.GZIP, - "destinationFormat": DestinationFormat.NEWLINE_DELIMITED_JSON, - "fieldDelimiter": "|", - "printHeader": False, - } - RESOURCE["configuration"]["extract"] = EXTRACT_CONFIGURATION - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, 
connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - source_dataset = DatasetReference(self.PROJECT, self.DS_ID) - source = source_dataset.table(self.SOURCE_TABLE) - config = ExtractJobConfig() - config.compression = Compression.GZIP - config.destination_format = DestinationFormat.NEWLINE_DELIMITED_JSON - config.field_delimiter = "|" - config.print_header = False - job = self._make_one( - self.JOB_ID, source, [self.DESTINATION_URI], client1, config - ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": {"extract": EXTRACT_CONFIGURATION}, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_exists_miss_w_bound_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn = _make_connection() - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one( - self.JOB_ID, self.TABLE_REF, [self.DESTINATION_URI], client - ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertFalse(job.exists()) - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None, - ) - - def test_exists_hit_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection({}) - client2 = _make_client(project=self.PROJECT, 
connection=conn2) - job = self._make_one( - self.JOB_ID, self.TABLE_REF, [self.DESTINATION_URI], client1 - ) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertTrue(job.exists(client=client2)) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None - ) - - def test_reload_w_bound_client(self): - from google.cloud.bigquery.dataset import DatasetReference - - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource() - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - source_dataset = DatasetReference(self.PROJECT, self.DS_ID) - source = source_dataset.table(self.SOURCE_TABLE) - job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload() - - final_attributes.assert_called_with({"path": PATH}, client, job) - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_reload_w_alternate_client(self): - from google.cloud.bigquery.dataset import DatasetReference - - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - RESOURCE = self._make_resource() - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - source_dataset = DatasetReference(self.PROJECT, self.DS_ID) - source = source_dataset.table(self.SOURCE_TABLE) - job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client1) - with mock.patch( - 
"google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - -class TestQueryJobConfig(unittest.TestCase, _Base): - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import QueryJobConfig - - return QueryJobConfig - - def _make_one(self, *args, **kw): - return self._get_target_class()(*args, **kw) - - def test_ctor(self): - config = self._make_one() - self.assertEqual(config._properties, {"query": {}}) - - def test_ctor_w_none(self): - config = self._make_one() - config.default_dataset = None - config.destination = None - self.assertIsNone(config.default_dataset) - self.assertIsNone(config.destination) - - def test_ctor_w_properties(self): - config = self._get_target_class()(use_query_cache=False, use_legacy_sql=True) - - self.assertFalse(config.use_query_cache) - self.assertTrue(config.use_legacy_sql) - - def test_ctor_w_string_default_dataset(self): - from google.cloud.bigquery import dataset - - default_dataset = "default-proj.default_dset" - config = self._get_target_class()(default_dataset=default_dataset) - expected = dataset.DatasetReference.from_string(default_dataset) - self.assertEqual(config.default_dataset, expected) - - def test_ctor_w_string_destinaton(self): - from google.cloud.bigquery import table - - destination = "dest-proj.dest_dset.dest_tbl" - config = self._get_target_class()(destination=destination) - expected = table.TableReference.from_string(destination) - self.assertEqual(config.destination, expected) - - def test_default_dataset_w_string(self): - from google.cloud.bigquery import dataset - - default_dataset = "default-proj.default_dset" - config = self._make_one() - 
config.default_dataset = default_dataset - expected = dataset.DatasetReference.from_string(default_dataset) - self.assertEqual(config.default_dataset, expected) - - def test_default_dataset_w_dataset(self): - from google.cloud.bigquery import dataset - - default_dataset = "default-proj.default_dset" - expected = dataset.DatasetReference.from_string(default_dataset) - config = self._make_one() - config.default_dataset = dataset.Dataset(expected) - self.assertEqual(config.default_dataset, expected) - - def test_destinaton_w_string(self): - from google.cloud.bigquery import table - - destination = "dest-proj.dest_dset.dest_tbl" - config = self._make_one() - config.destination = destination - expected = table.TableReference.from_string(destination) - self.assertEqual(config.destination, expected) - - def test_range_partitioning_w_none(self): - object_under_test = self._get_target_class()() - assert object_under_test.range_partitioning is None - - def test_range_partitioning_w_value(self): - object_under_test = self._get_target_class()() - object_under_test._properties["query"]["rangePartitioning"] = { - "field": "column_one", - "range": {"start": 1, "end": 1000, "interval": 10}, - } - object_under_test.range_partitioning.field == "column_one" - object_under_test.range_partitioning.range_.start == 1 - object_under_test.range_partitioning.range_.end == 1000 - object_under_test.range_partitioning.range_.interval == 10 - - def test_range_partitioning_setter(self): - from google.cloud.bigquery.table import PartitionRange - from google.cloud.bigquery.table import RangePartitioning - - object_under_test = self._get_target_class()() - object_under_test.range_partitioning = RangePartitioning( - field="column_one", range_=PartitionRange(start=1, end=1000, interval=10) - ) - object_under_test.range_partitioning.field == "column_one" - object_under_test.range_partitioning.range_.start == 1 - object_under_test.range_partitioning.range_.end == 1000 - 
object_under_test.range_partitioning.range_.interval == 10 - - def test_range_partitioning_setter_w_none(self): - object_under_test = self._get_target_class()() - object_under_test.range_partitioning = None - assert object_under_test.range_partitioning is None - - def test_range_partitioning_setter_w_wrong_type(self): - object_under_test = self._get_target_class()() - with pytest.raises(ValueError, match="RangePartitioning"): - object_under_test.range_partitioning = object() - - def test_time_partitioning(self): - from google.cloud.bigquery import table - - time_partitioning = table.TimePartitioning( - type_=table.TimePartitioningType.DAY, field="name" - ) - config = self._make_one() - config.time_partitioning = time_partitioning - # TimePartitioning should be configurable after assigning - time_partitioning.expiration_ms = 10000 - - self.assertEqual(config.time_partitioning.type_, table.TimePartitioningType.DAY) - self.assertEqual(config.time_partitioning.field, "name") - self.assertEqual(config.time_partitioning.expiration_ms, 10000) - - config.time_partitioning = None - self.assertIsNone(config.time_partitioning) - - def test_clustering_fields(self): - fields = ["email", "postal_code"] - config = self._get_target_class()() - config.clustering_fields = fields - self.assertEqual(config.clustering_fields, fields) - - config.clustering_fields = None - self.assertIsNone(config.clustering_fields) - - def test_from_api_repr_empty(self): - klass = self._get_target_class() - config = klass.from_api_repr({}) - self.assertIsNone(config.dry_run) - self.assertIsNone(config.use_legacy_sql) - self.assertIsNone(config.default_dataset) - self.assertIsNone(config.destination) - self.assertIsNone(config.destination_encryption_configuration) - - def test_from_api_repr_normal(self): - from google.cloud.bigquery.dataset import DatasetReference - - resource = { - "query": { - "useLegacySql": True, - "query": "no property for me", - "defaultDataset": { - "projectId": "someproject", - 
"datasetId": "somedataset", - }, - "someNewProperty": "I should be saved, too.", - }, - "dryRun": True, - } - klass = self._get_target_class() - - config = klass.from_api_repr(resource) - - self.assertTrue(config.use_legacy_sql) - self.assertEqual( - config.default_dataset, DatasetReference("someproject", "somedataset") - ) - self.assertTrue(config.dry_run) - # Make sure unknown properties propagate. - self.assertEqual(config._properties["query"]["query"], "no property for me") - self.assertEqual( - config._properties["query"]["someNewProperty"], "I should be saved, too." - ) - - def test_to_api_repr_normal(self): - from google.cloud.bigquery.dataset import DatasetReference - - config = self._make_one() - config.use_legacy_sql = True - config.default_dataset = DatasetReference("someproject", "somedataset") - config.dry_run = False - config._properties["someNewProperty"] = "Woohoo, alpha stuff." - - resource = config.to_api_repr() - - self.assertFalse(resource["dryRun"]) - self.assertTrue(resource["query"]["useLegacySql"]) - self.assertEqual( - resource["query"]["defaultDataset"]["projectId"], "someproject" - ) - self.assertEqual( - resource["query"]["defaultDataset"]["datasetId"], "somedataset" - ) - # Make sure unknown properties propagate. 
- self.assertEqual(resource["someNewProperty"], "Woohoo, alpha stuff.") - - def test_to_api_repr_with_encryption(self): - from google.cloud.bigquery.encryption_configuration import ( - EncryptionConfiguration, - ) - - config = self._make_one() - config.destination_encryption_configuration = EncryptionConfiguration( - kms_key_name=self.KMS_KEY_NAME - ) - resource = config.to_api_repr() - self.assertEqual( - resource, - { - "query": { - "destinationEncryptionConfiguration": { - "kmsKeyName": self.KMS_KEY_NAME - } - } - }, - ) - - def test_to_api_repr_with_encryption_none(self): - config = self._make_one() - config.destination_encryption_configuration = None - resource = config.to_api_repr() - self.assertEqual( - resource, {"query": {"destinationEncryptionConfiguration": None}} - ) - - def test_from_api_repr_with_encryption(self): - resource = { - "query": { - "destinationEncryptionConfiguration": {"kmsKeyName": self.KMS_KEY_NAME} - } - } - klass = self._get_target_class() - config = klass.from_api_repr(resource) - self.assertEqual( - config.destination_encryption_configuration.kms_key_name, self.KMS_KEY_NAME - ) - - -class TestQueryJob(unittest.TestCase, _Base): - JOB_TYPE = "query" - QUERY = "select count(*) from persons" - DESTINATION_TABLE = "destination_table" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import QueryJob - - return QueryJob - - def _make_resource(self, started=False, ended=False): - resource = super(TestQueryJob, self)._make_resource(started, ended) - config = resource["configuration"]["query"] - config["query"] = self.QUERY - - if ended: - resource["status"] = {"state": "DONE"} - - return resource - - def _verifyBooleanResourceProperties(self, job, config): - - if "allowLargeResults" in config: - self.assertEqual(job.allow_large_results, config["allowLargeResults"]) - else: - self.assertIsNone(job.allow_large_results) - if "flattenResults" in config: - self.assertEqual(job.flatten_results, 
config["flattenResults"]) - else: - self.assertIsNone(job.flatten_results) - if "useQueryCache" in config: - self.assertEqual(job.use_query_cache, config["useQueryCache"]) - else: - self.assertIsNone(job.use_query_cache) - if "useLegacySql" in config: - self.assertEqual(job.use_legacy_sql, config["useLegacySql"]) - else: - self.assertIsNone(job.use_legacy_sql) - - def _verifyIntegerResourceProperties(self, job, config): - if "maximumBillingTier" in config: - self.assertEqual(job.maximum_billing_tier, config["maximumBillingTier"]) - else: - self.assertIsNone(job.maximum_billing_tier) - if "maximumBytesBilled" in config: - self.assertEqual( - str(job.maximum_bytes_billed), config["maximumBytesBilled"] - ) - self.assertIsInstance(job.maximum_bytes_billed, int) - else: - self.assertIsNone(job.maximum_bytes_billed) - - def _verify_udf_resources(self, job, config): - udf_resources = config.get("userDefinedFunctionResources", ()) - self.assertEqual(len(job.udf_resources), len(udf_resources)) - for found, expected in zip(job.udf_resources, udf_resources): - if "resourceUri" in expected: - self.assertEqual(found.udf_type, "resourceUri") - self.assertEqual(found.value, expected["resourceUri"]) - else: - self.assertEqual(found.udf_type, "inlineCode") - self.assertEqual(found.value, expected["inlineCode"]) - - def _verifyQueryParameters(self, job, config): - query_parameters = config.get("queryParameters", ()) - self.assertEqual(len(job.query_parameters), len(query_parameters)) - for found, expected in zip(job.query_parameters, query_parameters): - self.assertEqual(found.to_api_repr(), expected) - - def _verify_table_definitions(self, job, config): - table_defs = config.get("tableDefinitions") - if job.table_definitions is None: - self.assertIsNone(table_defs) - else: - self.assertEqual(len(job.table_definitions), len(table_defs)) - for found_key, found_ec in job.table_definitions.items(): - expected_ec = table_defs.get(found_key) - self.assertIsNotNone(expected_ec) - 
self.assertEqual(found_ec.to_api_repr(), expected_ec) - - def _verify_configuration_properties(self, job, configuration): - if "dryRun" in configuration: - self.assertEqual(job.dry_run, configuration["dryRun"]) - else: - self.assertIsNone(job.dry_run) - - def _verifyResourceProperties(self, job, resource): - self._verifyReadonlyResourceProperties(job, resource) - - configuration = resource.get("configuration", {}) - self._verify_configuration_properties(job, configuration) - - query_config = resource.get("configuration", {}).get("query") - self._verifyBooleanResourceProperties(job, query_config) - self._verifyIntegerResourceProperties(job, query_config) - self._verify_udf_resources(job, query_config) - self._verifyQueryParameters(job, query_config) - self._verify_table_definitions(job, query_config) - - self.assertEqual(job.query, query_config["query"]) - if "createDisposition" in query_config: - self.assertEqual(job.create_disposition, query_config["createDisposition"]) - else: - self.assertIsNone(job.create_disposition) - if "defaultDataset" in query_config: - ds_ref = job.default_dataset - ds_ref = {"projectId": ds_ref.project, "datasetId": ds_ref.dataset_id} - self.assertEqual(ds_ref, query_config["defaultDataset"]) - else: - self.assertIsNone(job.default_dataset) - if "destinationTable" in query_config: - table = job.destination - tb_ref = { - "projectId": table.project, - "datasetId": table.dataset_id, - "tableId": table.table_id, - } - self.assertEqual(tb_ref, query_config["destinationTable"]) - else: - self.assertIsNone(job.destination) - if "priority" in query_config: - self.assertEqual(job.priority, query_config["priority"]) - else: - self.assertIsNone(job.priority) - if "writeDisposition" in query_config: - self.assertEqual(job.write_disposition, query_config["writeDisposition"]) - else: - self.assertIsNone(job.write_disposition) - if "destinationEncryptionConfiguration" in query_config: - self.assertIsNotNone(job.destination_encryption_configuration) - 
self.assertEqual( - job.destination_encryption_configuration.kms_key_name, - query_config["destinationEncryptionConfiguration"]["kmsKeyName"], - ) - else: - self.assertIsNone(job.destination_encryption_configuration) - if "schemaUpdateOptions" in query_config: - self.assertEqual( - job.schema_update_options, query_config["schemaUpdateOptions"] - ) - else: - self.assertIsNone(job.schema_update_options) - - def test_ctor_defaults(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertEqual(job.query, self.QUERY) - self.assertIs(job._client, client) - self.assertEqual(job.job_type, self.JOB_TYPE) - self.assertEqual(job.path, "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)) - - self._verifyInitialReadonlyProperties(job) - - self.assertFalse(job.use_legacy_sql) - - # set/read from resource['configuration']['query'] - self.assertIsNone(job.allow_large_results) - self.assertIsNone(job.create_disposition) - self.assertIsNone(job.default_dataset) - self.assertIsNone(job.destination) - self.assertIsNone(job.flatten_results) - self.assertIsNone(job.priority) - self.assertIsNone(job.use_query_cache) - self.assertIsNone(job.dry_run) - self.assertIsNone(job.write_disposition) - self.assertIsNone(job.maximum_billing_tier) - self.assertIsNone(job.maximum_bytes_billed) - self.assertIsNone(job.table_definitions) - self.assertIsNone(job.destination_encryption_configuration) - self.assertIsNone(job.range_partitioning) - self.assertIsNone(job.time_partitioning) - self.assertIsNone(job.clustering_fields) - self.assertIsNone(job.schema_update_options) - - def test_ctor_w_udf_resources(self): - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.query import UDFResource - - RESOURCE_URI = "gs://some-bucket/js/lib.js" - udf_resources = [UDFResource("resourceUri", RESOURCE_URI)] - client = _make_client(project=self.PROJECT) - config = QueryJobConfig() - config.udf_resources = udf_resources 
- job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) - self.assertEqual(job.udf_resources, udf_resources) - - def test_ctor_w_query_parameters(self): - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.query import ScalarQueryParameter - - query_parameters = [ScalarQueryParameter("foo", "INT64", 123)] - client = _make_client(project=self.PROJECT) - config = QueryJobConfig(query_parameters=query_parameters) - job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) - self.assertEqual(job.query_parameters, query_parameters) - - def test_from_api_repr_missing_identity(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = {} - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_missing_config(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_bare(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": {"query": {"query": self.QUERY}}, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_with_encryption(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": self.JOB_ID, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": { - "query": self.QUERY, - "destinationEncryptionConfiguration": { - 
"kmsKeyName": self.KMS_KEY_NAME - }, - } - }, - } - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_from_api_repr_w_properties(self): - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import SchemaUpdateOption - from google.cloud.bigquery.job import WriteDisposition - - client = _make_client(project=self.PROJECT) - RESOURCE = self._make_resource() - query_config = RESOURCE["configuration"]["query"] - query_config["createDisposition"] = CreateDisposition.CREATE_IF_NEEDED - query_config["writeDisposition"] = WriteDisposition.WRITE_TRUNCATE - query_config["destinationTable"] = { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.DESTINATION_TABLE, - } - query_config["schemaUpdateOptions"] = [SchemaUpdateOption.ALLOW_FIELD_ADDITION] - klass = self._get_target_class() - job = klass.from_api_repr(RESOURCE, client=client) - self.assertIs(job._client, client) - self._verifyResourceProperties(job, RESOURCE) - - def test_cancelled(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - job._properties["status"] = { - "state": "DONE", - "errorResult": {"reason": "stopped"}, - } - - self.assertTrue(job.cancelled()) - - def test_done(self): - client = _make_client(project=self.PROJECT) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - self.assertTrue(job.done()) - - def test_done_w_timeout(self): - client = _make_client(project=self.PROJECT) - resource = self._make_resource(ended=False) - job = self._get_target_class().from_api_repr(resource, client) - - with mock.patch.object( - client, "_get_query_results" - ) as fake_get_results, mock.patch.object(job, "reload") as fake_reload: - job.done(timeout=42) - - fake_get_results.assert_called_once() - call_args = 
fake_get_results.call_args - self.assertEqual(call_args.kwargs.get("timeout"), 42) - - call_args = fake_reload.call_args - self.assertEqual(call_args.kwargs.get("timeout"), 42) - - def test_done_w_timeout_and_longer_internal_api_timeout(self): - client = _make_client(project=self.PROJECT) - resource = self._make_resource(ended=False) - job = self._get_target_class().from_api_repr(resource, client) - job._done_timeout = 8.8 - - with mock.patch.object( - client, "_get_query_results" - ) as fake_get_results, mock.patch.object(job, "reload") as fake_reload: - job.done(timeout=5.5) - - # The expected timeout used is simply the given timeout, as the latter - # is shorter than the job's internal done timeout. - expected_timeout = 5.5 - - fake_get_results.assert_called_once() - call_args = fake_get_results.call_args - self.assertAlmostEqual(call_args.kwargs.get("timeout"), expected_timeout) - - call_args = fake_reload.call_args - self.assertAlmostEqual(call_args.kwargs.get("timeout"), expected_timeout) - - def test_query_plan(self): - from google.cloud._helpers import _RFC3339_MICROS - from google.cloud.bigquery.job import QueryPlanEntry - from google.cloud.bigquery.job import QueryPlanEntryStep - - plan_entries = [ - { - "name": "NAME", - "id": "1234", - "inputStages": ["88", "101"], - "startMs": "1522540800000", - "endMs": "1522540804000", - "parallelInputs": "1000", - "completedParallelInputs": "5", - "waitMsAvg": "33", - "waitMsMax": "400", - "waitRatioAvg": 2.71828, - "waitRatioMax": 3.14159, - "readMsAvg": "45", - "readMsMax": "90", - "readRatioAvg": 1.41421, - "readRatioMax": 1.73205, - "computeMsAvg": "55", - "computeMsMax": "99", - "computeRatioAvg": 0.69315, - "computeRatioMax": 1.09861, - "writeMsAvg": "203", - "writeMsMax": "340", - "writeRatioAvg": 3.32193, - "writeRatioMax": 2.30258, - "recordsRead": "100", - "recordsWritten": "1", - "status": "STATUS", - "shuffleOutputBytes": "1024", - "shuffleOutputBytesSpilled": "1", - "steps": [{"kind": "KIND", 
"substeps": ["SUBSTEP1", "SUBSTEP2"]}], - } - ] - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertEqual(job.query_plan, []) - - statistics = job._properties["statistics"] = {} - self.assertEqual(job.query_plan, []) - - query_stats = statistics["query"] = {} - self.assertEqual(job.query_plan, []) - - query_stats["queryPlan"] = plan_entries - - self.assertEqual(len(job.query_plan), len(plan_entries)) - for found, expected in zip(job.query_plan, plan_entries): - self.assertIsInstance(found, QueryPlanEntry) - self.assertEqual(found.name, expected["name"]) - self.assertEqual(found.entry_id, expected["id"]) - self.assertEqual(len(found.input_stages), len(expected["inputStages"])) - for f_id in found.input_stages: - self.assertIn(f_id, [int(e) for e in expected["inputStages"]]) - self.assertEqual( - found.start.strftime(_RFC3339_MICROS), "2018-04-01T00:00:00.000000Z" - ) - self.assertEqual( - found.end.strftime(_RFC3339_MICROS), "2018-04-01T00:00:04.000000Z" - ) - self.assertEqual(found.parallel_inputs, int(expected["parallelInputs"])) - self.assertEqual( - found.completed_parallel_inputs, - int(expected["completedParallelInputs"]), - ) - self.assertEqual(found.wait_ms_avg, int(expected["waitMsAvg"])) - self.assertEqual(found.wait_ms_max, int(expected["waitMsMax"])) - self.assertEqual(found.wait_ratio_avg, expected["waitRatioAvg"]) - self.assertEqual(found.wait_ratio_max, expected["waitRatioMax"]) - self.assertEqual(found.read_ms_avg, int(expected["readMsAvg"])) - self.assertEqual(found.read_ms_max, int(expected["readMsMax"])) - self.assertEqual(found.read_ratio_avg, expected["readRatioAvg"]) - self.assertEqual(found.read_ratio_max, expected["readRatioMax"]) - self.assertEqual(found.compute_ms_avg, int(expected["computeMsAvg"])) - self.assertEqual(found.compute_ms_max, int(expected["computeMsMax"])) - self.assertEqual(found.compute_ratio_avg, expected["computeRatioAvg"]) - 
self.assertEqual(found.compute_ratio_max, expected["computeRatioMax"]) - self.assertEqual(found.write_ms_avg, int(expected["writeMsAvg"])) - self.assertEqual(found.write_ms_max, int(expected["writeMsMax"])) - self.assertEqual(found.write_ratio_avg, expected["writeRatioAvg"]) - self.assertEqual(found.write_ratio_max, expected["writeRatioMax"]) - self.assertEqual(found.records_read, int(expected["recordsRead"])) - self.assertEqual(found.records_written, int(expected["recordsWritten"])) - self.assertEqual(found.status, expected["status"]) - self.assertEqual( - found.shuffle_output_bytes, int(expected["shuffleOutputBytes"]) - ) - self.assertEqual( - found.shuffle_output_bytes_spilled, - int(expected["shuffleOutputBytesSpilled"]), - ) - - self.assertEqual(len(found.steps), len(expected["steps"])) - for f_step, e_step in zip(found.steps, expected["steps"]): - self.assertIsInstance(f_step, QueryPlanEntryStep) - self.assertEqual(f_step.kind, e_step["kind"]) - self.assertEqual(f_step.substeps, e_step["substeps"]) - - def test_total_bytes_processed(self): - total_bytes = 1234 - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.total_bytes_processed) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.total_bytes_processed) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.total_bytes_processed) - - query_stats["totalBytesProcessed"] = str(total_bytes) - self.assertEqual(job.total_bytes_processed, total_bytes) - - def test_total_bytes_billed(self): - total_bytes = 1234 - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.total_bytes_billed) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.total_bytes_billed) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.total_bytes_billed) - - query_stats["totalBytesBilled"] = str(total_bytes) - 
self.assertEqual(job.total_bytes_billed, total_bytes) - - def test_billing_tier(self): - billing_tier = 1 - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.billing_tier) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.billing_tier) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.billing_tier) - - query_stats["billingTier"] = billing_tier - self.assertEqual(job.billing_tier, billing_tier) - - def test_cache_hit(self): - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.cache_hit) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.cache_hit) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.cache_hit) - - query_stats["cacheHit"] = True - self.assertTrue(job.cache_hit) - - def test_ddl_operation_performed(self): - op = "SKIP" - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.ddl_operation_performed) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.ddl_operation_performed) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.ddl_operation_performed) - - query_stats["ddlOperationPerformed"] = op - self.assertEqual(job.ddl_operation_performed, op) - - def test_ddl_target_routine(self): - from google.cloud.bigquery.routine import RoutineReference - - ref_routine = { - "projectId": self.PROJECT, - "datasetId": "ddl_ds", - "routineId": "targetroutine", - } - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.ddl_target_routine) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.ddl_target_routine) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.ddl_target_routine) - - query_stats["ddlTargetRoutine"] = 
ref_routine - self.assertIsInstance(job.ddl_target_routine, RoutineReference) - self.assertEqual(job.ddl_target_routine.routine_id, "targetroutine") - self.assertEqual(job.ddl_target_routine.dataset_id, "ddl_ds") - self.assertEqual(job.ddl_target_routine.project, self.PROJECT) - - def test_ddl_target_table(self): - from google.cloud.bigquery.table import TableReference - - ref_table = { - "projectId": self.PROJECT, - "datasetId": "ddl_ds", - "tableId": "targettable", - } - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.ddl_target_table) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.ddl_target_table) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.ddl_target_table) - - query_stats["ddlTargetTable"] = ref_table - self.assertIsInstance(job.ddl_target_table, TableReference) - self.assertEqual(job.ddl_target_table.table_id, "targettable") - self.assertEqual(job.ddl_target_table.dataset_id, "ddl_ds") - self.assertEqual(job.ddl_target_table.project, self.PROJECT) - - def test_num_dml_affected_rows(self): - num_rows = 1234 - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.num_dml_affected_rows) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.num_dml_affected_rows) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.num_dml_affected_rows) - - query_stats["numDmlAffectedRows"] = str(num_rows) - self.assertEqual(job.num_dml_affected_rows, num_rows) - - def test_slot_millis(self): - millis = 1234 - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.slot_millis) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.slot_millis) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.slot_millis) - - query_stats["totalSlotMs"] = 
millis - self.assertEqual(job.slot_millis, millis) - - def test_statement_type(self): - statement_type = "SELECT" - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.statement_type) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.statement_type) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.statement_type) - - query_stats["statementType"] = statement_type - self.assertEqual(job.statement_type, statement_type) - - def test_referenced_tables(self): - from google.cloud.bigquery.table import TableReference - - ref_tables_resource = [ - {"projectId": self.PROJECT, "datasetId": "dataset", "tableId": "local1"}, - {"projectId": self.PROJECT, "datasetId": "dataset", "tableId": "local2"}, - { - "projectId": "other-project-123", - "datasetId": "other-dataset", - "tableId": "other-table", - }, - ] - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertEqual(job.referenced_tables, []) - - statistics = job._properties["statistics"] = {} - self.assertEqual(job.referenced_tables, []) - - query_stats = statistics["query"] = {} - self.assertEqual(job.referenced_tables, []) - - query_stats["referencedTables"] = ref_tables_resource - - local1, local2, remote = job.referenced_tables - - self.assertIsInstance(local1, TableReference) - self.assertEqual(local1.table_id, "local1") - self.assertEqual(local1.dataset_id, "dataset") - self.assertEqual(local1.project, self.PROJECT) - - self.assertIsInstance(local2, TableReference) - self.assertEqual(local2.table_id, "local2") - self.assertEqual(local2.dataset_id, "dataset") - self.assertEqual(local2.project, self.PROJECT) - - self.assertIsInstance(remote, TableReference) - self.assertEqual(remote.table_id, "other-table") - self.assertEqual(remote.dataset_id, "other-dataset") - self.assertEqual(remote.project, "other-project-123") - - def test_timeline(self): - 
timeline_resource = [ - { - "elapsedMs": 1, - "activeUnits": 22, - "pendingUnits": 33, - "completedUnits": 44, - "totalSlotMs": 101, - } - ] - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertEqual(job.timeline, []) - - statistics = job._properties["statistics"] = {} - self.assertEqual(job.timeline, []) - - query_stats = statistics["query"] = {} - self.assertEqual(job.timeline, []) - - query_stats["timeline"] = timeline_resource - - self.assertEqual(len(job.timeline), len(timeline_resource)) - self.assertEqual(job.timeline[0].elapsed_ms, 1) - self.assertEqual(job.timeline[0].active_units, 22) - self.assertEqual(job.timeline[0].pending_units, 33) - self.assertEqual(job.timeline[0].completed_units, 44) - self.assertEqual(job.timeline[0].slot_millis, 101) - - def test_undeclared_query_parameters(self): - from google.cloud.bigquery.query import ArrayQueryParameter - from google.cloud.bigquery.query import ScalarQueryParameter - from google.cloud.bigquery.query import StructQueryParameter - - undeclared = [ - { - "name": "my_scalar", - "parameterType": {"type": "STRING"}, - "parameterValue": {"value": "value"}, - }, - { - "name": "my_array", - "parameterType": {"type": "ARRAY", "arrayType": {"type": "INT64"}}, - "parameterValue": { - "arrayValues": [{"value": "1066"}, {"value": "1745"}] - }, - }, - { - "name": "my_struct", - "parameterType": { - "type": "STRUCT", - "structTypes": [{"name": "count", "type": {"type": "INT64"}}], - }, - "parameterValue": {"structValues": {"count": {"value": "123"}}}, - }, - ] - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertEqual(job.undeclared_query_parameters, []) - - statistics = job._properties["statistics"] = {} - self.assertEqual(job.undeclared_query_parameters, []) - - query_stats = statistics["query"] = {} - self.assertEqual(job.undeclared_query_parameters, []) - - 
query_stats["undeclaredQueryParameters"] = undeclared - - scalar, array, struct = job.undeclared_query_parameters - - self.assertIsInstance(scalar, ScalarQueryParameter) - self.assertEqual(scalar.name, "my_scalar") - self.assertEqual(scalar.type_, "STRING") - self.assertEqual(scalar.value, "value") - - self.assertIsInstance(array, ArrayQueryParameter) - self.assertEqual(array.name, "my_array") - self.assertEqual(array.array_type, "INT64") - self.assertEqual(array.values, [1066, 1745]) - - self.assertIsInstance(struct, StructQueryParameter) - self.assertEqual(struct.name, "my_struct") - self.assertEqual(struct.struct_types, {"count": "INT64"}) - self.assertEqual(struct.struct_values, {"count": 123}) - - def test_estimated_bytes_processed(self): - est_bytes = 123456 - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, self.QUERY, client) - self.assertIsNone(job.estimated_bytes_processed) - - statistics = job._properties["statistics"] = {} - self.assertIsNone(job.estimated_bytes_processed) - - query_stats = statistics["query"] = {} - self.assertIsNone(job.estimated_bytes_processed) - - query_stats["estimatedBytesProcessed"] = str(est_bytes) - self.assertEqual(job.estimated_bytes_processed, est_bytes) - - def test_result(self): - from google.cloud.bigquery.table import RowIterator - - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - "totalRows": "2", - } - tabledata_resource = { - # Explicitly set totalRows to be different from the query response. - # to test update during iteration. 
- "totalRows": "1", - "pageToken": None, - "rows": [{"f": [{"v": "abc"}]}], - } - connection = _make_connection(query_resource, tabledata_resource) - client = _make_client(self.PROJECT, connection=connection) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - - result = job.result() - - self.assertIsInstance(result, RowIterator) - self.assertEqual(result.total_rows, 2) - - rows = list(result) - self.assertEqual(len(rows), 1) - self.assertEqual(rows[0].col1, "abc") - # Test that the total_rows property has changed during iteration, based - # on the response from tabledata.list. - self.assertEqual(result.total_rows, 1) - - def test_result_with_max_results(self): - from google.cloud.bigquery.table import RowIterator - - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - "totalRows": "5", - } - tabledata_resource = { - "totalRows": "5", - "pageToken": None, - "rows": [ - {"f": [{"v": "abc"}]}, - {"f": [{"v": "def"}]}, - {"f": [{"v": "ghi"}]}, - ], - } - connection = _make_connection(query_resource, tabledata_resource) - client = _make_client(self.PROJECT, connection=connection) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - - max_results = 3 - - result = job.result(max_results=max_results) - - self.assertIsInstance(result, RowIterator) - self.assertEqual(result.total_rows, 5) - - rows = list(result) - - self.assertEqual(len(rows), 3) - self.assertEqual(len(connection.api_request.call_args_list), 2) - tabledata_list_request = connection.api_request.call_args_list[1] - self.assertEqual( - tabledata_list_request[1]["query_params"]["maxResults"], max_results - ) - - def test_result_w_empty_schema(self): - from google.cloud.bigquery.table import _EmptyRowIterator - - # Destination table may have no schema for some DDL and DML 
queries. - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": []}, - } - connection = _make_connection(query_resource, query_resource) - client = _make_client(self.PROJECT, connection=connection) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - - result = job.result() - - self.assertIsInstance(result, _EmptyRowIterator) - self.assertEqual(list(result), []) - - def test_result_invokes_begins(self): - begun_resource = self._make_resource() - incomplete_resource = { - "jobComplete": False, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - query_resource = copy.deepcopy(incomplete_resource) - query_resource["jobComplete"] = True - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, - incomplete_resource, - query_resource, - done_resource, - query_resource, - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - job.result() - - self.assertEqual(len(connection.api_request.call_args_list), 4) - begin_request = connection.api_request.call_args_list[0] - query_request = connection.api_request.call_args_list[2] - reload_request = connection.api_request.call_args_list[3] - self.assertEqual(begin_request[1]["method"], "POST") - self.assertEqual(query_request[1]["method"], "GET") - self.assertEqual(reload_request[1]["method"], "GET") - - def test_result_w_timeout(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - 
connection = _make_connection(begun_resource, query_resource, done_resource) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - with freezegun.freeze_time("1970-01-01 00:00:00", tick=False): - job.result(timeout=1.0) - - self.assertEqual(len(connection.api_request.call_args_list), 3) - begin_request = connection.api_request.call_args_list[0] - query_request = connection.api_request.call_args_list[1] - reload_request = connection.api_request.call_args_list[2] - self.assertEqual(begin_request[1]["method"], "POST") - self.assertEqual(query_request[1]["method"], "GET") - self.assertEqual( - query_request[1]["path"], - "/projects/{}/queries/{}".format(self.PROJECT, self.JOB_ID), - ) - self.assertEqual(query_request[1]["query_params"]["timeoutMs"], 900) - self.assertEqual(reload_request[1]["method"], "GET") - - def test_result_w_page_size(self): - # Arrange - query_results_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - "totalRows": "4", - } - job_resource = self._make_resource(started=True, ended=True) - q_config = job_resource["configuration"]["query"] - q_config["destinationTable"] = { - "projectId": self.PROJECT, - "datasetId": self.DS_ID, - "tableId": self.TABLE_ID, - } - tabledata_resource = { - "totalRows": 4, - "pageToken": "some-page-token", - "rows": [ - {"f": [{"v": "row1"}]}, - {"f": [{"v": "row2"}]}, - {"f": [{"v": "row3"}]}, - ], - } - tabledata_resource_page_2 = {"totalRows": 4, "rows": [{"f": [{"v": "row4"}]}]} - conn = _make_connection( - query_results_resource, tabledata_resource, tabledata_resource_page_2 - ) - client = _make_client(self.PROJECT, connection=conn) - job = self._get_target_class().from_api_repr(job_resource, client) - - # Act - result = job.result(page_size=3) - - # Assert - actual_rows = list(result) - self.assertEqual(len(actual_rows), 4) - 
- tabledata_path = "/projects/%s/datasets/%s/tables/%s/data" % ( - self.PROJECT, - self.DS_ID, - self.TABLE_ID, - ) - conn.api_request.assert_has_calls( - [ - mock.call( - method="GET", - path=tabledata_path, - query_params={"maxResults": 3}, - timeout=None, - ), - mock.call( - method="GET", - path=tabledata_path, - query_params={"pageToken": "some-page-token", "maxResults": 3}, - timeout=None, - ), - ] - ) - - def test_result_with_start_index(self): - from google.cloud.bigquery.table import RowIterator - - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - "totalRows": "5", - } - tabledata_resource = { - "totalRows": "5", - "pageToken": None, - "rows": [ - {"f": [{"v": "abc"}]}, - {"f": [{"v": "def"}]}, - {"f": [{"v": "ghi"}]}, - {"f": [{"v": "jkl"}]}, - ], - } - connection = _make_connection(query_resource, tabledata_resource) - client = _make_client(self.PROJECT, connection=connection) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - - start_index = 1 - - result = job.result(start_index=start_index) - - self.assertIsInstance(result, RowIterator) - self.assertEqual(result.total_rows, 5) - - rows = list(result) - - self.assertEqual(len(rows), 4) - self.assertEqual(len(connection.api_request.call_args_list), 2) - tabledata_list_request = connection.api_request.call_args_list[1] - self.assertEqual( - tabledata_list_request[1]["query_params"]["startIndex"], start_index - ) - - def test_result_error(self): - from google.cloud import exceptions - - query = textwrap.dedent( - """ - SELECT foo, bar - FROM table_baz - WHERE foo == bar""" - ) - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, query, client) - error_result = { - "debugInfo": "DEBUG", - "location": "LOCATION", - "message": "MESSAGE", - "reason": "invalid", - } - job._properties["status"] = { 
- "errorResult": error_result, - "errors": [error_result], - "state": "DONE", - } - job._set_future_result() - - with self.assertRaises(exceptions.GoogleCloudError) as exc_info: - job.result() - - self.assertIsInstance(exc_info.exception, exceptions.GoogleCloudError) - self.assertEqual(exc_info.exception.code, http_client.BAD_REQUEST) - - exc_job_instance = getattr(exc_info.exception, "query_job", None) - self.assertIs(exc_job_instance, job) - - full_text = str(exc_info.exception) - assert job.job_id in full_text - assert "Query Job SQL Follows" in full_text - - for i, line in enumerate(query.splitlines(), start=1): - expected_line = "{}:{}".format(i, line) - assert expected_line in full_text - - def test_result_transport_timeout_error(self): - query = textwrap.dedent( - """ - SELECT foo, bar - FROM table_baz - WHERE foo == bar""" - ) - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, query, client) - call_api_patch = mock.patch( - "google.cloud.bigquery.client.Client._call_api", - autospec=True, - side_effect=requests.exceptions.Timeout("Server response took too long."), - ) - - # Make sure that timeout errors get rebranded to concurrent futures timeout. 
- with call_api_patch, self.assertRaises(concurrent.futures.TimeoutError): - job.result(timeout=1) - - def test__begin_error(self): - from google.cloud import exceptions - - query = textwrap.dedent( - """ - SELECT foo, bar - FROM table_baz - WHERE foo == bar""" - ) - - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, query, client) - call_api_patch = mock.patch( - "google.cloud.bigquery.client.Client._call_api", - autospec=True, - side_effect=exceptions.BadRequest("Syntax error in SQL query"), - ) - - with call_api_patch, self.assertRaises(exceptions.GoogleCloudError) as exc_info: - job.result() - - self.assertIsInstance(exc_info.exception, exceptions.GoogleCloudError) - self.assertEqual(exc_info.exception.code, http_client.BAD_REQUEST) - - exc_job_instance = getattr(exc_info.exception, "query_job", None) - self.assertIs(exc_job_instance, job) - - full_text = str(exc_info.exception) - assert job.job_id in full_text - assert "Query Job SQL Follows" in full_text - - for i, line in enumerate(query.splitlines(), start=1): - expected_line = "{}:{}".format(i, line) - assert expected_line in full_text - - def test__begin_w_timeout(self): - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource() - - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, self.QUERY, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin(timeout=7.5) - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": {"query": self.QUERY, "useLegacySql": False} - }, - }, - timeout=7.5, - ) - - def test_begin_w_bound_client(self): - from google.cloud.bigquery.dataset import DatasetReference - 
from google.cloud.bigquery.job import QueryJobConfig - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - DS_ID = "DATASET" - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - - config = QueryJobConfig() - config.default_dataset = DatasetReference(self.PROJECT, DS_ID) - job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - self.assertIsNone(job.default_dataset) - self.assertEqual(job.udf_resources, []) - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": { - "query": self.QUERY, - "useLegacySql": False, - "defaultDataset": { - "projectId": self.PROJECT, - "datasetId": DS_ID, - }, - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_alternate_client(self): - from google.cloud.bigquery.dataset import DatasetReference - from google.cloud.bigquery.job import CreateDisposition - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.job import QueryPriority - from google.cloud.bigquery.job import SchemaUpdateOption - from google.cloud.bigquery.job import WriteDisposition - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - TABLE = "TABLE" - DS_ID = "DATASET" - RESOURCE = self._make_resource(ended=True) - QUERY_CONFIGURATION = { - "query": self.QUERY, - "allowLargeResults": True, - "createDisposition": CreateDisposition.CREATE_NEVER, - "defaultDataset": {"projectId": self.PROJECT, "datasetId": DS_ID}, 
- "destinationTable": { - "projectId": self.PROJECT, - "datasetId": DS_ID, - "tableId": TABLE, - }, - "flattenResults": True, - "priority": QueryPriority.INTERACTIVE, - "useQueryCache": True, - "useLegacySql": True, - "writeDisposition": WriteDisposition.WRITE_TRUNCATE, - "maximumBillingTier": 4, - "maximumBytesBilled": "123456", - "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_RELAXATION], - } - RESOURCE["configuration"]["query"] = QUERY_CONFIGURATION - RESOURCE["configuration"]["dryRun"] = True - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - dataset_ref = DatasetReference(self.PROJECT, DS_ID) - table_ref = dataset_ref.table(TABLE) - - config = QueryJobConfig() - config.allow_large_results = True - config.create_disposition = CreateDisposition.CREATE_NEVER - config.default_dataset = dataset_ref - config.destination = table_ref - config.dry_run = True - config.flatten_results = True - config.maximum_billing_tier = 4 - config.priority = QueryPriority.INTERACTIVE - config.use_legacy_sql = True - config.use_query_cache = True - config.write_disposition = WriteDisposition.WRITE_TRUNCATE - config.maximum_bytes_billed = 123456 - config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_RELAXATION] - job = self._make_one(self.JOB_ID, self.QUERY, client1, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": {"dryRun": True, "query": QUERY_CONFIGURATION}, - }, - timeout=None, - ) - self._verifyResourceProperties(job, 
RESOURCE) - - def test_begin_w_udf(self): - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.query import UDFResource - - RESOURCE_URI = "gs://some-bucket/js/lib.js" - INLINE_UDF_CODE = 'var someCode = "here";' - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - RESOURCE["configuration"]["query"]["userDefinedFunctionResources"] = [ - {"resourceUri": RESOURCE_URI}, - {"inlineCode": INLINE_UDF_CODE}, - ] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - udf_resources = [ - UDFResource("resourceUri", RESOURCE_URI), - UDFResource("inlineCode", INLINE_UDF_CODE), - ] - config = QueryJobConfig() - config.udf_resources = udf_resources - config.use_legacy_sql = True - job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - self.assertEqual(job.udf_resources, udf_resources) - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": { - "query": self.QUERY, - "useLegacySql": True, - "userDefinedFunctionResources": [ - {"resourceUri": RESOURCE_URI}, - {"inlineCode": INLINE_UDF_CODE}, - ], - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_named_query_parameter(self): - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.query import ScalarQueryParameter - - query_parameters = [ScalarQueryParameter("foo", "INT64", 123)] - PATH = "/projects/%s/jobs" % (self.PROJECT,) - 
RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - config = RESOURCE["configuration"]["query"] - config["parameterMode"] = "NAMED" - config["queryParameters"] = [ - { - "name": "foo", - "parameterType": {"type": "INT64"}, - "parameterValue": {"value": "123"}, - } - ] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - jconfig = QueryJobConfig() - jconfig.query_parameters = query_parameters - job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=jconfig) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - self.assertEqual(job.query_parameters, query_parameters) - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": { - "query": self.QUERY, - "useLegacySql": False, - "parameterMode": "NAMED", - "queryParameters": config["queryParameters"], - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_positional_query_parameter(self): - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.query import ScalarQueryParameter - - query_parameters = [ScalarQueryParameter.positional("INT64", 123)] - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - config = RESOURCE["configuration"]["query"] - config["parameterMode"] = "POSITIONAL" - config["queryParameters"] = [ - {"parameterType": {"type": "INT64"}, "parameterValue": 
{"value": "123"}} - ] - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - jconfig = QueryJobConfig() - jconfig.query_parameters = query_parameters - job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=jconfig) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - self.assertEqual(job.query_parameters, query_parameters) - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": { - "query": self.QUERY, - "useLegacySql": False, - "parameterMode": "POSITIONAL", - "queryParameters": config["queryParameters"], - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_begin_w_table_defs(self): - from google.cloud.bigquery.job import QueryJobConfig - from google.cloud.bigquery.external_config import ExternalConfig - from google.cloud.bigquery.external_config import BigtableColumn - from google.cloud.bigquery.external_config import BigtableColumnFamily - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - - bt_config = ExternalConfig("BIGTABLE") - bt_config.ignore_unknown_values = True - bt_config.options.read_rowkey_as_string = True - cf = BigtableColumnFamily() - cf.family_id = "cf" - col = BigtableColumn() - col.field_name = "fn" - cf.columns = [col] - bt_config.options.column_families = [cf] - BT_CONFIG_RESOURCE = { - "sourceFormat": "BIGTABLE", - "ignoreUnknownValues": True, - "bigtableOptions": { - "readRowkeyAsString": True, - "columnFamilies": [ - {"familyId": "cf", "columns": [{"fieldName": 
"fn"}]} - ], - }, - } - CSV_CONFIG_RESOURCE = { - "sourceFormat": "CSV", - "maxBadRecords": 8, - "csvOptions": {"allowJaggedRows": True}, - } - csv_config = ExternalConfig("CSV") - csv_config.max_bad_records = 8 - csv_config.options.allow_jagged_rows = True - bt_table = "bigtable-table" - csv_table = "csv-table" - RESOURCE["configuration"]["query"]["tableDefinitions"] = { - bt_table: BT_CONFIG_RESOURCE, - csv_table: CSV_CONFIG_RESOURCE, - } - want_resource = copy.deepcopy(RESOURCE) - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - config = QueryJobConfig() - config.table_definitions = {bt_table: bt_config, csv_table: csv_config} - config.use_legacy_sql = True - job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": { - "query": self.QUERY, - "useLegacySql": True, - "tableDefinitions": { - bt_table: BT_CONFIG_RESOURCE, - csv_table: CSV_CONFIG_RESOURCE, - }, - } - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, want_resource) - - def test_dry_run_query(self): - from google.cloud.bigquery.job import QueryJobConfig - - PATH = "/projects/%s/jobs" % (self.PROJECT,) - RESOURCE = self._make_resource() - # Ensure None for missing server-set props - del RESOURCE["statistics"]["creationTime"] - del RESOURCE["etag"] - del RESOURCE["selfLink"] - del RESOURCE["user_email"] - RESOURCE["configuration"]["dryRun"] = True - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - config = QueryJobConfig() - config.dry_run = True - job = self._make_one(self.JOB_ID, self.QUERY, 
client, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job._begin() - - final_attributes.assert_called_with({"path": PATH}, client, job) - self.assertEqual(job.udf_resources, []) - conn.api_request.assert_called_once_with( - method="POST", - path=PATH, - data={ - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "configuration": { - "query": {"query": self.QUERY, "useLegacySql": False}, - "dryRun": True, - }, - }, - timeout=None, - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_exists_miss_w_bound_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn = _make_connection() - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one(self.JOB_ID, self.QUERY, client) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertFalse(job.exists()) - - final_attributes.assert_called_with({"path": PATH}, client, job) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None - ) - - def test_exists_hit_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection({}) - client2 = _make_client(project=self.PROJECT, connection=conn2) - job = self._make_one(self.JOB_ID, self.QUERY, client1) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - self.assertTrue(job.exists(client=client2)) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={"fields": "id"}, timeout=None - ) - - def test_reload_w_bound_client(self): - 
from google.cloud.bigquery.dataset import DatasetReference - from google.cloud.bigquery.job import QueryJobConfig - - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - DS_ID = "DATASET" - DEST_TABLE = "dest_table" - RESOURCE = self._make_resource() - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - dataset_ref = DatasetReference(self.PROJECT, DS_ID) - table_ref = dataset_ref.table(DEST_TABLE) - config = QueryJobConfig() - config.destination = table_ref - job = self._make_one(self.JOB_ID, None, client, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload() - - final_attributes.assert_called_with({"path": PATH}, client, job) - - self.assertNotEqual(job.destination, table_ref) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - self._verifyResourceProperties(job, RESOURCE) - - def test_reload_w_alternate_client(self): - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - DS_ID = "DATASET" - DEST_TABLE = "dest_table" - RESOURCE = self._make_resource() - q_config = RESOURCE["configuration"]["query"] - q_config["destinationTable"] = { - "projectId": self.PROJECT, - "datasetId": DS_ID, - "tableId": DEST_TABLE, - } - conn1 = _make_connection() - client1 = _make_client(project=self.PROJECT, connection=conn1) - conn2 = _make_connection(RESOURCE) - client2 = _make_client(project=self.PROJECT, connection=conn2) - job = self._make_one(self.JOB_ID, self.QUERY, client1) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload(client=client2) - - final_attributes.assert_called_with({"path": PATH}, client2, job) - - conn1.api_request.assert_not_called() - conn2.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=None - ) - 
self._verifyResourceProperties(job, RESOURCE) - - def test_reload_w_timeout(self): - from google.cloud.bigquery.dataset import DatasetReference - from google.cloud.bigquery.job import QueryJobConfig - - PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID) - DS_ID = "DATASET" - DEST_TABLE = "dest_table" - RESOURCE = self._make_resource() - conn = _make_connection(RESOURCE) - client = _make_client(project=self.PROJECT, connection=conn) - dataset_ref = DatasetReference(self.PROJECT, DS_ID) - table_ref = dataset_ref.table(DEST_TABLE) - config = QueryJobConfig() - config.destination = table_ref - job = self._make_one(self.JOB_ID, None, client, job_config=config) - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - job.reload(timeout=4.2) - - final_attributes.assert_called_with({"path": PATH}, client, job) - - self.assertNotEqual(job.destination, table_ref) - - conn.api_request.assert_called_once_with( - method="GET", path=PATH, query_params={}, timeout=4.2 - ) - - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_to_arrow(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "4", - "schema": { - "fields": [ - { - "name": "spouse_1", - "type": "RECORD", - "fields": [ - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, - ], - }, - { - "name": "spouse_2", - "type": "RECORD", - "fields": [ - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, - ], - }, - ] - }, - } - tabledata_resource = { - "rows": [ - { - "f": [ - {"v": {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}}, - {"v": {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}}, - ] - }, - { - "f": [ - {"v": {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}}, - {"v": {"f": [{"v": "Bharney 
Rhubble"}, {"v": "33"}]}}, - ] - }, - ] - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, query_resource, done_resource, tabledata_resource - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - tbl = job.to_arrow(create_bqstorage_client=False) - - self.assertIsInstance(tbl, pyarrow.Table) - self.assertEqual(tbl.num_rows, 2) - - # Check the schema. - self.assertEqual(tbl.schema[0].name, "spouse_1") - self.assertEqual(tbl.schema[0].type[0].name, "name") - self.assertEqual(tbl.schema[0].type[1].name, "age") - self.assertTrue(pyarrow.types.is_struct(tbl.schema[0].type)) - self.assertTrue(pyarrow.types.is_string(tbl.schema[0].type[0].type)) - self.assertTrue(pyarrow.types.is_int64(tbl.schema[0].type[1].type)) - self.assertEqual(tbl.schema[1].name, "spouse_2") - self.assertEqual(tbl.schema[1].type[0].name, "name") - self.assertEqual(tbl.schema[1].type[1].name, "age") - self.assertTrue(pyarrow.types.is_struct(tbl.schema[1].type)) - self.assertTrue(pyarrow.types.is_string(tbl.schema[1].type[0].type)) - self.assertTrue(pyarrow.types.is_int64(tbl.schema[1].type[1].type)) - - # Check the data. 
- tbl_data = tbl.to_pydict() - spouse_1 = tbl_data["spouse_1"] - self.assertEqual( - spouse_1, - [ - {"name": "Phred Phlyntstone", "age": 32}, - {"name": "Bhettye Rhubble", "age": 27}, - ], - ) - spouse_2 = tbl_data["spouse_2"] - self.assertEqual( - spouse_2, - [ - {"name": "Wylma Phlyntstone", "age": 29}, - {"name": "Bharney Rhubble", "age": 33}, - ], - ) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "4", - "schema": { - "fields": [ - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, - ] - }, - } - tabledata_resource = { - "rows": [ - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, - {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}, - ] - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, query_resource, done_resource, tabledata_resource - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - df = job.to_dataframe(create_bqstorage_client=False) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 4) # verify the number of rows - self.assertEqual(list(df), ["name", "age"]) # verify the column names - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_ddl_query(self): - # Destination table may have no schema for some DDL and DML queries. 
- query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": []}, - } - connection = _make_connection(query_resource) - client = _make_client(self.PROJECT, connection=connection) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - - df = job.to_dataframe() - - self.assertEqual(len(df), 0) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" - ) - def test_to_dataframe_bqstorage(self): - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "4", - "schema": { - "fields": [ - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, - ] - }, - } - connection = _make_connection(query_resource) - client = _make_client(self.PROJECT, connection=connection) - resource = self._make_resource(ended=True) - job = self._get_target_class().from_api_repr(resource, client) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession() - session.avro_schema.schema = json.dumps( - { - "type": "record", - "name": "__root__", - "fields": [ - {"name": "name", "type": ["null", "string"]}, - {"name": "age", "type": ["null", "long"]}, - ], - } - ) - bqstorage_client.create_read_session.return_value = session - - job.to_dataframe(bqstorage_client=bqstorage_client) - - destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format( - **resource["configuration"]["query"]["destinationTable"] - ) - expected_session = bigquery_storage_v1.types.ReadSession( - table=destination_table, - data_format=bigquery_storage_v1.enums.DataFormat.ARROW, - ) - bqstorage_client.create_read_session.assert_called_once_with( - 
parent="projects/{}".format(self.PROJECT), - read_session=expected_session, - max_stream_count=0, # Use default number of streams for best performance. - ) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_column_dtypes(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "4", - "schema": { - "fields": [ - {"name": "start_timestamp", "type": "TIMESTAMP"}, - {"name": "seconds", "type": "INT64"}, - {"name": "miles", "type": "FLOAT64"}, - {"name": "km", "type": "FLOAT64"}, - {"name": "payment_type", "type": "STRING"}, - {"name": "complete", "type": "BOOL"}, - {"name": "date", "type": "DATE"}, - ] - }, - } - row_data = [ - [ - "1.4338368E9", - "420", - "1.1", - "1.77", - "Cto_dataframeash", - "true", - "1999-12-01", - ], - ["1.3878117E9", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"], - ["1.3855653E9", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"], - ] - rows = [{"f": [{"v": field} for field in row]} for row in row_data] - query_resource["rows"] = rows - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, query_resource, done_resource, query_resource - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - df = job.to_dataframe(dtypes={"km": "float16"}, create_bqstorage_client=False) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 3) # verify the number of rows - exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] - self.assertEqual(list(df), exp_columns) # verify the column names - - self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]") - self.assertEqual(df.seconds.dtype.name, "int64") - self.assertEqual(df.miles.dtype.name, "float64") - 
self.assertEqual(df.km.dtype.name, "float16") - self.assertEqual(df.payment_type.dtype.name, "object") - self.assertEqual(df.complete.dtype.name, "bool") - self.assertEqual(df.date.dtype.name, "object") - - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_column_date_dtypes(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "1", - "schema": {"fields": [{"name": "date", "type": "DATE"}]}, - } - row_data = [ - ["1999-12-01"], - ] - rows = [{"f": [{"v": field} for field in row]} for row in row_data] - query_resource["rows"] = rows - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, query_resource, done_resource, query_resource - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 1) # verify the number of rows - exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] - self.assertEqual(list(df), exp_columns) # verify the column names - - self.assertEqual(df.date.dtype.name, "datetime64[ns]") - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_column_date_dtypes_wo_pyarrow(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "1", - "schema": {"fields": [{"name": "date", "type": "DATE"}]}, - } - row_data = [ - ["1999-12-01"], - ] - rows = [{"f": [{"v": field} for field in row]} for row in row_data] - query_resource["rows"] = rows - done_resource = 
copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, query_resource, done_resource, query_resource - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - with warnings.catch_warnings(record=True) as warned: - df = job.to_dataframe( - date_as_object=False, create_bqstorage_client=False - ) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 1) # verify the number of rows - exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] - self.assertEqual(list(df), exp_columns) # verify the column names - - self.assertEqual(df.date.dtype.name, "object") - - assert len(warned) == 1 - warning = warned[0] - assert "without pyarrow" in str(warning) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(tqdm is None, "Requires `tqdm`") - @mock.patch("tqdm.tqdm") - def test_to_dataframe_with_progress_bar(self, tqdm_mock): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "4", - "schema": { - "fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE"}] - }, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, - query_resource, - done_resource, - query_resource, - query_resource, - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - job.to_dataframe(progress_bar_type=None, create_bqstorage_client=False) - tqdm_mock.assert_not_called() - - job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False) - tqdm_mock.assert_called() - - def test_iter(self): - import types - - begun_resource = 
self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "0", - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection(begun_resource, query_resource, done_resource) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - self.assertIsInstance(iter(job), types.GeneratorType) - - -class TestQueryPlanEntryStep(unittest.TestCase, _Base): - KIND = "KIND" - SUBSTEPS = ("SUB1", "SUB2") - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import QueryPlanEntryStep - - return QueryPlanEntryStep - - def _make_one(self, *args, **kw): - return self._get_target_class()(*args, **kw) - - def test_ctor(self): - step = self._make_one(self.KIND, self.SUBSTEPS) - self.assertEqual(step.kind, self.KIND) - self.assertEqual(step.substeps, list(self.SUBSTEPS)) - - def test_from_api_repr_empty(self): - klass = self._get_target_class() - step = klass.from_api_repr({}) - self.assertIsNone(step.kind) - self.assertEqual(step.substeps, []) - - def test_from_api_repr_normal(self): - resource = {"kind": self.KIND, "substeps": self.SUBSTEPS} - klass = self._get_target_class() - step = klass.from_api_repr(resource) - self.assertEqual(step.kind, self.KIND) - self.assertEqual(step.substeps, list(self.SUBSTEPS)) - - def test___eq___mismatched_type(self): - step = self._make_one(self.KIND, self.SUBSTEPS) - self.assertNotEqual(step, object()) - - def test___eq___mismatch_kind(self): - step = self._make_one(self.KIND, self.SUBSTEPS) - other = self._make_one("OTHER", self.SUBSTEPS) - self.assertNotEqual(step, other) - - def test___eq___mismatch_substeps(self): - step = self._make_one(self.KIND, self.SUBSTEPS) - other = self._make_one(self.KIND, ()) - self.assertNotEqual(step, other) - 
- def test___eq___hit(self): - step = self._make_one(self.KIND, self.SUBSTEPS) - other = self._make_one(self.KIND, self.SUBSTEPS) - self.assertEqual(step, other) - - def test___eq___wrong_type(self): - step = self._make_one(self.KIND, self.SUBSTEPS) - self.assertFalse(step == "hello") - - -class TestQueryPlanEntry(unittest.TestCase, _Base): - NAME = "NAME" - ENTRY_ID = 1234 - START_MS = 1522540800000 - END_MS = 1522540804000 - INPUT_STAGES = (88, 101) - PARALLEL_INPUTS = 1000 - COMPLETED_PARALLEL_INPUTS = 5 - WAIT_MS_AVG = 33 - WAIT_MS_MAX = 400 - WAIT_RATIO_AVG = 2.71828 - WAIT_RATIO_MAX = 3.14159 - READ_MS_AVG = 45 - READ_MS_MAX = 90 - READ_RATIO_AVG = 1.41421 - READ_RATIO_MAX = 1.73205 - COMPUTE_MS_AVG = 55 - COMPUTE_MS_MAX = 99 - COMPUTE_RATIO_AVG = 0.69315 - COMPUTE_RATIO_MAX = 1.09861 - WRITE_MS_AVG = 203 - WRITE_MS_MAX = 340 - WRITE_RATIO_AVG = 3.32193 - WRITE_RATIO_MAX = 2.30258 - RECORDS_READ = 100 - RECORDS_WRITTEN = 1 - STATUS = "STATUS" - SHUFFLE_OUTPUT_BYTES = 1024 - SHUFFLE_OUTPUT_BYTES_SPILLED = 1 - - START_RFC3339_MICROS = "2018-04-01T00:00:00.000000Z" - END_RFC3339_MICROS = "2018-04-01T00:00:04.000000Z" - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import QueryPlanEntry - - return QueryPlanEntry - - def test_from_api_repr_empty(self): - klass = self._get_target_class() - - entry = klass.from_api_repr({}) - - self.assertIsNone(entry.name) - self.assertIsNone(entry.entry_id) - self.assertEqual(entry.input_stages, []) - self.assertIsNone(entry.start) - self.assertIsNone(entry.end) - self.assertIsNone(entry.parallel_inputs) - self.assertIsNone(entry.completed_parallel_inputs) - self.assertIsNone(entry.wait_ms_avg) - self.assertIsNone(entry.wait_ms_max) - self.assertIsNone(entry.wait_ratio_avg) - self.assertIsNone(entry.wait_ratio_max) - self.assertIsNone(entry.read_ms_avg) - self.assertIsNone(entry.read_ms_max) - self.assertIsNone(entry.read_ratio_avg) - self.assertIsNone(entry.read_ratio_max) - 
self.assertIsNone(entry.compute_ms_avg) - self.assertIsNone(entry.compute_ms_max) - self.assertIsNone(entry.compute_ratio_avg) - self.assertIsNone(entry.compute_ratio_max) - self.assertIsNone(entry.write_ms_avg) - self.assertIsNone(entry.write_ms_max) - self.assertIsNone(entry.write_ratio_avg) - self.assertIsNone(entry.write_ratio_max) - self.assertIsNone(entry.records_read) - self.assertIsNone(entry.records_written) - self.assertIsNone(entry.status) - self.assertIsNone(entry.shuffle_output_bytes) - self.assertIsNone(entry.shuffle_output_bytes_spilled) - self.assertEqual(entry.steps, []) - - def test_from_api_repr_normal(self): - from google.cloud.bigquery.job import QueryPlanEntryStep - - steps = [ - QueryPlanEntryStep( - kind=TestQueryPlanEntryStep.KIND, - substeps=TestQueryPlanEntryStep.SUBSTEPS, - ) - ] - resource = { - "name": self.NAME, - "id": self.ENTRY_ID, - "inputStages": self.INPUT_STAGES, - "startMs": self.START_MS, - "endMs": self.END_MS, - "waitMsAvg": self.WAIT_MS_AVG, - "waitMsMax": self.WAIT_MS_MAX, - "waitRatioAvg": self.WAIT_RATIO_AVG, - "waitRatioMax": self.WAIT_RATIO_MAX, - "readMsAvg": self.READ_MS_AVG, - "readMsMax": self.READ_MS_MAX, - "readRatioAvg": self.READ_RATIO_AVG, - "readRatioMax": self.READ_RATIO_MAX, - "computeMsAvg": self.COMPUTE_MS_AVG, - "computeMsMax": self.COMPUTE_MS_MAX, - "computeRatioAvg": self.COMPUTE_RATIO_AVG, - "computeRatioMax": self.COMPUTE_RATIO_MAX, - "writeMsAvg": self.WRITE_MS_AVG, - "writeMsMax": self.WRITE_MS_MAX, - "writeRatioAvg": self.WRITE_RATIO_AVG, - "writeRatioMax": self.WRITE_RATIO_MAX, - "recordsRead": self.RECORDS_READ, - "recordsWritten": self.RECORDS_WRITTEN, - "status": self.STATUS, - "shuffleOutputBytes": self.SHUFFLE_OUTPUT_BYTES, - "shuffleOutputBytesSpilled": self.SHUFFLE_OUTPUT_BYTES_SPILLED, - "steps": [ - { - "kind": TestQueryPlanEntryStep.KIND, - "substeps": TestQueryPlanEntryStep.SUBSTEPS, - } - ], - } - klass = self._get_target_class() - - entry = klass.from_api_repr(resource) - 
self.assertEqual(entry.name, self.NAME) - self.assertEqual(entry.entry_id, self.ENTRY_ID) - self.assertEqual(entry.wait_ratio_avg, self.WAIT_RATIO_AVG) - self.assertEqual(entry.wait_ratio_max, self.WAIT_RATIO_MAX) - self.assertEqual(entry.read_ratio_avg, self.READ_RATIO_AVG) - self.assertEqual(entry.read_ratio_max, self.READ_RATIO_MAX) - self.assertEqual(entry.compute_ratio_avg, self.COMPUTE_RATIO_AVG) - self.assertEqual(entry.compute_ratio_max, self.COMPUTE_RATIO_MAX) - self.assertEqual(entry.write_ratio_avg, self.WRITE_RATIO_AVG) - self.assertEqual(entry.write_ratio_max, self.WRITE_RATIO_MAX) - self.assertEqual(entry.records_read, self.RECORDS_READ) - self.assertEqual(entry.records_written, self.RECORDS_WRITTEN) - self.assertEqual(entry.status, self.STATUS) - self.assertEqual(entry.steps, steps) - - def test_start(self): - from google.cloud._helpers import _RFC3339_MICROS - - klass = self._get_target_class() - - entry = klass.from_api_repr({}) - self.assertEqual(entry.start, None) - - entry._properties["startMs"] = self.START_MS - self.assertEqual( - entry.start.strftime(_RFC3339_MICROS), self.START_RFC3339_MICROS - ) - - def test_end(self): - from google.cloud._helpers import _RFC3339_MICROS - - klass = self._get_target_class() - - entry = klass.from_api_repr({}) - self.assertEqual(entry.end, None) - - entry._properties["endMs"] = self.END_MS - self.assertEqual(entry.end.strftime(_RFC3339_MICROS), self.END_RFC3339_MICROS) - - -class TestScriptStackFrame(unittest.TestCase, _Base): - def _make_one(self, resource): - from google.cloud.bigquery.job import ScriptStackFrame - - return ScriptStackFrame(resource) - - def test_procedure_id(self): - frame = self._make_one({"procedureId": "some-procedure"}) - self.assertEqual(frame.procedure_id, "some-procedure") - del frame._properties["procedureId"] - self.assertIsNone(frame.procedure_id) - - def test_start_line(self): - frame = self._make_one({"startLine": 5}) - self.assertEqual(frame.start_line, 5) - 
frame._properties["startLine"] = "5" - self.assertEqual(frame.start_line, 5) - - def test_start_column(self): - frame = self._make_one({"startColumn": 29}) - self.assertEqual(frame.start_column, 29) - frame._properties["startColumn"] = "29" - self.assertEqual(frame.start_column, 29) - - def test_end_line(self): - frame = self._make_one({"endLine": 9}) - self.assertEqual(frame.end_line, 9) - frame._properties["endLine"] = "9" - self.assertEqual(frame.end_line, 9) - - def test_end_column(self): - frame = self._make_one({"endColumn": 14}) - self.assertEqual(frame.end_column, 14) - frame._properties["endColumn"] = "14" - self.assertEqual(frame.end_column, 14) - - def test_text(self): - frame = self._make_one({"text": "QUERY TEXT"}) - self.assertEqual(frame.text, "QUERY TEXT") - - -class TestScriptStatistics(unittest.TestCase, _Base): - def _make_one(self, resource): - from google.cloud.bigquery.job import ScriptStatistics - - return ScriptStatistics(resource) - - def test_evalutation_kind(self): - stats = self._make_one({"evaluationKind": "EXPRESSION"}) - self.assertEqual(stats.evaluation_kind, "EXPRESSION") - self.assertEqual(stats.stack_frames, []) - - def test_stack_frames(self): - stats = self._make_one( - { - "stackFrames": [ - { - "procedureId": "some-procedure", - "startLine": 5, - "startColumn": 29, - "endLine": 9, - "endColumn": 14, - "text": "QUERY TEXT", - }, - {}, - ] - } - ) - stack_frames = stats.stack_frames - self.assertEqual(len(stack_frames), 2) - stack_frame = stack_frames[0] - self.assertEqual(stack_frame.procedure_id, "some-procedure") - self.assertEqual(stack_frame.start_line, 5) - self.assertEqual(stack_frame.start_column, 29) - self.assertEqual(stack_frame.end_line, 9) - self.assertEqual(stack_frame.end_column, 14) - self.assertEqual(stack_frame.text, "QUERY TEXT") - stack_frame = stack_frames[1] - self.assertIsNone(stack_frame.procedure_id) - self.assertIsNone(stack_frame.start_line) - self.assertIsNone(stack_frame.start_column) - 
self.assertIsNone(stack_frame.end_line) - self.assertIsNone(stack_frame.end_column) - self.assertIsNone(stack_frame.text) - - -class TestTimelineEntry(unittest.TestCase, _Base): - ELAPSED_MS = 101 - ACTIVE_UNITS = 50 - PENDING_UNITS = 98 - COMPLETED_UNITS = 520 - SLOT_MILLIS = 12029 - - @staticmethod - def _get_target_class(): - from google.cloud.bigquery.job import TimelineEntry - - return TimelineEntry - - def test_from_api_repr_empty(self): - klass = self._get_target_class() - entry = klass.from_api_repr({}) - self.assertIsNone(entry.elapsed_ms) - self.assertIsNone(entry.active_units) - self.assertIsNone(entry.pending_units) - self.assertIsNone(entry.completed_units) - self.assertIsNone(entry.slot_millis) - - def test_from_api_repr_normal(self): - resource = { - "elapsedMs": self.ELAPSED_MS, - "activeUnits": self.ACTIVE_UNITS, - "pendingUnits": self.PENDING_UNITS, - "completedUnits": self.COMPLETED_UNITS, - "totalSlotMs": self.SLOT_MILLIS, - } - klass = self._get_target_class() - - entry = klass.from_api_repr(resource) - self.assertEqual(entry.elapsed_ms, self.ELAPSED_MS) - self.assertEqual(entry.active_units, self.ACTIVE_UNITS) - self.assertEqual(entry.pending_units, self.PENDING_UNITS) - self.assertEqual(entry.completed_units, self.COMPLETED_UNITS) - self.assertEqual(entry.slot_millis, self.SLOT_MILLIS) - - -@pytest.mark.parametrize( - "query,expected", - ( - (None, False), - ("", False), - ("select name, age from table", False), - ("select name, age from table LIMIT 10;", False), - ("select name, age from table order by other_column;", True), - ("Select name, age From table Order By other_column", True), - ("SELECT name, age FROM table ORDER BY other_column;", True), - ("select name, age from table order\nby other_column", True), - ("Select name, age From table Order\nBy other_column;", True), - ("SELECT name, age FROM table ORDER\nBY other_column", True), - ("SelecT name, age froM table OrdeR \n\t BY other_column;", True), - ), -) -def 
test__contains_order_by(query, expected): - from google.cloud.bigquery import job as mut - - if expected: - assert mut._contains_order_by(query) - else: - assert not mut._contains_order_by(query) - - -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -@pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" -) -@pytest.mark.parametrize( - "query", - ( - "select name, age from table order by other_column;", - "Select name, age From table Order By other_column;", - "SELECT name, age FROM table ORDER BY other_column;", - "select name, age from table order\nby other_column;", - "Select name, age From table Order\nBy other_column;", - "SELECT name, age FROM table ORDER\nBY other_column;", - "SelecT name, age froM table OrdeR \n\t BY other_column;", - ), -) -def test_to_dataframe_bqstorage_preserve_order(query): - from google.cloud.bigquery.job import QueryJob as target_class - - job_resource = _make_job_resource( - project_id="test-project", job_type="query", ended=True - ) - job_resource["configuration"]["query"]["query"] = query - job_resource["status"] = {"state": "DONE"} - get_query_results_resource = { - "jobComplete": True, - "jobReference": {"projectId": "test-project", "jobId": "test-job"}, - "schema": { - "fields": [ - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, - ] - }, - "totalRows": "4", - } - connection = _make_connection(get_query_results_resource, job_resource) - client = _make_client(connection=connection) - job = target_class.from_api_repr(job_resource, client) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession() - session.avro_schema.schema = json.dumps( - { - "type": "record", - "name": "__root__", - "fields": [ - {"name": "name", "type": ["null", "string"]}, - {"name": "age", "type": ["null", "long"]}, - ], - } - ) - 
bqstorage_client.create_read_session.return_value = session - - job.to_dataframe(bqstorage_client=bqstorage_client) - - destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format( - **job_resource["configuration"]["query"]["destinationTable"] - ) - expected_session = bigquery_storage_v1.types.ReadSession( - table=destination_table, data_format=bigquery_storage_v1.enums.DataFormat.ARROW, - ) - bqstorage_client.create_read_session.assert_called_once_with( - parent="projects/test-project", - read_session=expected_session, - max_stream_count=1, # Use a single stream to preserve row order. - ) diff --git a/tests/unit/test_job_retry.py b/tests/unit/test_job_retry.py new file mode 100644 index 000000000..b2095d2f2 --- /dev/null +++ b/tests/unit/test_job_retry.py @@ -0,0 +1,247 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import re + +import mock +import pytest + +import google.api_core.exceptions +import google.api_core.retry + +from .helpers import make_connection + + +# With job_retry_on_query, we're testing 4 scenarios: +# - No `job_retry` passed, retry on default rateLimitExceeded. +# - Pass NotFound retry to `query`. +# - Pass NotFound retry to `result`. +# - Pass BadRequest retry to query, with the value passed to `result` overriding. 
+@pytest.mark.parametrize("job_retry_on_query", [None, "Query", "Result", "Both"]) +@mock.patch("time.sleep") +def test_retry_failed_jobs(sleep, client, job_retry_on_query): + """ + Test retry of job failures, as opposed to API-invocation failures. + """ + + retry_notfound = google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type( + google.api_core.exceptions.NotFound + ) + ) + retry_badrequest = google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type( + google.api_core.exceptions.BadRequest + ) + ) + + if job_retry_on_query is None: + reason = "rateLimitExceeded" + else: + reason = "notFound" + + err = dict(reason=reason) + responses = [ + dict(status=dict(state="DONE", errors=[err], errorResult=err)), + dict(status=dict(state="DONE", errors=[err], errorResult=err)), + dict(status=dict(state="DONE", errors=[err], errorResult=err)), + dict(status=dict(state="DONE")), + dict(rows=[{"f": [{"v": "1"}]}], totalRows="1"), + ] + + def api_request(method, path, query_params=None, data=None, **kw): + response = responses.pop(0) + if data: + response["jobReference"] = data["jobReference"] + else: + response["jobReference"] = dict( + jobId=path.split("/")[-1], projectId="PROJECT" + ) + return response + + conn = client._connection = make_connection() + conn.api_request.side_effect = api_request + + if job_retry_on_query == "Query": + job_retry = dict(job_retry=retry_notfound) + elif job_retry_on_query == "Both": + # This will be overridden in `result` + job_retry = dict(job_retry=retry_badrequest) + else: + job_retry = {} + job = client.query("select 1", **job_retry) + + orig_job_id = job.job_id + job_retry = ( + dict(job_retry=retry_notfound) + if job_retry_on_query in ("Result", "Both") + else {} + ) + result = job.result(**job_retry) + assert result.total_rows == 1 + assert not responses # We made all the calls we expected to. + + # The job adjusts its job id based on the id of the last attempt. 
+ assert job.job_id != orig_job_id + assert job.job_id == conn.mock_calls[3][2]["data"]["jobReference"]["jobId"] + + # We had to sleep three times + assert len(sleep.mock_calls) == 3 + + # Sleeps are random, however they're more than 0 + assert min(c[1][0] for c in sleep.mock_calls) > 0 + + # They're at most 2 * (multiplier**(number of sleeps - 1)) * initial + # The default multiplier is 2 + assert max(c[1][0] for c in sleep.mock_calls) <= 8 + + # We can ask for the result again: + responses = [ + dict(rows=[{"f": [{"v": "1"}]}], totalRows="1"), + ] + orig_job_id = job.job_id + result = job.result() + assert result.total_rows == 1 + assert not responses # We made all the calls we expected to. + + # We wouldn't (and didn't) fail, because we're dealing with a successful job. + # So the job id hasn't changed. + assert job.job_id == orig_job_id + + +# With job_retry_on_query, we're testing 2 scenarios: +# - Pass None retry to `query`. +# - Pass None retry to `result`. +@pytest.mark.parametrize("job_retry_on_query", ["Query", "Result"]) +@mock.patch("time.sleep") +def test_disable_retry_failed_jobs(sleep, client, job_retry_on_query): + """ + Test that passing `job_retry=None` disables retry of job failures.
+ """ + err = dict(reason="rateLimitExceeded") + responses = [dict(status=dict(state="DONE", errors=[err], errorResult=err))] * 3 + + def api_request(method, path, query_params=None, data=None, **kw): + response = responses.pop(0) + response["jobReference"] = data["jobReference"] + return response + + conn = client._connection = make_connection() + conn.api_request.side_effect = api_request + + if job_retry_on_query == "Query": + job_retry = dict(job_retry=None) + else: + job_retry = {} + job = client.query("select 1", **job_retry) + + orig_job_id = job.job_id + job_retry = dict(job_retry=None) if job_retry_on_query == "Result" else {} + with pytest.raises(google.api_core.exceptions.Forbidden): + job.result(**job_retry) + + assert job.job_id == orig_job_id + assert len(sleep.mock_calls) == 0 + + +@mock.patch("google.api_core.retry.datetime_helpers") +@mock.patch("time.sleep") +def test_retry_failed_jobs_after_retry_failed(sleep, datetime_helpers, client): + """ + If at first you don't succeed, maybe you will later. :) + """ + conn = client._connection = make_connection() + + datetime_helpers.utcnow.return_value = datetime.datetime(2021, 7, 29, 10, 43, 2) + + err = dict(reason="rateLimitExceeded") + + def api_request(method, path, query_params=None, data=None, **kw): + calls = sleep.mock_calls + if calls: + datetime_helpers.utcnow.return_value += datetime.timedelta( + seconds=calls[-1][1][0] + ) + response = dict(status=dict(state="DONE", errors=[err], errorResult=err)) + response["jobReference"] = data["jobReference"] + return response + + conn.api_request.side_effect = api_request + + job = client.query("select 1") + orig_job_id = job.job_id + + with pytest.raises(google.api_core.exceptions.RetryError): + job.result() + + # We never got a successful job, so the job id never changed: + assert job.job_id == orig_job_id + + # We failed because we couldn't succeed after 120 seconds. 
+ # But we can try again: + err2 = dict(reason="backendError") # We also retry on this + responses = [ + dict(status=dict(state="DONE", errors=[err2], errorResult=err2)), + dict(status=dict(state="DONE", errors=[err], errorResult=err)), + dict(status=dict(state="DONE", errors=[err2], errorResult=err2)), + dict(status=dict(state="DONE")), + dict(rows=[{"f": [{"v": "1"}]}], totalRows="1"), + ] + + def api_request(method, path, query_params=None, data=None, **kw): + calls = sleep.mock_calls + datetime_helpers.utcnow.return_value += datetime.timedelta( + seconds=calls[-1][1][0] + ) + response = responses.pop(0) + if data: + response["jobReference"] = data["jobReference"] + else: + response["jobReference"] = dict( + jobId=path.split("/")[-1], projectId="PROJECT" + ) + return response + + conn.api_request.side_effect = api_request + result = job.result() + assert result.total_rows == 1 + assert not responses # We made all the calls we expected to. + assert job.job_id != orig_job_id + + +def test_raises_on_job_retry_on_query_with_non_retryable_jobs(client): + with pytest.raises( + TypeError, + match=re.escape( + "`job_retry` was provided, but the returned job is" + " not retryable, because a custom `job_id` was" + " provided." + ), + ): + client.query("select 42", job_id=42, job_retry=google.api_core.retry.Retry()) + + +def test_raises_on_job_retry_on_result_with_non_retryable_jobs(client): + client._connection = make_connection({}) + job = client.query("select 42", job_id=42) + with pytest.raises( + TypeError, + match=re.escape( + "`job_retry` was provided, but this job is" + " not retryable, because a custom `job_id` was" + " provided to the query that created this job." 
+ ), + ): + job.result(job_retry=google.api_core.retry.Retry()) diff --git a/tests/unit/test_list_datasets.py b/tests/unit/test_list_datasets.py new file mode 100644 index 000000000..6f0b55c5e --- /dev/null +++ b/tests/unit/test_list_datasets.py @@ -0,0 +1,125 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mock +import pytest + +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from .helpers import make_connection + + +@pytest.mark.parametrize( + "extra,query", [({}, {}), (dict(page_size=42), dict(maxResults=42))] +) +def test_list_datasets_defaults(client, PROJECT, extra, query): + from google.cloud.bigquery.dataset import DatasetListItem + + DATASET_1 = "dataset_one" + DATASET_2 = "dataset_two" + PATH = "projects/%s/datasets" % PROJECT + TOKEN = "TOKEN" + DATA = { + "nextPageToken": TOKEN, + "datasets": [ + { + "kind": "bigquery#dataset", + "id": "%s:%s" % (PROJECT, DATASET_1), + "datasetReference": {"datasetId": DATASET_1, "projectId": PROJECT}, + "friendlyName": None, + }, + { + "kind": "bigquery#dataset", + "id": "%s:%s" % (PROJECT, DATASET_2), + "datasetReference": {"datasetId": DATASET_2, "projectId": PROJECT}, + "friendlyName": "Two", + }, + ], + } + conn = client._connection = make_connection(DATA) + + iterator = client.list_datasets(**extra) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + page = next(iterator.pages) + + 
final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) + datasets = list(page) + token = iterator.next_page_token + + assert len(datasets) == len(DATA["datasets"]) + for found, expected in zip(datasets, DATA["datasets"]): + assert isinstance(found, DatasetListItem) + assert found.full_dataset_id == expected["id"] + assert found.friendly_name == expected["friendlyName"] + assert token == TOKEN + + conn.api_request.assert_called_once_with( + method="GET", path="/%s" % PATH, query_params=query, timeout=DEFAULT_TIMEOUT + ) + + +def test_list_datasets_w_project_and_timeout(client, PROJECT): + conn = client._connection = make_connection({}) + + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + list(client.list_datasets(project="other-project", timeout=7.5)) + + final_attributes.assert_called_once_with( + {"path": "/projects/other-project/datasets"}, client, None + ) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/other-project/datasets", + query_params={}, + timeout=7.5, + ) + + +def test_list_datasets_explicit_response_missing_datasets_key(client, PROJECT): + PATH = "projects/%s/datasets" % PROJECT + TOKEN = "TOKEN" + FILTER = "FILTER" + DATA = {} + conn = client._connection = make_connection(DATA) + + iterator = client.list_datasets( + include_all=True, filter=FILTER, max_results=3, page_token=TOKEN + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + page = next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) + datasets = list(page) + token = iterator.next_page_token + + assert len(datasets) == 0 + assert token is None + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % PATH, + query_params={ + "all": True, + "filter": FILTER, + "maxResults": 3, + "pageToken": TOKEN, + }, + timeout=DEFAULT_TIMEOUT, + 
) diff --git a/tests/unit/test_list_jobs.py b/tests/unit/test_list_jobs.py new file mode 100644 index 000000000..1fb40d446 --- /dev/null +++ b/tests/unit/test_list_jobs.py @@ -0,0 +1,292 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + +import mock +import pytest + +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from .helpers import make_connection + + +@pytest.mark.parametrize( + "extra,query", [({}, {}), (dict(page_size=42), dict(maxResults=42))] +) +def test_list_jobs_defaults(client, PROJECT, DS_ID, extra, query): + from google.cloud.bigquery.job import CopyJob + from google.cloud.bigquery.job import CreateDisposition + from google.cloud.bigquery.job import ExtractJob + from google.cloud.bigquery.job import LoadJob + from google.cloud.bigquery.job import QueryJob + from google.cloud.bigquery.job import WriteDisposition + + SOURCE_TABLE = "source_table" + DESTINATION_TABLE = "destination_table" + QUERY_DESTINATION_TABLE = "query_destination_table" + SOURCE_URI = "gs://test_bucket/src_object*" + DESTINATION_URI = "gs://test_bucket/dst_object*" + JOB_TYPES = { + "load_job": LoadJob, + "copy_job": CopyJob, + "extract_job": ExtractJob, + "query_job": QueryJob, + } + PATH = "projects/%s/jobs" % PROJECT + TOKEN = "TOKEN" + QUERY = "SELECT * from test_dataset:test_table" + ASYNC_QUERY_DATA = { + "id": "%s:%s" % (PROJECT, "query_job"), + "jobReference": {"projectId": PROJECT, "jobId": "query_job"}, + "state": 
"DONE", + "configuration": { + "query": { + "query": QUERY, + "destinationTable": { + "projectId": PROJECT, + "datasetId": DS_ID, + "tableId": QUERY_DESTINATION_TABLE, + }, + "createDisposition": CreateDisposition.CREATE_IF_NEEDED, + "writeDisposition": WriteDisposition.WRITE_TRUNCATE, + } + }, + } + EXTRACT_DATA = { + "id": "%s:%s" % (PROJECT, "extract_job"), + "jobReference": {"projectId": PROJECT, "jobId": "extract_job"}, + "state": "DONE", + "configuration": { + "extract": { + "sourceTable": { + "projectId": PROJECT, + "datasetId": DS_ID, + "tableId": SOURCE_TABLE, + }, + "destinationUris": [DESTINATION_URI], + } + }, + } + COPY_DATA = { + "id": "%s:%s" % (PROJECT, "copy_job"), + "jobReference": {"projectId": PROJECT, "jobId": "copy_job"}, + "state": "DONE", + "configuration": { + "copy": { + "sourceTables": [ + {"projectId": PROJECT, "datasetId": DS_ID, "tableId": SOURCE_TABLE} + ], + "destinationTable": { + "projectId": PROJECT, + "datasetId": DS_ID, + "tableId": DESTINATION_TABLE, + }, + } + }, + } + LOAD_DATA = { + "id": "%s:%s" % (PROJECT, "load_job"), + "jobReference": {"projectId": PROJECT, "jobId": "load_job"}, + "state": "DONE", + "configuration": { + "load": { + "destinationTable": { + "projectId": PROJECT, + "datasetId": DS_ID, + "tableId": SOURCE_TABLE, + }, + "sourceUris": [SOURCE_URI], + } + }, + } + DATA = { + "nextPageToken": TOKEN, + "jobs": [ASYNC_QUERY_DATA, EXTRACT_DATA, COPY_DATA, LOAD_DATA], + } + conn = client._connection = make_connection(DATA) + + iterator = client.list_jobs(**extra) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + page = next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) + jobs = list(page) + token = iterator.next_page_token + + assert len(jobs) == len(DATA["jobs"]) + for found, expected in zip(jobs, DATA["jobs"]): + name = expected["jobReference"]["jobId"] + assert isinstance(found, 
JOB_TYPES[name]) + assert found.job_id == name + assert token == TOKEN + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % PATH, + query_params=dict({"projection": "full"}, **query), + timeout=DEFAULT_TIMEOUT, + ) + + +def test_list_jobs_load_job_wo_sourceUris(client, PROJECT, DS_ID): + from google.cloud.bigquery.job import LoadJob + + SOURCE_TABLE = "source_table" + JOB_TYPES = {"load_job": LoadJob} + PATH = "projects/%s/jobs" % PROJECT + TOKEN = "TOKEN" + LOAD_DATA = { + "id": "%s:%s" % (PROJECT, "load_job"), + "jobReference": {"projectId": PROJECT, "jobId": "load_job"}, + "state": "DONE", + "configuration": { + "load": { + "destinationTable": { + "projectId": PROJECT, + "datasetId": DS_ID, + "tableId": SOURCE_TABLE, + } + } + }, + } + DATA = {"nextPageToken": TOKEN, "jobs": [LOAD_DATA]} + conn = client._connection = make_connection(DATA) + + iterator = client.list_jobs() + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + page = next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) + jobs = list(page) + token = iterator.next_page_token + + assert len(jobs) == len(DATA["jobs"]) + for found, expected in zip(jobs, DATA["jobs"]): + name = expected["jobReference"]["jobId"] + assert isinstance(found, JOB_TYPES[name]) + assert found.job_id == name + assert token == TOKEN + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % PATH, + query_params={"projection": "full"}, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_list_jobs_explicit_missing(client, PROJECT): + PATH = "projects/%s/jobs" % PROJECT + DATA = {} + TOKEN = "TOKEN" + conn = client._connection = make_connection(DATA) + + iterator = client.list_jobs( + max_results=1000, page_token=TOKEN, all_users=True, state_filter="done" + ) + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + 
page = next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/%s" % PATH}, client, None) + jobs = list(page) + token = iterator.next_page_token + + assert len(jobs) == 0 + assert token is None + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % PATH, + query_params={ + "projection": "full", + "maxResults": 1000, + "pageToken": TOKEN, + "allUsers": True, + "stateFilter": "done", + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_list_jobs_w_project(client, PROJECT): + conn = client._connection = make_connection({}) + + list(client.list_jobs(project="other-project")) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/other-project/jobs", + query_params={"projection": "full"}, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_list_jobs_w_timeout(client, PROJECT): + conn = client._connection = make_connection({}) + + list(client.list_jobs(timeout=7.5)) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/{}/jobs".format(PROJECT), + query_params={"projection": "full"}, + timeout=7.5, + ) + + +def test_list_jobs_w_time_filter(client, PROJECT): + conn = client._connection = make_connection({}) + + # One millisecond after the unix epoch. 
+ start_time = datetime.datetime(1970, 1, 1, 0, 0, 0, 1000) + # One millisecond after the 2038 32-bit signed int rollover + end_time = datetime.datetime(2038, 1, 19, 3, 14, 7, 1000) + end_time_millis = (((2 ** 31) - 1) * 1000) + 1 + + list(client.list_jobs(min_creation_time=start_time, max_creation_time=end_time)) + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/%s/jobs" % PROJECT, + query_params={ + "projection": "full", + "minCreationTime": "1", + "maxCreationTime": str(end_time_millis), + }, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_list_jobs_w_parent_job_filter(client, PROJECT): + from google.cloud.bigquery import job + + conn = client._connection = make_connection({}, {}) + + parent_job_args = ["parent-job-123", job._AsyncJob("parent-job-123", client)] + + for parent_job in parent_job_args: + list(client.list_jobs(parent_job=parent_job)) + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/%s/jobs" % PROJECT, + query_params={"projection": "full", "parentJobId": "parent-job-123"}, + timeout=DEFAULT_TIMEOUT, + ) + conn.api_request.reset_mock() diff --git a/tests/unit/test_list_models.py b/tests/unit/test_list_models.py new file mode 100644 index 000000000..b14852338 --- /dev/null +++ b/tests/unit/test_list_models.py @@ -0,0 +1,93 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import pytest + +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from .helpers import make_connection, dataset_polymorphic + + +def test_list_models_empty_w_timeout(client, PROJECT, DS_ID): + path = "/projects/{}/datasets/{}/models".format(PROJECT, DS_ID) + conn = client._connection = make_connection({}) + + dataset_id = "{}.{}".format(PROJECT, DS_ID) + iterator = client.list_models(dataset_id, timeout=7.5) + page = next(iterator.pages) + models = list(page) + token = iterator.next_page_token + + assert models == [] + assert token is None + conn.api_request.assert_called_once_with( + method="GET", path=path, query_params={}, timeout=7.5 + ) + + +@pytest.mark.parametrize( + "extra,query", [({}, {}), (dict(page_size=42), dict(maxResults=42))] +) +@dataset_polymorphic +def test_list_models_defaults( + make_dataset, get_reference, client, PROJECT, DS_ID, extra, query, +): + from google.cloud.bigquery.model import Model + + MODEL_1 = "model_one" + MODEL_2 = "model_two" + PATH = "projects/%s/datasets/%s/models" % (PROJECT, DS_ID) + TOKEN = "TOKEN" + DATA = { + "nextPageToken": TOKEN, + "models": [ + { + "modelReference": { + "modelId": MODEL_1, + "datasetId": DS_ID, + "projectId": PROJECT, + } + }, + { + "modelReference": { + "modelId": MODEL_2, + "datasetId": DS_ID, + "projectId": PROJECT, + } + }, + ], + } + + conn = client._connection = make_connection(DATA) + dataset = make_dataset(PROJECT, DS_ID) + + iterator = client.list_models(dataset, **extra) + assert iterator.dataset == get_reference(dataset) + page = next(iterator.pages) + models = list(page) + token = iterator.next_page_token + + assert len(models) == len(DATA["models"]) + for found, expected in zip(models, DATA["models"]): + assert isinstance(found, Model) + assert found.model_id == expected["modelReference"]["modelId"] + assert token == TOKEN + + conn.api_request.assert_called_once_with( + method="GET", path="/%s" % PATH, query_params=query, timeout=DEFAULT_TIMEOUT + ) + + +def 
test_list_models_wrong_type(client): + with pytest.raises(TypeError): + client.list_models(42) diff --git a/tests/unit/test_list_projects.py b/tests/unit/test_list_projects.py new file mode 100644 index 000000000..190612b44 --- /dev/null +++ b/tests/unit/test_list_projects.py @@ -0,0 +1,120 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mock +import pytest + +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from .helpers import make_connection + + +@pytest.mark.parametrize( + "extra,query", [({}, {}), (dict(page_size=42), dict(maxResults=42))] +) +def test_list_projects_defaults(client, PROJECT, extra, query): + from google.cloud.bigquery.client import Project + + PROJECT_2 = "PROJECT_TWO" + TOKEN = "TOKEN" + DATA = { + "nextPageToken": TOKEN, + "projects": [ + { + "kind": "bigquery#project", + "id": PROJECT, + "numericId": 1, + "projectReference": {"projectId": PROJECT}, + "friendlyName": "One", + }, + { + "kind": "bigquery#project", + "id": PROJECT_2, + "numericId": 2, + "projectReference": {"projectId": PROJECT_2}, + "friendlyName": "Two", + }, + ], + } + conn = client._connection = make_connection(DATA) + iterator = client.list_projects(**extra) + + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + page = next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/projects"}, client, None) + projects = list(page) + token = 
iterator.next_page_token + + assert len(projects) == len(DATA["projects"]) + for found, expected in zip(projects, DATA["projects"]): + assert isinstance(found, Project) + assert found.project_id == expected["id"] + assert found.numeric_id == expected["numericId"] + assert found.friendly_name == expected["friendlyName"] + assert token == TOKEN + + conn.api_request.assert_called_once_with( + method="GET", path="/projects", query_params=query, timeout=DEFAULT_TIMEOUT + ) + + +def test_list_projects_w_timeout(client): + TOKEN = "TOKEN" + DATA = { + "nextPageToken": TOKEN, + "projects": [], + } + conn = client._connection = make_connection(DATA) + + iterator = client.list_projects(timeout=7.5) + + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/projects"}, client, None) + + conn.api_request.assert_called_once_with( + method="GET", path="/projects", query_params={}, timeout=7.5 + ) + + +def test_list_projects_explicit_response_missing_projects_key(client): + TOKEN = "TOKEN" + DATA = {} + conn = client._connection = make_connection(DATA) + + iterator = client.list_projects(max_results=3, page_token=TOKEN) + + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + page = next(iterator.pages) + + final_attributes.assert_called_once_with({"path": "/projects"}, client, None) + projects = list(page) + token = iterator.next_page_token + + assert len(projects) == 0 + assert token is None + + conn.api_request.assert_called_once_with( + method="GET", + path="/projects", + query_params={"maxResults": 3, "pageToken": TOKEN}, + timeout=DEFAULT_TIMEOUT, + ) diff --git a/tests/unit/test_list_routines.py b/tests/unit/test_list_routines.py new file mode 100644 index 000000000..80e62d6bd --- /dev/null +++ b/tests/unit/test_list_routines.py @@ -0,0 +1,96 @@ +# Copyright 2021 Google 
LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from .helpers import make_connection, dataset_polymorphic + + +def test_list_routines_empty_w_timeout(client): + conn = client._connection = make_connection({}) + + iterator = client.list_routines("test-routines.test_routines", timeout=7.5) + page = next(iterator.pages) + routines = list(page) + token = iterator.next_page_token + + assert routines == [] + assert token is None + conn.api_request.assert_called_once_with( + method="GET", + path="/projects/test-routines/datasets/test_routines/routines", + query_params={}, + timeout=7.5, + ) + + +@pytest.mark.parametrize( + "extra,query", [({}, {}), (dict(page_size=42), dict(maxResults=42))] +) +@dataset_polymorphic +def test_list_routines_defaults( + make_dataset, get_reference, client, PROJECT, extra, query +): + from google.cloud.bigquery.routine import Routine + + project_id = PROJECT + dataset_id = "test_routines" + path = f"/projects/{PROJECT}/datasets/test_routines/routines" + routine_1 = "routine_one" + routine_2 = "routine_two" + token = "TOKEN" + resource = { + "nextPageToken": token, + "routines": [ + { + "routineReference": { + "routineId": routine_1, + "datasetId": dataset_id, + "projectId": project_id, + } + }, + { + "routineReference": { + "routineId": routine_2, + "datasetId": dataset_id, + "projectId": project_id, + } + }, + ], + } + + conn = client._connection = 
make_connection(resource) + dataset = make_dataset(client.project, dataset_id) + + iterator = client.list_routines(dataset, **extra) + assert iterator.dataset == get_reference(dataset) + page = next(iterator.pages) + routines = list(page) + actual_token = iterator.next_page_token + + assert len(routines) == len(resource["routines"]) + for found, expected in zip(routines, resource["routines"]): + assert isinstance(found, Routine) + assert found.routine_id == expected["routineReference"]["routineId"] + assert actual_token == token + + conn.api_request.assert_called_once_with( + method="GET", path=path, query_params=query, timeout=DEFAULT_TIMEOUT + ) + + +def test_list_routines_wrong_type(client): + with pytest.raises(TypeError): + client.list_routines(42) diff --git a/tests/unit/test_list_tables.py b/tests/unit/test_list_tables.py new file mode 100644 index 000000000..8360f6605 --- /dev/null +++ b/tests/unit/test_list_tables.py @@ -0,0 +1,180 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import google.cloud.bigquery.dataset +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from .helpers import make_connection, dataset_polymorphic + + +@dataset_polymorphic +def test_list_tables_empty_w_timeout( + make_dataset, get_reference, client, PROJECT, DS_ID +): + path = "/projects/{}/datasets/{}/tables".format(PROJECT, DS_ID) + conn = client._connection = make_connection({}) + + dataset = make_dataset(PROJECT, DS_ID) + iterator = client.list_tables(dataset, timeout=7.5) + assert iterator.dataset == get_reference(dataset) + page = next(iterator.pages) + tables = list(page) + token = iterator.next_page_token + + assert tables == [] + assert token is None + conn.api_request.assert_called_once_with( + method="GET", path=path, query_params={}, timeout=7.5 + ) + + +@dataset_polymorphic +def test_list_tables_defaults(make_dataset, get_reference, client, PROJECT, DS_ID): + from google.cloud.bigquery.table import TableListItem + + TABLE_1 = "table_one" + TABLE_2 = "table_two" + PATH = "projects/%s/datasets/%s/tables" % (PROJECT, DS_ID) + TOKEN = "TOKEN" + DATA = { + "nextPageToken": TOKEN, + "tables": [ + { + "kind": "bigquery#table", + "id": "%s:%s.%s" % (PROJECT, DS_ID, TABLE_1), + "tableReference": { + "tableId": TABLE_1, + "datasetId": DS_ID, + "projectId": PROJECT, + }, + "type": "TABLE", + }, + { + "kind": "bigquery#table", + "id": "%s:%s.%s" % (PROJECT, DS_ID, TABLE_2), + "tableReference": { + "tableId": TABLE_2, + "datasetId": DS_ID, + "projectId": PROJECT, + }, + "type": "TABLE", + }, + ], + } + + conn = client._connection = make_connection(DATA) + dataset = make_dataset(PROJECT, DS_ID) + + iterator = client.list_tables(dataset) + assert iterator.dataset == get_reference(dataset) + page = next(iterator.pages) + tables = list(page) + token = iterator.next_page_token + + assert len(tables) == len(DATA["tables"]) + for found, expected in zip(tables, DATA["tables"]): + assert isinstance(found, TableListItem) + assert 
found.full_table_id == expected["id"] + assert found.table_type == expected["type"] + assert token == TOKEN + + conn.api_request.assert_called_once_with( + method="GET", path="/%s" % PATH, query_params={}, timeout=DEFAULT_TIMEOUT + ) + + +def test_list_tables_explicit(client, PROJECT, DS_ID): + from google.cloud.bigquery.table import TableListItem + + TABLE_1 = "table_one" + TABLE_2 = "table_two" + PATH = "projects/%s/datasets/%s/tables" % (PROJECT, DS_ID) + TOKEN = "TOKEN" + DATA = { + "tables": [ + { + "kind": "bigquery#dataset", + "id": "%s:%s.%s" % (PROJECT, DS_ID, TABLE_1), + "tableReference": { + "tableId": TABLE_1, + "datasetId": DS_ID, + "projectId": PROJECT, + }, + "type": "TABLE", + }, + { + "kind": "bigquery#dataset", + "id": "%s:%s.%s" % (PROJECT, DS_ID, TABLE_2), + "tableReference": { + "tableId": TABLE_2, + "datasetId": DS_ID, + "projectId": PROJECT, + }, + "type": "TABLE", + }, + ] + } + + conn = client._connection = make_connection(DATA) + dataset = google.cloud.bigquery.dataset.DatasetReference(PROJECT, DS_ID) + + iterator = client.list_tables( + # Test with string for dataset ID. 
+ DS_ID, + max_results=3, + page_token=TOKEN, + ) + assert iterator.dataset == dataset + page = next(iterator.pages) + tables = list(page) + token = iterator.next_page_token + + assert len(tables) == len(DATA["tables"]) + for found, expected in zip(tables, DATA["tables"]): + assert isinstance(found, TableListItem) + assert found.full_table_id == expected["id"] + assert found.table_type == expected["type"] + assert token is None + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % PATH, + query_params={"maxResults": 3, "pageToken": TOKEN}, + timeout=DEFAULT_TIMEOUT, + ) + + +def test_list_tables_wrong_type(client): + with pytest.raises(TypeError): + client.list_tables(42) + + +@dataset_polymorphic +def test_list_tables_page_size(make_dataset, get_reference, client, PROJECT, DS_ID): + path = "/projects/{}/datasets/{}/tables".format(PROJECT, DS_ID) + conn = client._connection = make_connection({}) + + dataset = make_dataset(PROJECT, DS_ID) + iterator = client.list_tables(dataset, timeout=7.5, page_size=42) + assert iterator.dataset == get_reference(dataset) + page = next(iterator.pages) + tables = list(page) + token = iterator.next_page_token + + assert tables == [] + assert token is None + conn.api_request.assert_called_once_with( + method="GET", path=path, query_params=dict(maxResults=42), timeout=7.5 + ) diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index c4527c837..36cbf4993 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -19,7 +19,6 @@ import mock import pytest -import six try: import pandas @@ -33,6 +32,7 @@ from google.cloud.bigquery import job from google.cloud.bigquery import table from google.cloud.bigquery.magics import magics +from google.cloud.bigquery.retry import DEFAULT_TIMEOUT from tests.unit.helpers import make_connection from test_utils.imports import maybe_fail_import @@ -41,7 +41,7 @@ io = pytest.importorskip("IPython.utils.io") tools = 
pytest.importorskip("IPython.testing.tools") interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") -bigquery_storage_v1 = pytest.importorskip("google.cloud.bigquery_storage_v1") +bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") @pytest.fixture(scope="session") @@ -83,8 +83,8 @@ def missing_bq_storage(): def fail_if(name, globals, locals, fromlist, level): # NOTE: *very* simplified, assuming a straightforward absolute import - return "bigquery_storage_v1" in name or ( - fromlist is not None and "bigquery_storage_v1" in fromlist + return "bigquery_storage" in name or ( + fromlist is not None and "bigquery_storage" in fromlist ) return maybe_fail_import(predicate=fail_if) @@ -101,27 +101,38 @@ def fail_if(name, globals, locals, fromlist, level): return maybe_fail_import(predicate=fail_if) -JOB_REFERENCE_RESOURCE = {"projectId": "its-a-project-eh", "jobId": "some-random-id"} +PROJECT_ID = "its-a-project-eh" +JOB_ID = "some-random-id" +JOB_REFERENCE_RESOURCE = {"projectId": PROJECT_ID, "jobId": JOB_ID} +DATASET_ID = "dest_dataset" +TABLE_ID = "dest_table" TABLE_REFERENCE_RESOURCE = { - "projectId": "its-a-project-eh", - "datasetId": "ds", - "tableId": "persons", + "projectId": PROJECT_ID, + "datasetId": DATASET_ID, + "tableId": TABLE_ID, } +QUERY_STRING = "SELECT 42 AS the_answer FROM `life.the_universe.and_everything`;" QUERY_RESOURCE = { "jobReference": JOB_REFERENCE_RESOURCE, "configuration": { "query": { "destinationTable": TABLE_REFERENCE_RESOURCE, - "query": "SELECT 42 FROM `life.the_universe.and_everything`;", + "query": QUERY_STRING, "queryParameters": [], "useLegacySql": False, } }, "status": {"state": "DONE"}, } +QUERY_RESULTS_RESOURCE = { + "jobReference": JOB_REFERENCE_RESOURCE, + "totalRows": 1, + "jobComplete": True, + "schema": {"fields": [{"name": "the_answer", "type": "INTEGER"}]}, +} -def test_context_credentials_auto_set_w_application_default_credentials(): +def test_context_with_default_credentials(): 
"""When Application Default Credentials are set, the context credentials will be created the first time it is called """ @@ -142,6 +153,50 @@ def test_context_credentials_auto_set_w_application_default_credentials(): assert default_mock.call_count == 2 +@pytest.mark.usefixtures("ipython_interactive") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_context_with_default_connection(): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._credentials = None + magics.context._project = None + magics.context._connection = None + + default_credentials = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + credentials_patch = mock.patch( + "google.auth.default", return_value=(default_credentials, "project-from-env") + ) + default_conn = make_connection(QUERY_RESOURCE, QUERY_RESULTS_RESOURCE) + conn_patch = mock.patch("google.cloud.bigquery.client.Connection", autospec=True) + list_rows_patch = mock.patch( + "google.cloud.bigquery.client.Client._list_rows_from_query_results", + return_value=google.cloud.bigquery.table._EmptyRowIterator(), + ) + + with conn_patch as conn, credentials_patch, list_rows_patch as list_rows: + conn.return_value = default_conn + ip.run_cell_magic("bigquery", "", QUERY_STRING) + + # Check that query actually starts the job. 
+ conn.assert_called() + list_rows.assert_called() + begin_call = mock.call( + method="POST", + path="/projects/project-from-env/jobs", + data=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT_ID}/queries/{JOB_ID}", + query_params=mock.ANY, + timeout=mock.ANY, + ) + default_conn.api_request.assert_has_calls([begin_call, query_results_call]) + + def test_context_credentials_and_project_can_be_set_explicitly(): project1 = "one-project-55564" project2 = "other-project-52569" @@ -163,93 +218,47 @@ def test_context_credentials_and_project_can_be_set_explicitly(): @pytest.mark.usefixtures("ipython_interactive") @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_context_connection_can_be_overriden(): +def test_context_with_custom_connection(): ip = IPython.get_ipython() ip.extension_manager.load_extension("google.cloud.bigquery") magics.context._project = None magics.context._credentials = None - - credentials_mock = mock.create_autospec( - google.auth.credentials.Credentials, instance=True - ) - project = "project-123" - default_patch = mock.patch( - "google.auth.default", return_value=(credentials_mock, project) - ) - job_reference = copy.deepcopy(JOB_REFERENCE_RESOURCE) - job_reference["projectId"] = project - - query = "select * from persons" - resource = copy.deepcopy(QUERY_RESOURCE) - resource["jobReference"] = job_reference - resource["configuration"]["query"]["query"] = query - data = {"jobReference": job_reference, "totalRows": 0, "rows": []} - - conn = magics.context._connection = make_connection(resource, data) - list_rows_patch = mock.patch( - "google.cloud.bigquery.client.Client.list_rows", - return_value=google.cloud.bigquery.table._EmptyRowIterator(), + context_conn = magics.context._connection = make_connection( + QUERY_RESOURCE, QUERY_RESULTS_RESOURCE ) - with list_rows_patch as list_rows, default_patch: - ip.run_cell_magic("bigquery", "", query) - - # Check that 
query actually starts the job. - list_rows.assert_called() - assert len(conn.api_request.call_args_list) == 2 - _, req = conn.api_request.call_args_list[0] - assert req["method"] == "POST" - assert req["path"] == "/projects/{}/jobs".format(project) - sent = req["data"] - assert isinstance(sent["jobReference"]["jobId"], six.string_types) - sent_config = sent["configuration"]["query"] - assert sent_config["query"] == query - - -@pytest.mark.usefixtures("ipython_interactive") -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_context_no_connection(): - ip = IPython.get_ipython() - ip.extension_manager.load_extension("google.cloud.bigquery") - magics.context._project = None - magics.context._credentials = None - magics.context._connection = None - credentials_mock = mock.create_autospec( + default_credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - project = "project-123" - default_patch = mock.patch( - "google.auth.default", return_value=(credentials_mock, project) + credentials_patch = mock.patch( + "google.auth.default", return_value=(default_credentials, "project-from-env") ) - job_reference = copy.deepcopy(JOB_REFERENCE_RESOURCE) - job_reference["projectId"] = project - - query = "select * from persons" - resource = copy.deepcopy(QUERY_RESOURCE) - resource["jobReference"] = job_reference - resource["configuration"]["query"]["query"] = query - data = {"jobReference": job_reference, "totalRows": 0, "rows": []} - - conn_mock = make_connection(resource, data, data, data) + default_conn = make_connection() conn_patch = mock.patch("google.cloud.bigquery.client.Connection", autospec=True) list_rows_patch = mock.patch( - "google.cloud.bigquery.client.Client.list_rows", + "google.cloud.bigquery.client.Client._list_rows_from_query_results", return_value=google.cloud.bigquery.table._EmptyRowIterator(), ) - with conn_patch as conn, list_rows_patch as list_rows, default_patch: - conn.return_value = conn_mock - 
ip.run_cell_magic("bigquery", "", query) - # Check that query actually starts the job. + with conn_patch as conn, credentials_patch, list_rows_patch as list_rows: + conn.return_value = default_conn + ip.run_cell_magic("bigquery", "", QUERY_STRING) + list_rows.assert_called() - assert len(conn_mock.api_request.call_args_list) == 2 - _, req = conn_mock.api_request.call_args_list[0] - assert req["method"] == "POST" - assert req["path"] == "/projects/{}/jobs".format(project) - sent = req["data"] - assert isinstance(sent["jobReference"]["jobId"], six.string_types) - sent_config = sent["configuration"]["query"] - assert sent_config["query"] == query + default_conn.api_request.assert_not_called() + begin_call = mock.call( + method="POST", + path="/projects/project-from-env/jobs", + data=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT_ID}/queries/{JOB_ID}", + query_params=mock.ANY, + timeout=mock.ANY, + ) + context_conn.api_request.assert_has_calls([begin_call, query_results_call]) def test__run_query(): @@ -309,28 +318,37 @@ def test__make_bqstorage_client_false(): credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - got = magics._make_bqstorage_client(False, credentials_mock) + test_client = bigquery.Client( + project="test_project", credentials=credentials_mock, location="test_location" + ) + got = magics._make_bqstorage_client(test_client, False, {}) assert got is None @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) def test__make_bqstorage_client_true(): credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - got = magics._make_bqstorage_client(True, credentials_mock) - assert isinstance(got, bigquery_storage_v1.BigQueryReadClient) + test_client = bigquery.Client( + 
project="test_project", credentials=credentials_mock, location="test_location" + ) + got = magics._make_bqstorage_client(test_client, True, {}) + assert isinstance(got, bigquery_storage.BigQueryReadClient) def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage): credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + test_client = bigquery.Client( + project="test_project", credentials=credentials_mock, location="test_location" + ) with pytest.raises(ImportError) as exc_context, missing_bq_storage: - magics._make_bqstorage_client(True, credentials_mock) + magics._make_bqstorage_client(test_client, True, {}) error_msg = str(exc_context.value) assert "google-cloud-bigquery-storage" in error_msg @@ -338,7 +356,35 @@ def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage): @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) +def test__make_bqstorage_client_true_obsolete_dependency(): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + credentials_mock = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + test_client = bigquery.Client( + project="test_project", credentials=credentials_mock, location="test_location" + ) + + patcher = mock.patch( + "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + ) + with patcher, warnings.catch_warnings(record=True) as warned: + got = magics._make_bqstorage_client(test_client, True, {}) + + assert got is None + + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." 
+ + +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test__make_bqstorage_client_true_missing_gapic(missing_grpcio_lib): @@ -347,7 +393,7 @@ def test__make_bqstorage_client_true_missing_gapic(missing_grpcio_lib): ) with pytest.raises(ImportError) as exc_context, missing_grpcio_lib: - magics._make_bqstorage_client(True, credentials_mock) + magics._make_bqstorage_client(True, credentials_mock, {}) assert "grpcio" in str(exc_context.value) @@ -396,7 +442,7 @@ def test_extension_load(): @pytest.mark.usefixtures("ipython_interactive") @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) def test_bigquery_magic_without_optional_arguments(monkeypatch): ip = IPython.get_ipython() @@ -410,14 +456,14 @@ def test_bigquery_magic_without_optional_arguments(monkeypatch): monkeypatch.setattr(magics.context, "_credentials", mock_credentials) # Mock out the BigQuery Storage API. 
- bqstorage_mock = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_mock = mock.create_autospec(bigquery_storage.BigQueryReadClient) bqstorage_instance_mock = mock.create_autospec( - bigquery_storage_v1.BigQueryReadClient, instance=True + bigquery_storage.BigQueryReadClient, instance=True ) - bqstorage_instance_mock.transport = mock.Mock() + bqstorage_instance_mock._transport = mock.Mock() bqstorage_mock.return_value = bqstorage_instance_mock bqstorage_client_patch = mock.patch( - "google.cloud.bigquery_storage_v1.BigQueryReadClient", bqstorage_mock + "google.cloud.bigquery_storage.BigQueryReadClient", bqstorage_mock ) sql = "SELECT 17 AS num" @@ -559,7 +605,7 @@ def test_bigquery_magic_clears_display_in_verbose_mode(): @pytest.mark.usefixtures("ipython_interactive") @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) def test_bigquery_magic_with_bqstorage_from_argument(monkeypatch): ip = IPython.get_ipython() @@ -573,14 +619,14 @@ def test_bigquery_magic_with_bqstorage_from_argument(monkeypatch): monkeypatch.setattr(magics.context, "_credentials", mock_credentials) # Mock out the BigQuery Storage API. 
- bqstorage_mock = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_mock = mock.create_autospec(bigquery_storage.BigQueryReadClient) bqstorage_instance_mock = mock.create_autospec( - bigquery_storage_v1.BigQueryReadClient, instance=True + bigquery_storage.BigQueryReadClient, instance=True ) - bqstorage_instance_mock.transport = mock.Mock() + bqstorage_instance_mock._transport = mock.Mock() bqstorage_mock.return_value = bqstorage_instance_mock bqstorage_client_patch = mock.patch( - "google.cloud.bigquery_storage_v1.BigQueryReadClient", bqstorage_mock + "google.cloud.bigquery_storage.BigQueryReadClient", bqstorage_mock ) sql = "SELECT 17 AS num" @@ -615,7 +661,9 @@ def warning_match(warning): assert client_info.user_agent == "ipython-" + IPython.__version__ query_job_mock.to_dataframe.assert_called_once_with( - bqstorage_client=bqstorage_instance_mock + bqstorage_client=bqstorage_instance_mock, + create_bqstorage_client=mock.ANY, + progress_bar_type="tqdm", ) assert isinstance(return_value, pandas.DataFrame) @@ -623,7 +671,7 @@ def warning_match(warning): @pytest.mark.usefixtures("ipython_interactive") @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) def test_bigquery_magic_with_rest_client_requested(monkeypatch): ip = IPython.get_ipython() @@ -637,9 +685,9 @@ def test_bigquery_magic_with_rest_client_requested(monkeypatch): monkeypatch.setattr(magics.context, "_credentials", mock_credentials) # Mock out the BigQuery Storage API. 
- bqstorage_mock = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_mock = mock.create_autospec(bigquery_storage.BigQueryReadClient) bqstorage_client_patch = mock.patch( - "google.cloud.bigquery_storage_v1.BigQueryReadClient", bqstorage_mock + "google.cloud.bigquery_storage.BigQueryReadClient", bqstorage_mock ) sql = "SELECT 17 AS num" @@ -657,7 +705,11 @@ def test_bigquery_magic_with_rest_client_requested(monkeypatch): return_value = ip.run_cell_magic("bigquery", "--use_rest_api", sql) bqstorage_mock.assert_not_called() - query_job_mock.to_dataframe.assert_called_once_with(bqstorage_client=None) + query_job_mock.to_dataframe.assert_called_once_with( + bqstorage_client=None, + create_bqstorage_client=False, + progress_bar_type="tqdm", + ) assert isinstance(return_value, pandas.DataFrame) @@ -710,7 +762,12 @@ def test_bigquery_magic_w_max_results_valid_calls_queryjob_result(): client_query_mock.return_value = query_job_mock ip.run_cell_magic("bigquery", "--max_results=5", sql) - query_job_mock.result.assert_called_with(max_results=5) + query_job_mock.result.assert_called_with(max_results=5) + query_job_mock.result.return_value.to_dataframe.assert_called_once_with( + bqstorage_client=None, + create_bqstorage_client=False, + progress_bar_type=mock.ANY, + ) @pytest.mark.usefixtures("ipython_interactive") @@ -841,7 +898,7 @@ def test_bigquery_magic_w_table_id_and_destination_var(ipython_ns_cleanup): @pytest.mark.usefixtures("ipython_interactive") @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_bigquery_magic_w_table_id_and_bqstorage_client(): @@ -864,24 +921,25 @@ def test_bigquery_magic_w_table_id_and_bqstorage_client(): "google.cloud.bigquery.magics.magics.bigquery.Client", autospec=True ) - bqstorage_mock = 
mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_mock = mock.create_autospec(bigquery_storage.BigQueryReadClient) bqstorage_instance_mock = mock.create_autospec( - bigquery_storage_v1.BigQueryReadClient, instance=True + bigquery_storage.BigQueryReadClient, instance=True ) - bqstorage_instance_mock.transport = mock.Mock() + bqstorage_instance_mock._transport = mock.Mock() bqstorage_mock.return_value = bqstorage_instance_mock bqstorage_client_patch = mock.patch( - "google.cloud.bigquery_storage_v1.BigQueryReadClient", bqstorage_mock + "google.cloud.bigquery_storage.BigQueryReadClient", bqstorage_mock ) table_id = "bigquery-public-data.samples.shakespeare" with default_patch, client_patch as client_mock, bqstorage_client_patch: + client_mock()._ensure_bqstorage_client.return_value = bqstorage_instance_mock client_mock().list_rows.return_value = row_iterator_mock ip.run_cell_magic("bigquery", "--max_results=5", table_id) row_iterator_mock.to_dataframe.assert_called_once_with( - bqstorage_client=bqstorage_instance_mock + bqstorage_client=bqstorage_instance_mock, create_bqstorage_client=mock.ANY, ) @@ -1060,6 +1118,7 @@ def test_bigquery_magic_w_maximum_bytes_billed_overrides_context(param_value, ex resource = copy.deepcopy(QUERY_RESOURCE) resource["jobReference"] = job_reference resource["configuration"]["query"]["query"] = query + query_results = {"jobReference": job_reference, "totalRows": 0, "jobComplete": True} data = {"jobReference": job_reference, "totalRows": 0, "rows": []} credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True @@ -1067,9 +1126,9 @@ def test_bigquery_magic_w_maximum_bytes_billed_overrides_context(param_value, ex default_patch = mock.patch( "google.auth.default", return_value=(credentials_mock, "general-project") ) - conn = magics.context._connection = make_connection(resource, data) + conn = magics.context._connection = make_connection(resource, query_results, data) list_rows_patch = 
mock.patch( - "google.cloud.bigquery.client.Client.list_rows", + "google.cloud.bigquery.client.Client._list_rows_from_query_results", return_value=google.cloud.bigquery.table._EmptyRowIterator(), ) with list_rows_patch, default_patch: @@ -1098,6 +1157,7 @@ def test_bigquery_magic_w_maximum_bytes_billed_w_context_inplace(): resource = copy.deepcopy(QUERY_RESOURCE) resource["jobReference"] = job_reference resource["configuration"]["query"]["query"] = query + query_results = {"jobReference": job_reference, "totalRows": 0, "jobComplete": True} data = {"jobReference": job_reference, "totalRows": 0, "rows": []} credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True @@ -1105,9 +1165,9 @@ def test_bigquery_magic_w_maximum_bytes_billed_w_context_inplace(): default_patch = mock.patch( "google.auth.default", return_value=(credentials_mock, "general-project") ) - conn = magics.context._connection = make_connection(resource, data) + conn = magics.context._connection = make_connection(resource, query_results, data) list_rows_patch = mock.patch( - "google.cloud.bigquery.client.Client.list_rows", + "google.cloud.bigquery.client.Client._list_rows_from_query_results", return_value=google.cloud.bigquery.table._EmptyRowIterator(), ) with list_rows_patch, default_patch: @@ -1136,6 +1196,7 @@ def test_bigquery_magic_w_maximum_bytes_billed_w_context_setter(): resource = copy.deepcopy(QUERY_RESOURCE) resource["jobReference"] = job_reference resource["configuration"]["query"]["query"] = query + query_results = {"jobReference": job_reference, "totalRows": 0, "jobComplete": True} data = {"jobReference": job_reference, "totalRows": 0, "rows": []} credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True @@ -1143,9 +1204,9 @@ def test_bigquery_magic_w_maximum_bytes_billed_w_context_setter(): default_patch = mock.patch( "google.auth.default", return_value=(credentials_mock, "general-project") ) - conn = 
magics.context._connection = make_connection(resource, data) + conn = magics.context._connection = make_connection(resource, query_results, data) list_rows_patch = mock.patch( - "google.cloud.bigquery.client.Client.list_rows", + "google.cloud.bigquery.client.Client._list_rows_from_query_results", return_value=google.cloud.bigquery.table._EmptyRowIterator(), ) with list_rows_patch, default_patch: @@ -1156,6 +1217,73 @@ def test_bigquery_magic_w_maximum_bytes_billed_w_context_setter(): assert sent_config["maximumBytesBilled"] == "10203" +@pytest.mark.usefixtures("ipython_interactive") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_bigquery_magic_w_progress_bar_type_w_context_setter(monkeypatch): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._project = None + + magics.context.progress_bar_type = "tqdm_gui" + + mock_credentials = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + + # Set up the context with monkeypatch so that it's reset for subsequent + # tests. + monkeypatch.setattr(magics.context, "_credentials", mock_credentials) + + # Mock out the BigQuery Storage API. 
+ bqstorage_mock = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client_patch = mock.patch( + "google.cloud.bigquery_storage.BigQueryReadClient", bqstorage_mock + ) + + sql = "SELECT 17 AS num" + result = pandas.DataFrame([17], columns=["num"]) + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + query_job_mock = mock.create_autospec( + google.cloud.bigquery.job.QueryJob, instance=True + ) + query_job_mock.to_dataframe.return_value = result + with run_query_patch as run_query_mock, bqstorage_client_patch: + run_query_mock.return_value = query_job_mock + + return_value = ip.run_cell_magic("bigquery", "--use_rest_api", sql) + + bqstorage_mock.assert_not_called() + query_job_mock.to_dataframe.assert_called_once_with( + bqstorage_client=None, + create_bqstorage_client=False, + progress_bar_type=magics.context.progress_bar_type, + ) + + assert isinstance(return_value, pandas.DataFrame) + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_progress_bar_type(): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context.progress_bar_type = None + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", "--progress_bar_type=tqdm_gui", "SELECT 17 as num" + ) + + progress_bar_used = run_query_mock.mock_calls[1][2]["progress_bar_type"] + assert progress_bar_used == "tqdm_gui" + # context progress bar type should not change + assert magics.context.progress_bar_type is None + + @pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_with_project(): ip = IPython.get_ipython() @@ -1180,6 +1308,96 @@ def test_bigquery_magic_with_project(): assert magics.context.project == "general-project" +@pytest.mark.usefixtures("ipython_interactive") +def 
test_bigquery_magic_with_bigquery_api_endpoint(ipython_ns_cleanup): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bigquery_api_endpoint=https://bigquery_api.endpoint.com", + "SELECT 17 as num", + ) + + connection_used = run_query_mock.call_args_list[0][0][0]._connection + assert connection_used.API_BASE_URL == "https://bigquery_api.endpoint.com" + # context client options should not change + assert magics.context.bigquery_client_options.api_endpoint is None + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bigquery_api_endpoint_context_dict(): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + magics.context.bigquery_client_options = {} + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bigquery_api_endpoint=https://bigquery_api.endpoint.com", + "SELECT 17 as num", + ) + + connection_used = run_query_mock.call_args_list[0][0][0]._connection + assert connection_used.API_BASE_URL == "https://bigquery_api.endpoint.com" + # context client options should not change + assert magics.context.bigquery_client_options == {} + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bqstorage_api_endpoint(ipython_ns_cleanup): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + 
"--bqstorage_api_endpoint=https://bqstorage_api.endpoint.com", + "SELECT 17 as num", + ) + + client_used = run_query_mock.mock_calls[1][2]["bqstorage_client"] + assert client_used._transport._host == "https://bqstorage_api.endpoint.com" + # context client options should not change + assert magics.context.bqstorage_client_options.api_endpoint is None + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bqstorage_api_endpoint_context_dict(): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + magics.context.bqstorage_client_options = {} + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bqstorage_api_endpoint=https://bqstorage_api.endpoint.com", + "SELECT 17 as num", + ) + + client_used = run_query_mock.mock_calls[1][2]["bqstorage_client"] + assert client_used._transport._host == "https://bqstorage_api.endpoint.com" + # context client options should not change + assert magics.context.bqstorage_client_options == {} + + @pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_with_multiple_options(): ip = IPython.get_ipython() diff --git a/tests/unit/test_opentelemetry_tracing.py b/tests/unit/test_opentelemetry_tracing.py index 1c35b0a82..726e3cf6f 100644 --- a/tests/unit/test_opentelemetry_tracing.py +++ b/tests/unit/test_opentelemetry_tracing.py @@ -13,6 +13,7 @@ # limitations under the License. 
import datetime +import importlib import sys import mock @@ -25,10 +26,9 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( InMemorySpanExporter, ) -except ImportError: +except ImportError: # pragma: NO COVER opentelemetry = None import pytest -from six.moves import reload_module from google.cloud.bigquery import opentelemetry_tracing @@ -39,7 +39,7 @@ @pytest.mark.skipif(opentelemetry is None, reason="Require `opentelemetry`") @pytest.fixture def setup(): - reload_module(opentelemetry_tracing) + importlib.reload(opentelemetry_tracing) tracer_provider = TracerProvider() memory_exporter = InMemorySpanExporter() span_processor = SimpleExportSpanProcessor(memory_exporter) @@ -51,9 +51,21 @@ def setup(): @pytest.mark.skipif(opentelemetry is None, reason="Require `opentelemetry`") def test_opentelemetry_not_installed(setup, monkeypatch): monkeypatch.setitem(sys.modules, "opentelemetry", None) - reload_module(opentelemetry_tracing) + importlib.reload(opentelemetry_tracing) + assert not opentelemetry_tracing._warned_telemetry with opentelemetry_tracing.create_span("No-op for opentelemetry") as span: assert span is None + assert opentelemetry_tracing._warned_telemetry + + +@pytest.mark.skipif(opentelemetry is None, reason="Require `opentelemetry`") +def test_opentelemetry_not_installed_doesnt_warn(setup, monkeypatch): + monkeypatch.setitem(sys.modules, "opentelemetry", None) + importlib.reload(opentelemetry_tracing) + opentelemetry_tracing._warned_telemetry = True + with opentelemetry_tracing.create_span("No-op for opentelemetry") as span: + assert span is None + assert opentelemetry_tracing._warned_telemetry @pytest.mark.skipif(opentelemetry is None, reason="Require `opentelemetry`") diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index a7c639ed1..69a6772e5 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -13,6 +13,7 @@ # limitations under the License. 
import datetime +import decimal import unittest import mock @@ -43,6 +44,338 @@ def test___eq__(self): self.assertNotEqual(udf, wrong_type) +class Test__AbstractQueryParameterType(unittest.TestCase): + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.query import _AbstractQueryParameterType + + return _AbstractQueryParameterType + + @classmethod + def _make_one(cls, *args, **kw): + return cls._get_target_class()(*args, **kw) + + def test_from_api_virtual(self): + klass = self._get_target_class() + with self.assertRaises(NotImplementedError): + klass.from_api_repr({}) + + def test_to_api_virtual(self): + param_type = self._make_one() + with self.assertRaises(NotImplementedError): + param_type.to_api_repr() + + +class Test_ScalarQueryParameterType(unittest.TestCase): + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.query import ScalarQueryParameterType + + return ScalarQueryParameterType + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def test_from_api_repr(self): + klass = self._get_target_class() + result = klass.from_api_repr({"type": "BOOLEAN"}) + self.assertEqual(result._type, "BOOLEAN") + self.assertIsNone(result.name) + self.assertIsNone(result.description) + + def test_to_api_repr(self): + param_type = self._make_one("BYTES", name="foo", description="bar") + result = param_type.to_api_repr() + self.assertEqual(result, {"type": "BYTES"}) + + def test_repr_no_optional_attrs(self): + param_type = self._make_one("BYTES") + self.assertEqual(repr(param_type), "ScalarQueryParameterType('BYTES')") + + def test_repr_all_optional_attrs(self): + param_type = self._make_one("BYTES", name="foo", description="this is foo") + self.assertEqual( + repr(param_type), + "ScalarQueryParameterType('BYTES', name='foo', description='this is foo')", + ) + + def test_with_name_returns_copy_w_changed_name(self): + param_type = self._make_one("BOOLEAN", name=None, description="Some checkbox.") + 
modified_type = param_type.with_name("allow_emails") + + self.assertIsNot(modified_type, param_type) # Result is a copy. + self.assertEqual(modified_type.name, "allow_emails") + + # The rest of the fields should have been preserved. + self.assertEqual(modified_type._type, param_type._type) + self.assertEqual(modified_type.description, param_type.description) + + def test_with_name_clearing_the_value(self): + param_type = self._make_one( + "BOOLEAN", name="allow_emails", description="Some checkbox." + ) + modified_type = param_type.with_name(None) + + self.assertIsNone(modified_type.name) + self.assertEqual(param_type.name, "allow_emails") # original unchanged + + +class Test_ArrayQueryParameterType(unittest.TestCase): + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.query import ArrayQueryParameterType + + return ArrayQueryParameterType + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def test_from_api_repr(self): + from google.cloud.bigquery.query import StructQueryParameterType + + api_resource = { + "type": "ARRAY", + "arrayType": { + "type": "STRUCT", + "structTypes": [ + { + "name": "weight", + "type": {"type": "INTEGER"}, + "description": "in kg", + }, + {"name": "last_name", "type": {"type": "STRING"}}, + ], + }, + } + + klass = self._get_target_class() + result = klass.from_api_repr(api_resource) + + self.assertIsNone(result.name) + self.assertIsNone(result.description) + item_type = result._array_type + self.assertIsInstance(item_type, StructQueryParameterType) + + self.assertIsNone(item_type.name) + self.assertIsNone(item_type.description) + + field = item_type.fields[0] + self.assertEqual(field.name, "weight") + self.assertEqual(field.description, "in kg") + self.assertEqual(field._type, "INTEGER") + + field = item_type.fields[1] + self.assertEqual(field.name, "last_name") + self.assertIsNone(field.description) + self.assertEqual(field._type, "STRING") + + def
test_to_api_repr(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + from google.cloud.bigquery.query import StructQueryParameterType + + array_item_type = StructQueryParameterType( + ScalarQueryParameterType("INTEGER", name="weight", description="in kg"), + ScalarQueryParameterType("STRING", name="last_name"), + ) + param_type = self._make_one(array_item_type, name="foo", description="bar") + + result = param_type.to_api_repr() + + expected_result = { + "type": "ARRAY", + "arrayType": { + "type": "STRUCT", + "structTypes": [ + { + "name": "weight", + "type": {"type": "INTEGER"}, + "description": "in kg", + }, + {"name": "last_name", "type": {"type": "STRING"}}, + ], + }, + } + self.assertEqual(result, expected_result) + + def test_repr_no_optional_attrs(self): + param_type = self._make_one("BOOLEAN") + self.assertEqual(repr(param_type), "ArrayQueryParameterType('BOOLEAN')") + + def test_repr_all_optional_attrs(self): + param_type = self._make_one("INT64", name="bar", description="this is bar") + self.assertEqual( + repr(param_type), + "ArrayQueryParameterType('INT64', name='bar', description='this is bar')", + ) + + +class Test_StructQueryParameterType(unittest.TestCase): + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.query import StructQueryParameterType + + return StructQueryParameterType + + def _make_one(self, *args, **kw): + return self._get_target_class()(*args, **kw) + + def test_raises_error_without_any_fields(self): + with self.assertRaisesRegex(ValueError, ".*at least one field.*"): + self._make_one() + + def test_from_api_repr(self): + from google.cloud.bigquery.query import ArrayQueryParameterType + from google.cloud.bigquery.query import ScalarQueryParameterType + + api_resource = { + "type": "STRUCT", + "structTypes": [ + { + "name": "age", + "type": {"type": "INTEGER"}, + "description": "in years", + }, + { + "name": "aliases", + "type": {"type": "ARRAY", "arrayType": {"type": "STRING"}}, + }, + { + 
"description": "a nested struct", + "type": { + "type": "STRUCT", + "structTypes": [ + {"type": {"type": "DATE"}, "name": "nested_date"}, + { + "type": {"type": "BOOLEAN"}, + "description": "nested bool field", + }, + ], + }, + }, + ], + } + + klass = self._get_target_class() + result = klass.from_api_repr(api_resource) + + self.assertIsNone(result.name) + self.assertIsNone(result.description) + self.assertEqual(len(result.fields), 3) + + field = result.fields[0] + self.assertIsInstance(field, ScalarQueryParameterType) + self.assertEqual(field.name, "age") + self.assertEqual(field.description, "in years") + + field = result.fields[1] + self.assertIsInstance(field, ArrayQueryParameterType) + self.assertEqual(field.name, "aliases") + self.assertIsNone(field.description) + self.assertIsInstance(field._array_type, ScalarQueryParameterType) + self.assertEqual(field._array_type._type, "STRING") + + field = result.fields[2] + self.assertIsInstance(field, self._get_target_class()) + self.assertIsNone(field.name) + self.assertEqual(field.description, "a nested struct") + + date_field = field.fields[0] + self.assertEqual(date_field._type, "DATE") + self.assertEqual(date_field.name, "nested_date") + self.assertIsNone(date_field.description) + + bool_field = field.fields[1] + self.assertEqual(bool_field._type, "BOOLEAN") + self.assertIsNone(bool_field.name) + self.assertEqual(bool_field.description, "nested bool field") + + def test_to_api_repr(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + + int_type = ScalarQueryParameterType("INTEGER", description="in years") + date_type = ScalarQueryParameterType("DATE", name="day_of_birth") + param_type = self._make_one(int_type, date_type, name="foo", description="bar") + + result = param_type.to_api_repr() + + expected_result = { + "type": "STRUCT", + "structTypes": [ + {"type": {"type": "INTEGER"}, "description": "in years"}, + {"name": "day_of_birth", "type": {"type": "DATE"}}, + ], + } + 
self.assertEqual(result, expected_result) + + def test_to_api_repr_nested(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + + struct_class = self._get_target_class() + + int_type = ScalarQueryParameterType("INTEGER", description="in years") + nested_struct_type = struct_class( + ScalarQueryParameterType("DATE", name="nested_date"), + ScalarQueryParameterType("BOOLEAN", description="nested bool field"), + name="nested", + ) + param_type = self._make_one( + int_type, nested_struct_type, name="foo", description="bar" + ) + + result = param_type.to_api_repr() + + expected_result = { + "type": "STRUCT", + "structTypes": [ + {"type": {"type": "INTEGER"}, "description": "in years"}, + { + "name": "nested", + "type": { + "type": "STRUCT", + "structTypes": [ + {"type": {"type": "DATE"}, "name": "nested_date"}, + { + "type": {"type": "BOOLEAN"}, + "description": "nested bool field", + }, + ], + }, + }, + ], + } + self.assertEqual(result, expected_result) + + def test_repr_no_optional_attrs(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + + param_type = self._make_one( + ScalarQueryParameterType("BOOLEAN"), ScalarQueryParameterType("STRING") + ) + expected = ( + "StructQueryParameterType(" + "ScalarQueryParameterType('BOOLEAN'), ScalarQueryParameterType('STRING')" + ")" + ) + self.assertEqual(repr(param_type), expected) + + def test_repr_all_optional_attrs(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + + param_type = self._make_one( + ScalarQueryParameterType("BOOLEAN"), + ScalarQueryParameterType("STRING"), + name="data_record", + description="this is it", + ) + expected = ( + "StructQueryParameterType(" + "ScalarQueryParameterType('BOOLEAN'), ScalarQueryParameterType('STRING'), " + "name='data_record', description='this is it'" + ")" + ) + self.assertEqual(repr(param_type), expected) + + class Test__AbstractQueryParameter(unittest.TestCase): @staticmethod def _get_target_class(): @@ -98,6 
+431,18 @@ def test_positional(self): self.assertEqual(param.type_, "INT64") self.assertEqual(param.value, 123) + def test_ctor_w_scalar_query_parameter_type(self): + from google.cloud.bigquery import enums + + param = self._make_one( + name="foo", + type_=enums.SqlParameterScalarTypes.BIGNUMERIC, + value=decimal.Decimal("123.456"), + ) + self.assertEqual(param.name, "foo") + self.assertEqual(param.type_, "BIGNUMERIC") + self.assertEqual(param.value, decimal.Decimal("123.456")) + def test_from_api_repr_w_name(self): RESOURCE = { "name": "foo", @@ -166,6 +511,16 @@ def test_to_api_repr_w_numeric(self): param = klass.positional(type_="NUMERIC", value="123456789.123456789") self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_w_bignumeric(self): + big_num_string = "{d38}.{d38}".format(d38="9" * 38) + EXPECTED = { + "parameterType": {"type": "BIGNUMERIC"}, + "parameterValue": {"value": big_num_string}, + } + klass = self._get_target_class() + param = klass.positional(type_="BIGNUMERIC", value=big_num_string) + self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_w_bool(self): EXPECTED = { "parameterType": {"type": "BOOL"}, @@ -330,6 +685,10 @@ def test_ctor(self): self.assertEqual(param.array_type, "INT64") self.assertEqual(param.values, [1, 2]) + def test_ctor_empty_struct_array_wo_type_info(self): + with self.assertRaisesRegex(ValueError, r"(?i)missing.*struct.*type info.*"): + self._make_one(name="foo", array_type="STRUCT", values=[]) + def test___eq__(self): param = self._make_one(name="foo", array_type="INT64", values=[123]) self.assertEqual(param, param) @@ -383,6 +742,16 @@ def test_from_api_repr_wo_values(self): self.assertEqual(param.array_type, "INT64") self.assertEqual(param.values, []) + def test_from_api_repr_w_none_values(self): + RESOURCE = { + "parameterType": {"type": "ARRAY", "arrayType": {"type": "INT64"}}, + "parameterValue": {"arrayValues": [{"value": "1"}, {"value": None}]}, + } + klass = 
self._get_target_class() + param = klass.from_api_repr(RESOURCE) + self.assertEqual(param.array_type, "INT64") + self.assertEqual(param.values, [1, None]) + def test_from_api_repr_w_struct_type(self): from google.cloud.bigquery.query import StructQueryParameter @@ -447,6 +816,19 @@ def test_to_api_repr_wo_name(self): param = klass.positional(array_type="INT64", values=[1, 2]) self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_array_type_as_type_instance(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + + EXPECTED = { + "parameterType": {"type": "ARRAY", "arrayType": {"type": "BOOLEAN"}}, + "parameterValue": {"arrayValues": [{"value": "true"}, {"value": "false"}]}, + } + klass = self._get_target_class() + param = klass.positional( + array_type=ScalarQueryParameterType("BOOLEAN"), values=[True, False], + ) + self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_w_unknown_type(self): EXPECTED = { "parameterType": {"type": "ARRAY", "arrayType": {"type": "UNKNOWN"}}, @@ -483,6 +865,31 @@ def test_to_api_repr_w_record_type(self): param = klass.positional(array_type="RECORD", values=[struct]) self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_w_empty_array_of_records_type(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + from google.cloud.bigquery.query import StructQueryParameterType + + EXPECTED = { + "parameterType": { + "type": "ARRAY", + "arrayType": { + "type": "STRUCT", + "structTypes": [ + {"name": "foo", "type": {"type": "STRING"}}, + {"name": "bar", "type": {"type": "INT64"}}, + ], + }, + }, + "parameterValue": {"arrayValues": []}, + } + item_type = StructQueryParameterType( + ScalarQueryParameterType("STRING", name="foo"), + ScalarQueryParameterType("INT64", name="bar"), + ) + klass = self._get_target_class() + param = klass.positional(array_type=item_type, values=[]) + self.assertEqual(param.to_api_repr(), EXPECTED) + def test___eq___wrong_type(self): 
field = self._make_one("test", "STRING", ["value"]) other = object() @@ -527,11 +934,38 @@ def test___ne___different_values(self): field2 = self._make_one("test", "INT64", [12]) self.assertNotEqual(field1, field2) - def test___repr__(self): + def test___repr__array_type_str(self): field1 = self._make_one("field1", "STRING", ["value"]) expected = "ArrayQueryParameter('field1', 'STRING', ['value'])" self.assertEqual(repr(field1), expected) + def test___repr__array_type_scalar_type_instance(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + + int_items = self._make_one( + "int_items", ScalarQueryParameterType("INTEGER"), [64] + ) + expected = "ArrayQueryParameter('int_items', 'INTEGER', [64])" + self.assertEqual(repr(int_items), expected) + + def test___repr__array_type_struct_type_instance(self): + from google.cloud.bigquery.query import ScalarQueryParameterType + from google.cloud.bigquery.query import StructQueryParameterType + + struct_items = self._make_one( + "struct_items", + StructQueryParameterType( + ScalarQueryParameterType("INTEGER", name="age"), + ScalarQueryParameterType("STRING", name="last_name"), + ), + [{"age": 18, "last_name": "Doe"}], + ) + expected = ( + "ArrayQueryParameter('struct_items', 'STRUCT', " + "[{'age': 18, 'last_name': 'Doe'}])" + ) + self.assertEqual(repr(struct_items), expected) + class Test_StructQueryParameter(unittest.TestCase): @staticmethod @@ -881,7 +1315,7 @@ def _verifySchema(self, query, resource): self.assertEqual(found.description, expected.get("description")) self.assertEqual(found.fields, expected.get("fields", ())) else: - self.assertEqual(query.schema, ()) + self.assertEqual(query.schema, []) def test_ctor_defaults(self): query = self._make_one(self._make_resource()) @@ -891,7 +1325,7 @@ def test_ctor_defaults(self): self.assertIsNone(query.page_token) self.assertEqual(query.project, self.PROJECT) self.assertEqual(query.rows, []) - self.assertEqual(query.schema, ()) + 
self.assertEqual(query.schema, []) self.assertIsNone(query.total_rows) self.assertIsNone(query.total_bytes_processed) diff --git a/tests/unit/test_retry.py b/tests/unit/test_retry.py index d9f867cb3..e0a992f78 100644 --- a/tests/unit/test_retry.py +++ b/tests/unit/test_retry.py @@ -15,6 +15,7 @@ import unittest import mock +import requests.exceptions class Test_should_retry(unittest.TestCase): @@ -42,6 +43,36 @@ def test_w_rateLimitExceeded(self): exc = mock.Mock(errors=[{"reason": "rateLimitExceeded"}], spec=["errors"]) self.assertTrue(self._call_fut(exc)) + def test_w_unstructured_connectionerror(self): + exc = ConnectionError() + self.assertTrue(self._call_fut(exc)) + + def test_w_unstructured_requests_connectionerror(self): + exc = requests.exceptions.ConnectionError() + self.assertTrue(self._call_fut(exc)) + + def test_w_unstructured_requests_chunked_encoding_error(self): + exc = requests.exceptions.ChunkedEncodingError() + self.assertTrue(self._call_fut(exc)) + + def test_w_unstructured_requests_connecttimeout(self): + exc = requests.exceptions.ConnectTimeout() + self.assertTrue(self._call_fut(exc)) + + def test_w_unstructured_requests_readtimeout(self): + exc = requests.exceptions.ReadTimeout() + self.assertTrue(self._call_fut(exc)) + + def test_w_unstructured_requests_timeout(self): + exc = requests.exceptions.Timeout() + self.assertTrue(self._call_fut(exc)) + + def test_w_auth_transporterror(self): + from google.auth.exceptions import TransportError + + exc = TransportError("testing") + self.assertTrue(self._call_fut(exc)) + def test_w_unstructured_too_many_requests(self): from google.api_core.exceptions import TooManyRequests @@ -67,3 +98,27 @@ def test_w_unstructured_bad_gateway(self): exc = BadGateway("testing") self.assertTrue(self._call_fut(exc)) + + +def test_DEFAULT_JOB_RETRY_predicate(): + from google.cloud.bigquery.retry import DEFAULT_JOB_RETRY + from google.api_core.exceptions import ClientError + + assert not 
DEFAULT_JOB_RETRY._predicate(TypeError()) + assert not DEFAULT_JOB_RETRY._predicate(ClientError("fail")) + assert not DEFAULT_JOB_RETRY._predicate( + ClientError("fail", errors=[dict(reason="idk")]) + ) + + assert DEFAULT_JOB_RETRY._predicate( + ClientError("fail", errors=[dict(reason="rateLimitExceeded")]) + ) + assert DEFAULT_JOB_RETRY._predicate( + ClientError("fail", errors=[dict(reason="backendError")]) + ) + + +def test_DEFAULT_JOB_RETRY_deadline(): + from google.cloud.bigquery.retry import DEFAULT_JOB_RETRY + + assert DEFAULT_JOB_RETRY._deadline == 600 diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 9f7ee7bb3..d0b5ca54c 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from google.cloud.bigquery.schema import PolicyTagList import unittest import mock +import pytest class TestSchemaField(unittest.TestCase): @@ -35,19 +37,20 @@ def _make_one(self, *args, **kw): def test_constructor_defaults(self): field = self._make_one("test", "STRING") - self.assertEqual(field._name, "test") - self.assertEqual(field._field_type, "STRING") - self.assertEqual(field._mode, "NULLABLE") - self.assertIsNone(field._description) - self.assertEqual(field._fields, ()) + self.assertEqual(field.name, "test") + self.assertEqual(field.field_type, "STRING") + self.assertEqual(field.mode, "NULLABLE") + self.assertIsNone(field.description) + self.assertEqual(field.fields, ()) + self.assertEqual(field.policy_tags, PolicyTagList()) def test_constructor_explicit(self): field = self._make_one("test", "STRING", mode="REQUIRED", description="Testing") - self.assertEqual(field._name, "test") - self.assertEqual(field._field_type, "STRING") - self.assertEqual(field._mode, "REQUIRED") - self.assertEqual(field._description, "Testing") - self.assertEqual(field._fields, ()) + self.assertEqual(field.name, "test") + 
self.assertEqual(field.field_type, "STRING") + self.assertEqual(field.mode, "REQUIRED") + self.assertEqual(field.description, "Testing") + self.assertEqual(field.fields, ()) def test_constructor_subfields(self): sub_field1 = self._make_one("area_code", "STRING") @@ -55,13 +58,13 @@ def test_constructor_subfields(self): field = self._make_one( "phone_number", "RECORD", fields=[sub_field1, sub_field2] ) - self.assertEqual(field._name, "phone_number") - self.assertEqual(field._field_type, "RECORD") - self.assertEqual(field._mode, "NULLABLE") - self.assertIsNone(field._description) - self.assertEqual(len(field._fields), 2) - self.assertIs(field._fields[0], sub_field1) - self.assertIs(field._fields[1], sub_field2) + self.assertEqual(field.name, "phone_number") + self.assertEqual(field.field_type, "RECORD") + self.assertEqual(field.mode, "NULLABLE") + self.assertIsNone(field.description) + self.assertEqual(len(field.fields), 2) + self.assertEqual(field.fields[0], sub_field1) + self.assertEqual(field.fields[1], sub_field2) def test_constructor_with_policy_tags(self): from google.cloud.bigquery.schema import PolicyTagList @@ -70,12 +73,12 @@ def test_constructor_with_policy_tags(self): field = self._make_one( "test", "STRING", mode="REQUIRED", description="Testing", policy_tags=policy ) - self.assertEqual(field._name, "test") - self.assertEqual(field._field_type, "STRING") - self.assertEqual(field._mode, "REQUIRED") - self.assertEqual(field._description, "Testing") - self.assertEqual(field._fields, ()) - self.assertEqual(field._policy_tags, policy) + self.assertEqual(field.name, "test") + self.assertEqual(field.field_type, "STRING") + self.assertEqual(field.mode, "REQUIRED") + self.assertEqual(field.description, "Testing") + self.assertEqual(field.fields, ()) + self.assertEqual(field.policy_tags, policy) def test_to_api_repr(self): from google.cloud.bigquery.schema import PolicyTagList @@ -92,7 +95,6 @@ def test_to_api_repr(self): "mode": "NULLABLE", "name": "foo", "type": 
"INTEGER", - "description": None, "policyTags": {"names": ["foo", "bar"]}, }, ) @@ -109,13 +111,12 @@ def test_to_api_repr_with_subfield(self): "mode": "NULLABLE", "name": "bar", "type": "INTEGER", - "description": None, + "policyTags": {"names": []}, } ], "mode": "REQUIRED", "name": "foo", "type": record_type, - "description": None, }, ) @@ -168,17 +169,17 @@ def test_from_api_repr_defaults(self): def test_name_property(self): name = "lemon-ness" schema_field = self._make_one(name, "INTEGER") - self.assertIs(schema_field.name, name) + self.assertEqual(schema_field.name, name) def test_field_type_property(self): field_type = "BOOLEAN" schema_field = self._make_one("whether", field_type) - self.assertIs(schema_field.field_type, field_type) + self.assertEqual(schema_field.field_type, field_type) def test_mode_property(self): mode = "REPEATED" schema_field = self._make_one("again", "FLOAT", mode=mode) - self.assertIs(schema_field.mode, mode) + self.assertEqual(schema_field.mode, mode) def test_is_nullable(self): mode = "NULLABLE" @@ -193,28 +194,28 @@ def test_is_not_nullable(self): def test_description_property(self): description = "It holds some data." 
schema_field = self._make_one("do", "TIMESTAMP", description=description) - self.assertIs(schema_field.description, description) + self.assertEqual(schema_field.description, description) def test_fields_property(self): sub_field1 = self._make_one("one", "STRING") sub_field2 = self._make_one("fish", "INTEGER") fields = (sub_field1, sub_field2) schema_field = self._make_one("boat", "RECORD", fields=fields) - self.assertIs(schema_field.fields, fields) + self.assertEqual(schema_field.fields, fields) def test_to_standard_sql_simple_type(self): sql_type = self._get_standard_sql_data_type_class() examples = ( # a few legacy types - ("INTEGER", sql_type.INT64), - ("FLOAT", sql_type.FLOAT64), - ("BOOLEAN", sql_type.BOOL), - ("DATETIME", sql_type.DATETIME), + ("INTEGER", sql_type.TypeKind.INT64), + ("FLOAT", sql_type.TypeKind.FLOAT64), + ("BOOLEAN", sql_type.TypeKind.BOOL), + ("DATETIME", sql_type.TypeKind.DATETIME), # a few standard types - ("INT64", sql_type.INT64), - ("FLOAT64", sql_type.FLOAT64), - ("BOOL", sql_type.BOOL), - ("GEOGRAPHY", sql_type.GEOGRAPHY), + ("INT64", sql_type.TypeKind.INT64), + ("FLOAT64", sql_type.TypeKind.FLOAT64), + ("BOOL", sql_type.TypeKind.BOOL), + ("GEOGRAPHY", sql_type.TypeKind.GEOGRAPHY), ) for legacy_type, standard_type in examples: field = self._make_one("some_field", legacy_type) @@ -258,26 +259,26 @@ def test_to_standard_sql_struct_type(self): # level 2 fields sub_sub_field_date = types.StandardSqlField( - name="date_field", type=sql_type(type_kind=sql_type.DATE) + name="date_field", type=sql_type(type_kind=sql_type.TypeKind.DATE) ) sub_sub_field_time = types.StandardSqlField( - name="time_field", type=sql_type(type_kind=sql_type.TIME) + name="time_field", type=sql_type(type_kind=sql_type.TypeKind.TIME) ) # level 1 fields sub_field_struct = types.StandardSqlField( - name="last_used", type=sql_type(type_kind=sql_type.STRUCT) + name="last_used", type=sql_type(type_kind=sql_type.TypeKind.STRUCT) ) 
sub_field_struct.type.struct_type.fields.extend( [sub_sub_field_date, sub_sub_field_time] ) sub_field_bytes = types.StandardSqlField( - name="image_content", type=sql_type(type_kind=sql_type.BYTES) + name="image_content", type=sql_type(type_kind=sql_type.TypeKind.BYTES) ) # level 0 (top level) expected_result = types.StandardSqlField( - name="image_usage", type=sql_type(type_kind=sql_type.STRUCT) + name="image_usage", type=sql_type(type_kind=sql_type.TypeKind.STRUCT) ) expected_result.type.struct_type.fields.extend( [sub_field_bytes, sub_field_struct] @@ -304,8 +305,8 @@ def test_to_standard_sql_array_type_simple(self): sql_type = self._get_standard_sql_data_type_class() # construct expected result object - expected_sql_type = sql_type(type_kind=sql_type.ARRAY) - expected_sql_type.array_element_type.type_kind = sql_type.INT64 + expected_sql_type = sql_type(type_kind=sql_type.TypeKind.ARRAY) + expected_sql_type.array_element_type.type_kind = sql_type.TypeKind.INT64 expected_result = types.StandardSqlField( name="valid_numbers", type=expected_sql_type ) @@ -323,19 +324,19 @@ def test_to_standard_sql_array_type_struct(self): # define person STRUCT name_field = types.StandardSqlField( - name="name", type=sql_type(type_kind=sql_type.STRING) + name="name", type=sql_type(type_kind=sql_type.TypeKind.STRING) ) age_field = types.StandardSqlField( - name="age", type=sql_type(type_kind=sql_type.INT64) + name="age", type=sql_type(type_kind=sql_type.TypeKind.INT64) ) person_struct = types.StandardSqlField( - name="person_info", type=sql_type(type_kind=sql_type.STRUCT) + name="person_info", type=sql_type(type_kind=sql_type.TypeKind.STRUCT) ) person_struct.type.struct_type.fields.extend([name_field, age_field]) # define expected result - an ARRAY of person structs expected_sql_type = sql_type( - type_kind=sql_type.ARRAY, array_element_type=person_struct.type + type_kind=sql_type.TypeKind.ARRAY, array_element_type=person_struct.type ) expected_result = types.StandardSqlField( 
name="known_people", type=expected_sql_type @@ -358,7 +359,9 @@ def test_to_standard_sql_unknown_type(self): standard_field = field.to_standard_sql() self.assertEqual(standard_field.name, "weird_field") - self.assertEqual(standard_field.type.type_kind, sql_type.TYPE_KIND_UNSPECIFIED) + self.assertEqual( + standard_field.type.type_kind, sql_type.TypeKind.TYPE_KIND_UNSPECIFIED + ) def test___eq___wrong_type(self): field = self._make_one("test", "STRING") @@ -410,6 +413,23 @@ def test___eq___hit_w_fields(self): other = self._make_one("test", "RECORD", fields=[sub1, sub2]) self.assertEqual(field, other) + def test___eq___hit_w_policy_tags(self): + field = self._make_one( + "test", + "STRING", + mode="REQUIRED", + description="Testing", + policy_tags=PolicyTagList(names=["foo", "bar"]), + ) + other = self._make_one( + "test", + "STRING", + mode="REQUIRED", + description="Testing", + policy_tags=PolicyTagList(names=["bar", "foo"]), + ) + self.assertEqual(field, other) # Policy tags order does not matter. 
+ def test___ne___wrong_type(self): field = self._make_one("toast", "INTEGER") other = object() @@ -432,6 +452,23 @@ def test___ne___different_values(self): ) self.assertNotEqual(field1, field2) + def test___ne___different_policy_tags(self): + field = self._make_one( + "test", + "STRING", + mode="REQUIRED", + description="Testing", + policy_tags=PolicyTagList(names=["foo", "bar"]), + ) + other = self._make_one( + "test", + "STRING", + mode="REQUIRED", + description="Testing", + policy_tags=PolicyTagList(names=["foo", "baz"]), + ) + self.assertNotEqual(field, other) + def test___hash__set_equality(self): sub1 = self._make_one("sub1", "STRING") sub2 = self._make_one("sub2", "STRING") @@ -452,7 +489,7 @@ def test___hash__not_equals(self): def test___repr__(self): field1 = self._make_one("field1", "STRING") - expected = "SchemaField('field1', 'STRING', 'NULLABLE', None, (), None)" + expected = "SchemaField('field1', 'STRING', 'NULLABLE', None, (), ())" self.assertEqual(repr(field1), expected) @@ -535,12 +572,17 @@ def test_defaults(self): "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, ) self.assertEqual( resource[1], - {"name": "age", "type": "INTEGER", "mode": "REQUIRED", "description": None}, + { + "name": "age", + "type": "INTEGER", + "mode": "REQUIRED", + "policyTags": {"names": []}, + }, ) def test_w_description(self): @@ -550,7 +592,13 @@ def test_w_description(self): full_name = SchemaField( "full_name", "STRING", mode="REQUIRED", description=DESCRIPTION ) - age = SchemaField("age", "INTEGER", mode="REQUIRED") + age = SchemaField( + "age", + "INTEGER", + mode="REQUIRED", + # Explicitly unset description. 
+ description=None, + ) resource = self._call_fut([full_name, age]) self.assertEqual(len(resource), 2) self.assertEqual( @@ -560,11 +608,18 @@ def test_w_description(self): "type": "STRING", "mode": "REQUIRED", "description": DESCRIPTION, + "policyTags": {"names": []}, }, ) self.assertEqual( resource[1], - {"name": "age", "type": "INTEGER", "mode": "REQUIRED", "description": None}, + { + "name": "age", + "type": "INTEGER", + "mode": "REQUIRED", + "description": None, + "policyTags": {"names": []}, + }, ) def test_w_subfields(self): @@ -584,7 +639,7 @@ def test_w_subfields(self): "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, ) self.assertEqual( @@ -593,19 +648,18 @@ def test_w_subfields(self): "name": "phone", "type": "RECORD", "mode": "REPEATED", - "description": None, "fields": [ { "name": "type", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, { "name": "number", "type": "STRING", "mode": "REQUIRED", - "description": None, + "policyTags": {"names": []}, }, ], }, @@ -740,3 +794,165 @@ def test___hash__not_equals(self): set_one = {policy1} set_two = {policy2} self.assertNotEqual(set_one, set_two) + + +@pytest.mark.parametrize( + "api,expect,key2", + [ + ( + dict(name="n", type="NUMERIC"), + ("n", "NUMERIC", None, None, None), + ("n", "NUMERIC"), + ), + ( + dict(name="n", type="NUMERIC", precision=9), + ("n", "NUMERIC", 9, None, None), + ("n", "NUMERIC(9)"), + ), + ( + dict(name="n", type="NUMERIC", precision=9, scale=2), + ("n", "NUMERIC", 9, 2, None), + ("n", "NUMERIC(9, 2)"), + ), + ( + dict(name="n", type="BIGNUMERIC"), + ("n", "BIGNUMERIC", None, None, None), + ("n", "BIGNUMERIC"), + ), + ( + dict(name="n", type="BIGNUMERIC", precision=40), + ("n", "BIGNUMERIC", 40, None, None), + ("n", "BIGNUMERIC(40)"), + ), + ( + dict(name="n", type="BIGNUMERIC", precision=40, scale=2), + ("n", "BIGNUMERIC", 40, 2, None), + ("n", "BIGNUMERIC(40, 2)"), + ), + ( 
+ dict(name="n", type="STRING"), + ("n", "STRING", None, None, None), + ("n", "STRING"), + ), + ( + dict(name="n", type="STRING", maxLength=9), + ("n", "STRING", None, None, 9), + ("n", "STRING(9)"), + ), + ( + dict(name="n", type="BYTES"), + ("n", "BYTES", None, None, None), + ("n", "BYTES"), + ), + ( + dict(name="n", type="BYTES", maxLength=9), + ("n", "BYTES", None, None, 9), + ("n", "BYTES(9)"), + ), + ], +) +def test_from_api_repr_parameterized(api, expect, key2): + from google.cloud.bigquery.schema import SchemaField + + field = SchemaField.from_api_repr(api) + + assert ( + field.name, + field.field_type, + field.precision, + field.scale, + field.max_length, + ) == expect + + assert field._key()[:2] == key2 + + +@pytest.mark.parametrize( + "field,api", + [ + ( + dict(name="n", field_type="NUMERIC"), + dict(name="n", type="NUMERIC", mode="NULLABLE", policyTags={"names": []}), + ), + ( + dict(name="n", field_type="NUMERIC", precision=9), + dict( + name="n", + type="NUMERIC", + mode="NULLABLE", + precision=9, + policyTags={"names": []}, + ), + ), + ( + dict(name="n", field_type="NUMERIC", precision=9, scale=2), + dict( + name="n", + type="NUMERIC", + mode="NULLABLE", + precision=9, + scale=2, + policyTags={"names": []}, + ), + ), + ( + dict(name="n", field_type="BIGNUMERIC"), + dict( + name="n", type="BIGNUMERIC", mode="NULLABLE", policyTags={"names": []} + ), + ), + ( + dict(name="n", field_type="BIGNUMERIC", precision=40), + dict( + name="n", + type="BIGNUMERIC", + mode="NULLABLE", + precision=40, + policyTags={"names": []}, + ), + ), + ( + dict(name="n", field_type="BIGNUMERIC", precision=40, scale=2), + dict( + name="n", + type="BIGNUMERIC", + mode="NULLABLE", + precision=40, + scale=2, + policyTags={"names": []}, + ), + ), + ( + dict(name="n", field_type="STRING"), + dict(name="n", type="STRING", mode="NULLABLE", policyTags={"names": []}), + ), + ( + dict(name="n", field_type="STRING", max_length=9), + dict( + name="n", + type="STRING", + mode="NULLABLE", + 
maxLength=9, + policyTags={"names": []}, + ), + ), + ( + dict(name="n", field_type="BYTES"), + dict(name="n", type="BYTES", mode="NULLABLE", policyTags={"names": []}), + ), + ( + dict(name="n", field_type="BYTES", max_length=9), + dict( + name="n", + type="BYTES", + mode="NULLABLE", + maxLength=9, + policyTags={"names": []}, + ), + ), + ], +) +def test_to_api_repr_parameterized(field, api): + from google.cloud.bigquery.schema import SchemaField + + assert SchemaField(**field).to_api_repr() == api diff --git a/tests/unit/test_signature_compatibility.py b/tests/unit/test_signature_compatibility.py index 6002ae3e8..07b823e2c 100644 --- a/tests/unit/test_signature_compatibility.py +++ b/tests/unit/test_signature_compatibility.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict import inspect import pytest @@ -31,21 +32,31 @@ def row_iterator_class(): return RowIterator -@pytest.mark.skipif( - not hasattr(inspect, "signature"), - reason="inspect.signature() is not availalbe in older Python versions", -) def test_to_arrow_method_signatures_match(query_job_class, row_iterator_class): - sig = inspect.signature(query_job_class.to_arrow) - sig2 = inspect.signature(row_iterator_class.to_arrow) - assert sig == sig2 + query_job_sig = inspect.signature(query_job_class.to_arrow) + iterator_sig = inspect.signature(row_iterator_class.to_arrow) + + assert "max_results" in query_job_sig.parameters + + # Compare the signatures while ignoring the max_results parameter, which is + # specific to the method on QueryJob. 
+ params = OrderedDict(query_job_sig.parameters) + del params["max_results"] + query_job_sig = query_job_sig.replace(parameters=params.values()) + + assert query_job_sig == iterator_sig -@pytest.mark.skipif( - not hasattr(inspect, "signature"), - reason="inspect.signature() is not availalbe in older Python versions", -) def test_to_dataframe_method_signatures_match(query_job_class, row_iterator_class): - sig = inspect.signature(query_job_class.to_dataframe) - sig2 = inspect.signature(row_iterator_class.to_dataframe) - assert sig == sig2 + query_job_sig = inspect.signature(query_job_class.to_dataframe) + iterator_sig = inspect.signature(row_iterator_class.to_dataframe) + + assert "max_results" in query_job_sig.parameters + + # Compare the signatures while ignoring the max_results parameter, which is + # specific to the method on QueryJob. + params = OrderedDict(query_job_sig.parameters) + del params["max_results"] + query_job_sig = query_job_sig.replace(parameters=params.values()) + + assert query_job_sig == iterator_sig diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 10bedfee1..1ce930ee4 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -12,39 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import datetime as dt -import itertools +import datetime import logging +import re import time +import types import unittest import warnings import mock import pytest -import six import google.api_core.exceptions +from test_utils.imports import maybe_fail_import try: - from google.cloud import bigquery_storage_v1 - from google.cloud import bigquery_storage_v1beta1 - from google.cloud.bigquery_storage_v1.gapic.transports import ( - big_query_read_grpc_transport, - ) - from google.cloud.bigquery_storage_v1beta1.gapic.transports import ( - big_query_storage_grpc_transport as big_query_storage_grpc_transport_v1beta1, + from google.cloud import bigquery_storage + from google.cloud.bigquery_storage_v1.services.big_query_read.transports import ( + grpc as big_query_read_grpc_transport, ) except ImportError: # pragma: NO COVER - bigquery_storage_v1 = None - bigquery_storage_v1beta1 = None + bigquery_storage = None big_query_read_grpc_transport = None - big_query_storage_grpc_transport_v1beta1 = None try: import pandas except (ImportError, AttributeError): # pragma: NO COVER pandas = None +try: + import geopandas +except (ImportError, AttributeError): # pragma: NO COVER + geopandas = None + try: import pyarrow import pyarrow.types @@ -113,8 +113,6 @@ def _make_one(self, *args, **kw): return self._get_target_class()(*args, **kw) def test_ctor_defaults(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset_ref = DatasetReference("project_1", "dataset_1") table_ref = self._make_one(dataset_ref, "table_1") @@ -122,8 +120,6 @@ def test_ctor_defaults(self): self.assertEqual(table_ref.table_id, "table_1") def test_to_api_repr(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset_ref = DatasetReference("project_1", "dataset_1") table_ref = self._make_one(dataset_ref, "table_1") @@ -135,7 +131,6 @@ def test_to_api_repr(self): ) def test_from_api_repr(self): - from google.cloud.bigquery.dataset import DatasetReference from 
google.cloud.bigquery.table import TableReference dataset_ref = DatasetReference("project_1", "dataset_1") @@ -202,8 +197,6 @@ def test_from_string_ignores_default_project(self): self.assertEqual(got.table_id, "string_table") def test___eq___wrong_type(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset_ref = DatasetReference("project_1", "dataset_1") table = self._make_one(dataset_ref, "table_1") other = object() @@ -211,8 +204,6 @@ def test___eq___wrong_type(self): self.assertEqual(table, mock.ANY) def test___eq___project_mismatch(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset = DatasetReference("project_1", "dataset_1") other_dataset = DatasetReference("project_2", "dataset_1") table = self._make_one(dataset, "table_1") @@ -220,8 +211,6 @@ def test___eq___project_mismatch(self): self.assertNotEqual(table, other) def test___eq___dataset_mismatch(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset = DatasetReference("project_1", "dataset_1") other_dataset = DatasetReference("project_1", "dataset_2") table = self._make_one(dataset, "table_1") @@ -229,24 +218,18 @@ def test___eq___dataset_mismatch(self): self.assertNotEqual(table, other) def test___eq___table_mismatch(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset = DatasetReference("project_1", "dataset_1") table = self._make_one(dataset, "table_1") other = self._make_one(dataset, "table_2") self.assertNotEqual(table, other) def test___eq___equality(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset = DatasetReference("project_1", "dataset_1") table = self._make_one(dataset, "table_1") other = self._make_one(dataset, "table_1") self.assertEqual(table, other) def test___hash__set_equality(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset = DatasetReference("project_1", "dataset_1") table1 = self._make_one(dataset, "table1") table2 = 
self._make_one(dataset, "table2") @@ -255,8 +238,6 @@ def test___hash__set_equality(self): self.assertEqual(set_one, set_two) def test___hash__not_equals(self): - from google.cloud.bigquery.dataset import DatasetReference - dataset = DatasetReference("project_1", "dataset_1") table1 = self._make_one(dataset, "table1") table2 = self._make_one(dataset, "table2") @@ -272,6 +253,11 @@ def test___repr__(self): ) self.assertEqual(repr(table1), expected) + def test___str__(self): + dataset = DatasetReference("project1", "dataset1") + table1 = self._make_one(dataset, "table1") + self.assertEqual(str(table1), "project1.dataset1.table1") + class TestTable(unittest.TestCase, _SchemaBase): @@ -287,6 +273,11 @@ def _get_target_class(): return Table def _make_one(self, *args, **kw): + if len(args) == 0: + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + args = (table_ref,) + return self._get_target_class()(*args, **kw) def _setUpConstants(self): @@ -567,6 +558,68 @@ def test_num_rows_getter(self): with self.assertRaises(ValueError): getattr(table, "num_rows") + def test__eq__wrong_type(self): + table = self._make_one("project_foo.dataset_bar.table_baz") + + class TableWannabe: + pass + + not_a_table = TableWannabe() + not_a_table._properties = table._properties + + assert table != not_a_table # Can't fake it. 
+ + def test__eq__same_table_basic(self): + table_1 = self._make_one("project_foo.dataset_bar.table_baz") + table_2 = self._make_one("project_foo.dataset_bar.table_baz") + assert table_1 == table_2 + + def test__eq__same_table_multiple_properties(self): + from google.cloud.bigquery import SchemaField + + table_1 = self._make_one("project_foo.dataset_bar.table_baz") + table_1.require_partition_filter = True + table_1.labels = {"first": "one", "second": "two"} + + table_1.schema = [ + SchemaField("name", "STRING", "REQUIRED"), + SchemaField("age", "INTEGER", "NULLABLE"), + ] + + table_2 = self._make_one("project_foo.dataset_bar.table_baz") + table_2.require_partition_filter = True + table_2.labels = {"first": "one", "second": "two"} + table_2.schema = [ + SchemaField("name", "STRING", "REQUIRED"), + SchemaField("age", "INTEGER", "NULLABLE"), + ] + + assert table_1 == table_2 + + def test__eq__same_table_property_different(self): + table_1 = self._make_one("project_foo.dataset_bar.table_baz") + table_1.description = "This is table baz" + + table_2 = self._make_one("project_foo.dataset_bar.table_baz") + table_2.description = "This is also table baz" + + assert table_1 == table_2 # Still equal, only table reference is important. 
+ + def test__eq__different_table(self): + table_1 = self._make_one("project_foo.dataset_bar.table_baz") + table_2 = self._make_one("project_foo.dataset_bar.table_baz_2") + + assert table_1 != table_2 + + def test_hashable(self): + table_1 = self._make_one("project_foo.dataset_bar.table_baz") + table_1.description = "This is a table" + + table_1b = self._make_one("project_foo.dataset_bar.table_baz") + table_1b.description = "Metadata is irrelevant for hashes" + + assert hash(table_1) == hash(table_1b) + def test_schema_setter_non_sequence(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) table_ref = dataset.table(self.TABLE_NAME) @@ -671,6 +724,40 @@ def test_props_set_by_server(self): self.assertEqual(table.full_table_id, TABLE_FULL_ID) self.assertEqual(table.table_type, "TABLE") + def test_snapshot_definition_not_set(self): + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + assert table.snapshot_definition is None + + def test_snapshot_definition_set(self): + from google.cloud._helpers import UTC + from google.cloud.bigquery.table import SnapshotDefinition + + dataset = DatasetReference(self.PROJECT, self.DS_ID) + table_ref = dataset.table(self.TABLE_NAME) + table = self._make_one(table_ref) + + table._properties["snapshotDefinition"] = { + "baseTableReference": { + "projectId": "project_x", + "datasetId": "dataset_y", + "tableId": "table_z", + }, + "snapshotTime": "2010-09-28T10:20:30.123Z", + } + + snapshot = table.snapshot_definition + + assert isinstance(snapshot, SnapshotDefinition) + assert snapshot.base_table_reference.path == ( + "/projects/project_x/datasets/dataset_y/tables/table_z" + ) + assert snapshot.snapshot_time == datetime.datetime( + 2010, 9, 28, 10, 20, 30, 123000, tzinfo=UTC + ) + def test_description_setter_bad_value(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) table_ref = dataset.table(self.TABLE_NAME) @@ -807,12 +894,59 @@ def 
test_labels_setter_bad_value(self): with self.assertRaises(ValueError): table.labels = 12345 + def test_mview_query(self): + table = self._make_one() + self.assertIsNone(table.mview_query) + table.mview_query = "SELECT name, SUM(number) FROM dset.tbl GROUP BY 1" + self.assertEqual( + table.mview_query, "SELECT name, SUM(number) FROM dset.tbl GROUP BY 1" + ) + del table.mview_query + self.assertIsNone(table.mview_query) + + def test_mview_last_refresh_time(self): + table = self._make_one() + self.assertIsNone(table.mview_last_refresh_time) + table._properties["materializedView"] = { + "lastRefreshTime": "1606751842496", + } + self.assertEqual( + table.mview_last_refresh_time, + datetime.datetime( + 2020, 11, 30, 15, 57, 22, 496000, tzinfo=datetime.timezone.utc + ), + ) + + def test_mview_enable_refresh(self): + table = self._make_one() + self.assertIsNone(table.mview_enable_refresh) + table.mview_enable_refresh = True + self.assertTrue(table.mview_enable_refresh) + table.mview_enable_refresh = False + self.assertFalse(table.mview_enable_refresh) + table.mview_enable_refresh = None + self.assertIsNone(table.mview_enable_refresh) + + def test_mview_refresh_interval(self): + table = self._make_one() + self.assertIsNone(table.mview_refresh_interval) + table.mview_refresh_interval = datetime.timedelta(minutes=30) + self.assertEqual(table.mview_refresh_interval, datetime.timedelta(minutes=30)) + self.assertEqual( + table._properties["materializedView"]["refreshIntervalMs"], "1800000" + ) + table.mview_refresh_interval = None + self.assertIsNone(table.mview_refresh_interval) + def test_from_string(self): cls = self._get_target_class() got = cls.from_string("string-project.string_dataset.string_table") self.assertEqual(got.project, "string-project") self.assertEqual(got.dataset_id, "string_dataset") self.assertEqual(got.table_id, "string_table") + self.assertEqual( + str(got.reference), "string-project.string_dataset.string_table" + ) def 
test_from_string_legacy_string(self): cls = self._get_target_class() @@ -1153,8 +1287,8 @@ def test_clustering_fields_setter_w_none(self): table._properties["clustering"] = {"fields": fields} table.clustering_fields = None - self.assertEqual(table.clustering_fields, None) - self.assertFalse("clustering" in table._properties) + self.assertIsNone(table.clustering_fields) + self.assertTrue("clustering" in table._properties) # None stored explicitly def test_clustering_fields_setter_w_none_noop(self): dataset = DatasetReference(self.PROJECT, self.DS_ID) @@ -1162,8 +1296,8 @@ def test_clustering_fields_setter_w_none_noop(self): table = self._make_one(table_ref) table.clustering_fields = None - self.assertEqual(table.clustering_fields, None) - self.assertFalse("clustering" in table._properties) + self.assertIsNone(table.clustering_fields) + self.assertTrue("clustering" in table._properties) # None stored explicitly def test_encryption_configuration_setter(self): # Previously, the EncryptionConfiguration class was in the table module, not the @@ -1278,7 +1412,6 @@ def _make_one(self, *args, **kw): return self._get_target_class()(*args, **kw) def _setUpConstants(self): - import datetime from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.125 @@ -1440,6 +1573,199 @@ def test_labels_update_in_place(self): labels["foo"] = "bar" # update in place self.assertEqual(table.labels, {"foo": "bar"}) + def test_to_api_repr(self): + resource = { + "tableReference": { + "projectId": "testproject", + "datasetId": "testdataset", + "tableId": "testtable", + } + } + table = self._make_one(resource) + self.assertEqual(table.to_api_repr(), resource) + + def test__eq__wrong_type(self): + resource = { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + } + table = self._make_one(resource) + + class FakeTableListItem: + project = "project_foo" + dataset_id = "dataset_bar" + table_id = "table_baz" + + not_a_table = 
FakeTableListItem() + + assert table != not_a_table # Can't fake it. + + def test__eq__same_table(self): + resource = { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + } + table_1 = self._make_one(resource) + table_2 = self._make_one(resource) + + assert table_1 == table_2 + + def test__eq__same_table_property_different(self): + table_ref_resource = { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + + resource_1 = {"tableReference": table_ref_resource, "friendlyName": "Table One"} + table_1 = self._make_one(resource_1) + + resource_2 = {"tableReference": table_ref_resource, "friendlyName": "Table Two"} + table_2 = self._make_one(resource_2) + + assert table_1 == table_2 # Still equal, only table reference is important. + + def test__eq__different_table(self): + resource_1 = { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + } + table_1 = self._make_one(resource_1) + + resource_2 = { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_quux", + } + } + table_2 = self._make_one(resource_2) + + assert table_1 != table_2 + + def test_hashable(self): + resource = { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + } + table_item = self._make_one(resource) + table_item_2 = self._make_one(resource) + + assert hash(table_item) == hash(table_item_2) + + +class TestTableClassesInterchangeability: + @staticmethod + def _make_table(*args, **kwargs): + from google.cloud.bigquery.table import Table + + return Table(*args, **kwargs) + + @staticmethod + def _make_table_ref(*args, **kwargs): + from google.cloud.bigquery.table import TableReference + + return TableReference(*args, **kwargs) + + @staticmethod + def _make_table_list_item(*args, **kwargs): + from google.cloud.bigquery.table import 
TableListItem + + return TableListItem(*args, **kwargs) + + def test_table_eq_table_ref(self): + + table = self._make_table("project_foo.dataset_bar.table_baz") + dataset_ref = DatasetReference("project_foo", "dataset_bar") + table_ref = self._make_table_ref(dataset_ref, "table_baz") + + assert table == table_ref + assert table_ref == table + + def test_table_eq_table_list_item(self): + table = self._make_table("project_foo.dataset_bar.table_baz") + table_list_item = self._make_table_list_item( + { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + } + ) + + assert table == table_list_item + assert table_list_item == table + + def test_table_ref_eq_table_list_item(self): + + dataset_ref = DatasetReference("project_foo", "dataset_bar") + table_ref = self._make_table_ref(dataset_ref, "table_baz") + table_list_item = self._make_table_list_item( + { + "tableReference": { + "projectId": "project_foo", + "datasetId": "dataset_bar", + "tableId": "table_baz", + } + } + ) + + assert table_ref == table_list_item + assert table_list_item == table_ref + + +class TestSnapshotDefinition: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.table import SnapshotDefinition + + return SnapshotDefinition + + @classmethod + def _make_one(cls, *args, **kwargs): + klass = cls._get_target_class() + return klass(*args, **kwargs) + + def test_ctor_empty_resource(self): + instance = self._make_one(resource={}) + assert instance.base_table_reference is None + assert instance.snapshot_time is None + + def test_ctor_full_resource(self): + from google.cloud._helpers import UTC + from google.cloud.bigquery.table import TableReference + + resource = { + "baseTableReference": { + "projectId": "my-project", + "datasetId": "your-dataset", + "tableId": "our-table", + }, + "snapshotTime": "2005-06-07T19:35:02.123Z", + } + instance = self._make_one(resource) + + expected_table_ref = TableReference.from_string( + 
"my-project.your-dataset.our-table" + ) + assert instance.base_table_reference == expected_table_ref + + expected_time = datetime.datetime(2005, 6, 7, 19, 35, 2, 123000, tzinfo=UTC) + assert instance.snapshot_time == expected_time + class TestRow(unittest.TestCase): def test_row(self): @@ -1503,6 +1829,46 @@ def test_to_dataframe(self): self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(len(df), 0) # verify the number of rows + @mock.patch("google.cloud.bigquery.table.pandas", new=None) + def test_to_dataframe_iterable_error_if_pandas_is_none(self): + row_iterator = self._make_one() + with self.assertRaises(ValueError): + row_iterator.to_dataframe_iterable() + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_iterable(self): + row_iterator = self._make_one() + df_iter = row_iterator.to_dataframe_iterable() + + result = list(df_iter) + + self.assertEqual(len(result), 1) + df = result[0] + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 0) # Verify the number of rows. + self.assertEqual(len(df.columns), 0) + + @mock.patch("google.cloud.bigquery.table.geopandas", new=None) + def test_to_geodataframe_if_geopandas_is_none(self): + row_iterator = self._make_one() + with self.assertRaisesRegex( + ValueError, + re.escape( + "The geopandas library is not installed, please install " + "geopandas to use the to_geodataframe() function." 
+ ), + ): + row_iterator.to_geodataframe(create_bqstorage_client=False) + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe(self): + row_iterator = self._make_one() + df = row_iterator.to_geodataframe(create_bqstorage_client=False) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 0) # verify the number of rows + self.assertEqual(df.crs.srs, "EPSG:4326") + self.assertEqual(df.crs.name, "WGS 84") + class TestRowIterator(unittest.TestCase): def _class_under_test(self): @@ -1540,6 +1906,16 @@ def _make_one( client, api_request, path, schema, table=table, **kwargs ) + def _make_one_from_data(self, schema=(), rows=()): + from google.cloud.bigquery.schema import SchemaField + + schema = [SchemaField(*a) for a in schema] + rows = [{"f": [{"v": v} for v in row]} for row in rows] + + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + return self._make_one(_mock_client(), api_request, path, schema) + def test_constructor(self): from google.cloud.bigquery.table import _item_to_row from google.cloud.bigquery.table import _rows_page_start @@ -1568,10 +1944,7 @@ def test_constructor_with_table(self): from google.cloud.bigquery.table import Table table = Table("proj.dset.tbl") - table._properties["numRows"] = 100 - - iterator = self._make_one(table=table) - + iterator = self._make_one(table=table, total_rows=100) self.assertIs(iterator._table, table) self.assertEqual(iterator.total_rows, 100) @@ -1609,19 +1982,53 @@ def test_iterate(self): rows_iter = iter(row_iterator) - val1 = six.next(rows_iter) + val1 = next(rows_iter) self.assertEqual(val1.name, "Phred Phlyntstone") self.assertEqual(row_iterator.num_results, 1) - val2 = six.next(rows_iter) + val2 = next(rows_iter) self.assertEqual(val2.name, "Bharney Rhubble") self.assertEqual(row_iterator.num_results, 2) with self.assertRaises(StopIteration): - six.next(rows_iter) + next(rows_iter) api_request.assert_called_once_with(method="GET", 
path=path, query_params={}) + def test_iterate_with_cached_first_page(self): + from google.cloud.bigquery.schema import SchemaField + + first_page = { + "rows": [ + {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]}, + {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]}, + ], + "pageToken": "next-page", + } + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one( + _mock_client(), api_request, path, schema, first_page_response=first_page + ) + rows = list(row_iterator) + self.assertEqual(len(rows), 4) + self.assertEqual(rows[0].age, 27) + self.assertEqual(rows[1].age, 28) + self.assertEqual(rows[2].age, 32) + self.assertEqual(rows[3].age, 33) + + api_request.assert_called_once_with( + method="GET", path=path, query_params={"pageToken": "next-page"} + ) + def test_page_size(self): from google.cloud.bigquery.schema import SchemaField @@ -1647,6 +2054,80 @@ def test_page_size(self): query_params={"maxResults": row_iterator._page_size}, ) + def test__is_completely_cached_returns_false_without_first_page(self): + iterator = self._make_one(first_page_response=None) + self.assertFalse(iterator._is_completely_cached()) + + def test__is_completely_cached_returns_false_with_page_token(self): + first_page = {"pageToken": "next-page"} + iterator = self._make_one(first_page_response=first_page) + self.assertFalse(iterator._is_completely_cached()) + + def test__is_completely_cached_returns_true(self): + first_page = {"rows": []} + iterator = self._make_one(first_page_response=first_page) + self.assertTrue(iterator._is_completely_cached()) + + def test__validate_bqstorage_returns_false_when_completely_cached(self): + first_page = {"rows": []} + iterator = self._make_one(first_page_response=first_page) + 
self.assertFalse( + iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + ) + + def test__validate_bqstorage_returns_false_if_max_results_set(self): + iterator = self._make_one( + max_results=10, first_page_response=None # not cached + ) + result = iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + self.assertFalse(result) + + def test__validate_bqstorage_returns_false_if_missing_dependency(self): + iterator = self._make_one(first_page_response=None) # not cached + + def fail_bqstorage_import(name, globals, locals, fromlist, level): + # NOTE: *very* simplified, assuming a straightforward absolute import + return "bigquery_storage" in name or ( + fromlist is not None and "bigquery_storage" in fromlist + ) + + no_bqstorage = maybe_fail_import(predicate=fail_bqstorage_import) + + with no_bqstorage: + result = iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + + self.assertFalse(result) + + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test__validate_bqstorage_returns_false_w_warning_if_obsolete_version(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + iterator = self._make_one(first_page_response=None) # not cached + + patcher = mock.patch( + "google.cloud.bigquery.table._helpers.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + ) + with patcher, warnings.catch_warnings(record=True) as warned: + result = iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + + self.assertFalse(result) + + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." 
+ @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow(self): from google.cloud.bigquery.schema import SchemaField @@ -1846,9 +2327,9 @@ def test_to_arrow_w_empty_table(self): @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) - def test_to_arrow_max_results_w_create_bqstorage_warning(self): + def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): from google.cloud.bigquery.schema import SchemaField schema = [ @@ -1862,6 +2343,7 @@ def test_to_arrow_max_results_w_create_bqstorage_warning(self): path = "/foo" api_request = mock.Mock(return_value={"rows": rows}) mock_client = _mock_client() + mock_bqstorage_client = mock.sentinel.bq_storage_client row_iterator = self._make_one( client=mock_client, @@ -1872,48 +2354,91 @@ def test_to_arrow_max_results_w_create_bqstorage_warning(self): ) with warnings.catch_warnings(record=True) as warned: - row_iterator.to_arrow(create_bqstorage_client=True) + row_iterator.to_arrow(bqstorage_client=mock_bqstorage_client) matches = [ warning for warning in warned if warning.category is UserWarning and "cannot use bqstorage_client" in str(warning).lower() - and "tabledata.list" in str(warning) + and "REST" in str(warning) ] self.assertEqual(len(matches), 1, msg="User warning was not emitted.") - mock_client._create_bqstorage_client.assert_not_called() + self.assertIn( + __file__, str(matches[0]), msg="Warning emitted with incorrect stacklevel" + ) + mock_client._ensure_bqstorage_client.assert_not_called() @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) - def test_to_arrow_w_bqstorage(self): - from google.cloud.bigquery import schema - from google.cloud.bigquery 
import table as mut - from google.cloud.bigquery_storage_v1 import reader + def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): + from google.cloud.bigquery.schema import SchemaField - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - bqstorage_client.transport = mock.create_autospec( - big_query_read_grpc_transport.BigQueryReadGrpcTransport - ) - streams = [ - # Use two streams we want to check frames are read from each stream. - {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, - {"name": "/projects/proj/dataset/dset/tables/tbl/streams/5678"}, + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), ] - session = bigquery_storage_v1.types.ReadSession(streams=streams) - arrow_schema = pyarrow.schema( - [ - pyarrow.field("colA", pyarrow.int64()), - # Not alphabetical to test column order. - pyarrow.field("colC", pyarrow.float64()), - pyarrow.field("colB", pyarrow.string()), - ] - ) - session.arrow_schema.serialized_schema = arrow_schema.serialize().to_pybytes() - bqstorage_client.create_read_session.return_value = session - + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + mock_client = _mock_client() + + row_iterator = self._make_one( + client=mock_client, + api_request=api_request, + path=path, + schema=schema, + max_results=42, + ) + + with warnings.catch_warnings(record=True) as warned: + row_iterator.to_arrow(create_bqstorage_client=True) + + matches = [ + warning + for warning in warned + if warning.category is UserWarning + and "cannot use bqstorage_client" in str(warning).lower() + and "REST" in str(warning) + ] + self.assertFalse(matches) + mock_client._ensure_bqstorage_client.assert_not_called() + + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is 
None, "Requires `google-cloud-bigquery-storage`" + ) + def test_to_arrow_w_bqstorage(self): + from google.cloud.bigquery import schema + from google.cloud.bigquery import table as mut + from google.cloud.bigquery_storage_v1 import reader + + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client._transport = mock.create_autospec( + big_query_read_grpc_transport.BigQueryReadGrpcTransport + ) + streams = [ + # Use two streams we want to check frames are read from each stream. + {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, + {"name": "/projects/proj/dataset/dset/tables/tbl/streams/5678"}, + ] + session = bigquery_storage.types.ReadSession(streams=streams) + arrow_schema = pyarrow.schema( + [ + pyarrow.field("colA", pyarrow.int64()), + # Not alphabetical to test column order. + pyarrow.field("colC", pyarrow.float64()), + pyarrow.field("colB", pyarrow.string()), + ] + ) + session.arrow_schema.serialized_schema = arrow_schema.serialize().to_pybytes() + bqstorage_client.create_read_session.return_value = session + mock_rowstream = mock.create_autospec(reader.ReadRowsStream) bqstorage_client.read_rows.return_value = mock_rowstream @@ -1963,23 +2488,23 @@ def test_to_arrow_w_bqstorage(self): self.assertEqual(actual_tbl.num_rows, total_rows) # Don't close the client if it was passed in. 
- bqstorage_client.transport.channel.close.assert_not_called() + bqstorage_client._transport.grpc_channel.close.assert_not_called() @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_arrow_w_bqstorage_creates_client(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut mock_client = _mock_client() - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - bqstorage_client.transport = mock.create_autospec( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client._transport = mock.create_autospec( big_query_read_grpc_transport.BigQueryReadGrpcTransport ) - mock_client._create_bqstorage_client.return_value = bqstorage_client - session = bigquery_storage_v1.types.ReadSession() + mock_client._ensure_bqstorage_client.return_value = bqstorage_client + session = bigquery_storage.types.ReadSession() bqstorage_client.create_read_session.return_value = session row_iterator = mut.RowIterator( mock_client, @@ -1993,11 +2518,11 @@ def test_to_arrow_w_bqstorage_creates_client(self): table=mut.TableReference.from_string("proj.dset.tbl"), ) row_iterator.to_arrow(create_bqstorage_client=True) - mock_client._create_bqstorage_client.assert_called_once() - bqstorage_client.transport.channel.close.assert_called_once() + mock_client._ensure_bqstorage_client.assert_called_once() + bqstorage_client._transport.grpc_channel.close.assert_called_once() @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_to_arrow_create_bqstorage_client_wo_bqstorage(self): + def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): from google.cloud.bigquery.schema import SchemaField schema = [ @@ -2012,27 +2537,27 @@ def test_to_arrow_create_bqstorage_client_wo_bqstorage(self): api_request = 
mock.Mock(return_value={"rows": rows}) mock_client = _mock_client() - mock_client._create_bqstorage_client.return_value = None + mock_client._ensure_bqstorage_client.return_value = None row_iterator = self._make_one(mock_client, api_request, path, schema) tbl = row_iterator.to_arrow(create_bqstorage_client=True) # The client attempted to create a BQ Storage client, and even though # that was not possible, results were still returned without errors. - mock_client._create_bqstorage_client.assert_called_once() + mock_client._ensure_bqstorage_client.assert_called_once() self.assertIsInstance(tbl, pyarrow.Table) self.assertEqual(tbl.num_rows, 2) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_arrow_w_bqstorage_no_streams(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession() + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() arrow_schema = pyarrow.schema( [ pyarrow.field("colA", pyarrow.string()), @@ -2115,7 +2640,6 @@ def test_to_arrow_w_pyarrow_none(self): @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_iterable(self): from google.cloud.bigquery.schema import SchemaField - import types schema = [ SchemaField("name", "STRING", mode="REQUIRED"), @@ -2155,9 +2679,51 @@ def test_to_dataframe_iterable(self): self.assertEqual(df_2["name"][0], "Sven") self.assertEqual(df_2["age"][0], 33) + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_iterable_with_dtypes(self): + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + 
SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + + path = "/foo" + api_request = mock.Mock( + side_effect=[ + { + "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}], + "pageToken": "NEXTPAGE", + }, + {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]}, + ] + ) + + row_iterator = self._make_one( + _mock_client(), api_request, path, schema, page_size=1, max_results=5 + ) + dfs = row_iterator.to_dataframe_iterable(dtypes={"age": "int32"}) + + self.assertIsInstance(dfs, types.GeneratorType) + + df_1 = next(dfs) + self.assertIsInstance(df_1, pandas.DataFrame) + self.assertEqual(df_1.name.dtype.name, "object") + self.assertEqual(df_1.age.dtype.name, "int32") + self.assertEqual(len(df_1), 1) # verify the number of rows + self.assertEqual( + df_1["name"][0], "Bengt" + ) # verify the first value of 'name' column + self.assertEqual(df_1["age"][0], 32) # verify the first value of 'age' column + + df_2 = next(dfs) + self.assertEqual(len(df_2), 1) # verify the number of rows + self.assertEqual(df_2["name"][0], "Sven") + self.assertEqual(df_2["age"][0], 33) + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_iterable_w_bqstorage(self): @@ -2173,8 +2739,8 @@ def test_to_dataframe_iterable_w_bqstorage(self): ] arrow_schema = pyarrow.schema(arrow_fields) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - bqstorage_client.transport = mock.create_autospec( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client._transport = mock.create_autospec( big_query_read_grpc_transport.BigQueryReadGrpcTransport ) streams = [ @@ -2182,7 +2748,7 @@ def test_to_dataframe_iterable_w_bqstorage(self): {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, {"name": 
"/projects/proj/dataset/dset/tables/tbl/streams/5678"}, ] - session = bigquery_storage_v1.types.ReadSession( + session = bigquery_storage.types.ReadSession( streams=streams, arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, ) @@ -2225,7 +2791,62 @@ def test_to_dataframe_iterable_w_bqstorage(self): self.assertEqual(len(got), total_pages) # Don't close the client if it was passed in. - bqstorage_client.transport.channel.close.assert_not_called() + bqstorage_client._transport.grpc_channel.close.assert_not_called() + + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_to_dataframe_iterable_w_bqstorage_max_results_warning(self): + from google.cloud.bigquery import schema + from google.cloud.bigquery import table as mut + + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + + iterator_schema = [ + schema.SchemaField("name", "STRING", mode="REQUIRED"), + schema.SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + path = "/foo" + api_request = mock.Mock( + side_effect=[ + { + "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}], + "pageToken": "NEXTPAGE", + }, + {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]}, + ] + ) + row_iterator = mut.RowIterator( + _mock_client(), + api_request, + path, + iterator_schema, + table=mut.TableReference.from_string("proj.dset.tbl"), + selected_fields=iterator_schema, + max_results=25, + ) + + with warnings.catch_warnings(record=True) as warned: + dfs = row_iterator.to_dataframe_iterable(bqstorage_client=bqstorage_client) + + # Was a warning emitted? + matches = [ + warning + for warning in warned + if warning.category is UserWarning + and "cannot use bqstorage_client" in str(warning).lower() + and "REST" in str(warning) + ] + assert len(matches) == 1, "User warning was not emitted." 
+ assert __file__ in str(matches[0]), "Warning emitted with incorrect stacklevel" + + # Basic check of what we got as a result. + dataframes = list(dfs) + assert len(dataframes) == 2 + assert isinstance(dataframes[0], pandas.DataFrame) + assert isinstance(dataframes[1], pandas.DataFrame) @mock.patch("google.cloud.bigquery.table.pandas", new=None) def test_to_dataframe_iterable_error_if_pandas_is_none(self): @@ -2272,13 +2893,6 @@ def test_to_dataframe(self): self.assertEqual(df.name.dtype.name, "object") self.assertEqual(df.age.dtype.name, "int64") - @pytest.mark.xfail( - six.PY2, - reason=( - "Requires pyarrow>-1.0 to work, but the latter is not compatible " - "with Python 2 anymore." - ), - ) @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): @@ -2286,8 +2900,8 @@ def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): schema = [SchemaField("some_timestamp", "TIMESTAMP")] rows = [ - {"f": [{"v": "81953424000.0"}]}, # 4567-01-01 00:00:00 UTC - {"f": [{"v": "253402214400.0"}]}, # 9999-12-31 00:00:00 UTC + {"f": [{"v": "81953424000000000"}]}, # 4567-01-01 00:00:00 UTC + {"f": [{"v": "253402214400000000"}]}, # 9999-12-31 00:00:00 UTC ] path = "/foo" api_request = mock.Mock(return_value={"rows": rows}) @@ -2295,21 +2909,18 @@ def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): df = row_iterator.to_dataframe(create_bqstorage_client=False) + tzinfo = datetime.timezone.utc self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(len(df), 2) # verify the number of rows self.assertEqual(list(df.columns), ["some_timestamp"]) self.assertEqual( list(df["some_timestamp"]), - [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)], + [ + datetime.datetime(4567, 1, 1, tzinfo=tzinfo), + datetime.datetime(9999, 12, 31, tzinfo=tzinfo), + ], ) - @pytest.mark.xfail( - six.PY2, - reason=( - "Requires pyarrow>-1.0 to work, but the latter is not 
compatible " - "with Python 2 anymore." - ), - ) @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_datetime_out_of_pyarrow_bounds(self): @@ -2331,39 +2942,7 @@ def test_to_dataframe_datetime_out_of_pyarrow_bounds(self): self.assertEqual(list(df.columns), ["some_datetime"]) self.assertEqual( list(df["some_datetime"]), - [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)], - ) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_warning_wo_pyarrow(self): - from google.cloud.bigquery.client import PyarrowMissingWarning - from google.cloud.bigquery.schema import SchemaField - - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - rows = [ - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - ] - path = "/foo" - api_request = mock.Mock(return_value={"rows": rows}) - row_iterator = self._make_one(_mock_client(), api_request, path, schema) - - no_pyarrow_patch = mock.patch("google.cloud.bigquery.table.pyarrow", new=None) - catch_warnings = warnings.catch_warnings(record=True) - - with no_pyarrow_patch, catch_warnings as warned: - df = row_iterator.to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 2) - matches = [ - warning for warning in warned if warning.category is PyarrowMissingWarning - ] - self.assertTrue( - matches, msg="A missing pyarrow deprecation warning was not raised." 
+ [datetime.datetime(4567, 1, 1), datetime.datetime(9999, 12, 31)], ) @unittest.skipIf(pandas is None, "Requires `pandas`") @@ -2407,51 +2986,7 @@ def test_to_dataframe_progress_bar( self.assertEqual(len(df), 4) @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(tqdm is None, "Requires `tqdm`") - @mock.patch("tqdm.tqdm_gui") - @mock.patch("tqdm.tqdm_notebook") - @mock.patch("tqdm.tqdm") - def test_to_dataframe_progress_bar_wo_pyarrow( - self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock - ): - from google.cloud.bigquery.schema import SchemaField - - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - rows = [ - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, - {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}, - ] - path = "/foo" - api_request = mock.Mock(return_value={"rows": rows}) - - progress_bars = ( - ("tqdm", tqdm_mock), - ("tqdm_notebook", tqdm_notebook_mock), - ("tqdm_gui", tqdm_gui_mock), - ) - - for progress_bar_type, progress_bar_mock in progress_bars: - row_iterator = self._make_one(_mock_client(), api_request, path, schema) - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - with warnings.catch_warnings(record=True) as warned: - df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type) - - progress_bar_mock.assert_called() - progress_bar_mock().update.assert_called() - progress_bar_mock().close.assert_called_once() - self.assertEqual(len(df), 4) - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @mock.patch("google.cloud.bigquery.table.tqdm", new=None) + @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) def test_to_dataframe_no_tqdm_no_progress_bar(self): from google.cloud.bigquery.schema import SchemaField 
@@ -2479,7 +3014,7 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): self.assertEqual(len(df), 4) @unittest.skipIf(pandas is None, "Requires `pandas`") - @mock.patch("google.cloud.bigquery.table.tqdm", new=None) + @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) def test_to_dataframe_no_tqdm(self): from google.cloud.bigquery.schema import SchemaField @@ -2564,57 +3099,6 @@ def test_to_dataframe_w_empty_results(self): self.assertEqual(len(df), 0) # verify the number of rows self.assertEqual(list(df), ["name", "age"]) # verify the column names - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_w_empty_results_wo_pyarrow(self): - from google.cloud.bigquery.schema import SchemaField - - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - api_request = mock.Mock(return_value={"rows": []}) - row_iterator = self._make_one(_mock_client(), api_request, schema=schema) - - with warnings.catch_warnings(record=True) as warned: - df = row_iterator.to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 0) # verify the number of rows - self.assertEqual(list(df), ["name", "age"]) # verify the column names - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_w_no_results_wo_pyarrow(self): - from google.cloud.bigquery.schema import SchemaField - - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - api_request = mock.Mock(return_value={"rows": []}) - row_iterator = self._make_one(_mock_client(), api_request, schema=schema) - - def empty_iterable(dtypes=None): - return [] - - 
row_iterator.to_dataframe_iterable = empty_iterable - - with warnings.catch_warnings(record=True) as warned: - df = row_iterator.to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 0) # verify the number of rows - self.assertEqual(list(df), ["name", "age"]) # verify the column names - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_w_various_types_nullable(self): import datetime @@ -2630,9 +3114,9 @@ def test_to_dataframe_w_various_types_nullable(self): ] row_data = [ [None, None, None, None, None, None], - ["1.4338368E9", "420", "1.1", u"Cash", "true", "1999-12-01"], - ["1.3878117E9", "2580", "17.7", u"Cash", "false", "1953-06-14"], - ["1.3855653E9", "2280", "4.4", u"Credit", "true", "1981-11-04"], + ["1433836800000000", "420", "1.1", u"Cash", "true", "1999-12-01"], + ["1387811700000000", "2580", "17.7", u"Cash", "false", "1953-06-14"], + ["1385565300000000", "2280", "4.4", u"Credit", "true", "1981-11-04"], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] path = "/foo" @@ -2652,7 +3136,7 @@ def test_to_dataframe_w_various_types_nullable(self): else: self.assertIsInstance(row.start_timestamp, pandas.Timestamp) self.assertIsInstance(row.seconds, float) - self.assertIsInstance(row.payment_type, six.string_types) + self.assertIsInstance(row.payment_type, str) self.assertIsInstance(row.complete, bool) self.assertIsInstance(row.date, datetime.date) @@ -2670,9 +3154,17 @@ def test_to_dataframe_column_dtypes(self): SchemaField("date", "DATE"), ] row_data = [ - ["1.4338368E9", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"], - ["1.3878117E9", "2580", "17.7", "28.5", u"Cash", "false", "1953-06-14"], - ["1.3855653E9", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"], + ["1433836800000000", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"], + [ + "1387811700000000", 
+ "2580", + "17.7", + "28.5", + u"Cash", + "false", + "1953-06-14", + ], + ["1385565300000000", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] path = "/foo" @@ -2715,6 +3207,18 @@ def test_to_dataframe_error_if_pandas_is_none(self): with self.assertRaises(ValueError): row_iterator.to_dataframe() + @unittest.skipIf(pandas is None, "Requires `pandas`") + @mock.patch("google.cloud.bigquery.table.shapely", new=None) + def test_to_dataframe_error_if_shapely_is_none(self): + with self.assertRaisesRegex( + ValueError, + re.escape( + "The shapely library is not installed, please install " + "shapely to use the geography_as_object option." + ), + ): + self._make_one_from_data().to_dataframe(geography_as_object=True) + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_max_results_w_bqstorage_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -2747,12 +3251,12 @@ def test_to_dataframe_max_results_w_bqstorage_warning(self): for warning in warned if warning.category is UserWarning and "cannot use bqstorage_client" in str(warning).lower() - and "tabledata.list" in str(warning) + and "REST" in str(warning) ] self.assertEqual(len(matches), 1, msg="User warning was not emitted.") @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_max_results_w_create_bqstorage_warning(self): + def test_to_dataframe_max_results_w_explicit_bqstorage_client_warning(self): from google.cloud.bigquery.schema import SchemaField schema = [ @@ -2766,6 +3270,7 @@ def test_to_dataframe_max_results_w_create_bqstorage_warning(self): path = "/foo" api_request = mock.Mock(return_value={"rows": rows}) mock_client = _mock_client() + mock_bqstorage_client = mock.sentinel.bq_storage_client row_iterator = self._make_one( client=mock_client, @@ -2776,33 +3281,73 @@ def test_to_dataframe_max_results_w_create_bqstorage_warning(self): ) with 
warnings.catch_warnings(record=True) as warned: - row_iterator.to_dataframe(create_bqstorage_client=True) + row_iterator.to_dataframe(bqstorage_client=mock_bqstorage_client) matches = [ warning for warning in warned if warning.category is UserWarning and "cannot use bqstorage_client" in str(warning).lower() - and "tabledata.list" in str(warning) + and "REST" in str(warning) ] self.assertEqual(len(matches), 1, msg="User warning was not emitted.") - mock_client._create_bqstorage_client.assert_not_called() + self.assertIn( + __file__, str(matches[0]), msg="Warning emitted with incorrect stacklevel" + ) + mock_client._ensure_bqstorage_client.assert_not_called() + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_max_results_w_create_bqstorage_client_no_warning(self): + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + mock_client = _mock_client() + + row_iterator = self._make_one( + client=mock_client, + api_request=api_request, + path=path, + schema=schema, + max_results=42, + ) + + with warnings.catch_warnings(record=True) as warned: + row_iterator.to_dataframe(create_bqstorage_client=True) + + matches = [ + warning + for warning in warned + if warning.category is UserWarning + and "cannot use bqstorage_client" in str(warning).lower() + and "REST" in str(warning) + ] + self.assertFalse(matches) + mock_client._ensure_bqstorage_client.assert_not_called() @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_dataframe_w_bqstorage_creates_client(self): from 
google.cloud.bigquery import schema from google.cloud.bigquery import table as mut mock_client = _mock_client() - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - bqstorage_client.transport = mock.create_autospec( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client._transport = mock.create_autospec( big_query_read_grpc_transport.BigQueryReadGrpcTransport ) - mock_client._create_bqstorage_client.return_value = bqstorage_client - session = bigquery_storage_v1.types.ReadSession() + mock_client._ensure_bqstorage_client.return_value = bqstorage_client + session = bigquery_storage.types.ReadSession() bqstorage_client.create_read_session.return_value = session row_iterator = mut.RowIterator( mock_client, @@ -2816,19 +3361,19 @@ def test_to_dataframe_w_bqstorage_creates_client(self): table=mut.TableReference.from_string("proj.dset.tbl"), ) row_iterator.to_dataframe(create_bqstorage_client=True) - mock_client._create_bqstorage_client.assert_called_once() - bqstorage_client.transport.channel.close.assert_called_once() + mock_client._ensure_bqstorage_client.assert_called_once() + bqstorage_client._transport.grpc_channel.close.assert_called_once() @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_dataframe_w_bqstorage_no_streams(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession() + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() bqstorage_client.create_read_session.return_value = session row_iterator = mut.RowIterator( @@ -2848,55 +3393,16 @@ def 
test_to_dataframe_w_bqstorage_no_streams(self): self.assertEqual(list(got), column_names) self.assertTrue(got.empty) - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf( - bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`" - ) - def test_to_dataframe_w_bqstorage_v1beta1_no_streams(self): - from google.cloud.bigquery import schema - from google.cloud.bigquery import table as mut - - bqstorage_client = mock.create_autospec( - bigquery_storage_v1beta1.BigQueryStorageClient - ) - session = bigquery_storage_v1beta1.types.ReadSession() - bqstorage_client.create_read_session.return_value = session - - row_iterator = mut.RowIterator( - _mock_client(), - api_request=None, - path=None, - schema=[ - schema.SchemaField("colA", "INTEGER"), - schema.SchemaField("colC", "FLOAT"), - schema.SchemaField("colB", "STRING"), - ], - table=mut.TableReference.from_string("proj.dset.tbl"), - ) - - with warnings.catch_warnings(record=True) as warned: - got = row_iterator.to_dataframe(bqstorage_client) - - column_names = ["colA", "colC", "colB"] - self.assertEqual(list(got), column_names) - self.assertTrue(got.empty) - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue( - "Support for BigQuery Storage v1beta1 clients is deprecated" in str(warning) - ) - @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_logs_session(self): from google.cloud.bigquery.table import Table - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession() + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession() session.name = 
"projects/test-proj/locations/us/sessions/SOMESESSION" bqstorage_client.create_read_session.return_value = session mock_logger = mock.create_autospec(logging.Logger) @@ -2914,7 +3420,7 @@ def test_to_dataframe_w_bqstorage_logs_session(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_empty_streams(self): @@ -2930,8 +3436,8 @@ def test_to_dataframe_w_bqstorage_empty_streams(self): ] arrow_schema = pyarrow.schema(arrow_fields) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession( streams=[{"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}], arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, ) @@ -2969,7 +3475,7 @@ def test_to_dataframe_w_bqstorage_empty_streams(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_nonempty(self): @@ -2985,8 +3491,8 @@ def test_to_dataframe_w_bqstorage_nonempty(self): ] arrow_schema = pyarrow.schema(arrow_fields) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - bqstorage_client.transport = mock.create_autospec( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client._transport = mock.create_autospec( big_query_read_grpc_transport.BigQueryReadGrpcTransport ) streams = [ @@ -2994,7 +3500,7 @@ def 
test_to_dataframe_w_bqstorage_nonempty(self): {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, {"name": "/projects/proj/dataset/dset/tables/tbl/streams/5678"}, ] - session = bigquery_storage_v1.types.ReadSession( + session = bigquery_storage.types.ReadSession( streams=streams, arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, ) @@ -3045,103 +3551,11 @@ def test_to_dataframe_w_bqstorage_nonempty(self): self.assertEqual(len(got.index), total_rows) # Don't close the client if it was passed in. - bqstorage_client.transport.channel.close.assert_not_called() - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf( - bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`" - ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_to_dataframe_w_bqstorage_v1beta1_nonempty(self): - from google.cloud.bigquery import schema - from google.cloud.bigquery import table as mut - from google.cloud.bigquery_storage_v1beta1 import reader - - arrow_fields = [ - pyarrow.field("colA", pyarrow.int64()), - # Not alphabetical to test column order. - pyarrow.field("colC", pyarrow.float64()), - pyarrow.field("colB", pyarrow.utf8()), - ] - arrow_schema = pyarrow.schema(arrow_fields) - - bqstorage_client = mock.create_autospec( - bigquery_storage_v1beta1.BigQueryStorageClient - ) - bqstorage_client.transport = mock.create_autospec( - big_query_storage_grpc_transport_v1beta1.BigQueryStorageGrpcTransport - ) - streams = [ - # Use two streams we want to check frames are read from each stream. 
- {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, - {"name": "/projects/proj/dataset/dset/tables/tbl/streams/5678"}, - ] - session = bigquery_storage_v1beta1.types.ReadSession( - streams=streams, - arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, - ) - bqstorage_client.create_read_session.return_value = session - - mock_rowstream = mock.create_autospec(reader.ReadRowsStream) - bqstorage_client.read_rows.return_value = mock_rowstream - - mock_rows = mock.create_autospec(reader.ReadRowsIterable) - mock_rowstream.rows.return_value = mock_rows - page_items = [ - pyarrow.array([1, -1]), - pyarrow.array([2.0, 4.0]), - pyarrow.array(["abc", "def"]), - ] - page_record_batch = pyarrow.RecordBatch.from_arrays( - page_items, schema=arrow_schema - ) - mock_page = mock.create_autospec(reader.ReadRowsPage) - mock_page.to_arrow.return_value = page_record_batch - mock_pages = (mock_page, mock_page, mock_page) - type(mock_rows).pages = mock.PropertyMock(return_value=mock_pages) - - schema = [ - schema.SchemaField("colA", "IGNORED"), - schema.SchemaField("colC", "IGNORED"), - schema.SchemaField("colB", "IGNORED"), - ] - - row_iterator = mut.RowIterator( - _mock_client(), - None, # api_request: ignored - None, # path: ignored - schema, - table=mut.TableReference.from_string("proj.dset.tbl"), - selected_fields=schema, - ) - - with warnings.catch_warnings(record=True) as warned: - got = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) - - # Was a deprecation warning emitted? - expected_warnings = [ - warning - for warning in warned - if issubclass(warning.category, DeprecationWarning) - and "v1beta1" in str(warning) - ] - self.assertEqual(len(expected_warnings), 1, "Deprecation warning not raised.") - - # Are the columns in the expected order? - column_names = ["colA", "colC", "colB"] - self.assertEqual(list(got), column_names) - - # Have expected number of rows? 
- total_pages = len(streams) * len(mock_pages) - total_rows = len(page_items[0]) * total_pages - self.assertEqual(len(got.index), total_rows) - - # Don't close the client if it was passed in. - bqstorage_client.transport.channel.close.assert_not_called() + bqstorage_client._transport.grpc_channel.close.assert_not_called() @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): @@ -3156,12 +3570,12 @@ def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, {"name": "/projects/proj/dataset/dset/tables/tbl/streams/5678"}, ] - session = bigquery_storage_v1.types.ReadSession( + session = bigquery_storage.types.ReadSession( streams=streams, arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, ) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) bqstorage_client.create_read_session.return_value = session mock_rowstream = mock.create_autospec(reader.ReadRowsStream) @@ -3195,7 +3609,7 @@ def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(tqdm is None, "Requires `tqdm`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @@ -3211,14 +3625,14 @@ def test_to_dataframe_w_bqstorage_updates_progress_bar(self, tqdm_mock): arrow_fields = [pyarrow.field("testcol", pyarrow.int64())] arrow_schema = 
pyarrow.schema(arrow_fields) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) streams = [ # Use two streams we want to check that progress bar updates are # sent from each stream. {"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}, {"name": "/projects/proj/dataset/dset/tables/tbl/streams/5678"}, ] - session = bigquery_storage_v1.types.ReadSession( + session = bigquery_storage.types.ReadSession( streams=streams, arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, ) @@ -3274,7 +3688,7 @@ def blocking_to_arrow(*args, **kwargs): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_exits_on_keyboardinterrupt(self): @@ -3293,8 +3707,8 @@ def test_to_dataframe_w_bqstorage_exits_on_keyboardinterrupt(self): ] arrow_schema = pyarrow.schema(arrow_fields) - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - session = bigquery_storage_v1.types.ReadSession( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + session = bigquery_storage.types.ReadSession( streams=[ # Use multiple streams because one will fail with a # KeyboardInterrupt, and we want to check that the other streams @@ -3393,12 +3807,12 @@ def test_to_dataframe_tabledata_list_w_multiple_pages_return_unique_index(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_dataframe_w_bqstorage_raises_auth_error(self): from google.cloud.bigquery import table as mut - bqstorage_client = 
mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) bqstorage_client.create_read_session.side_effect = google.api_core.exceptions.Forbidden( "TEST BigQuery Storage API not enabled. TEST" ) @@ -3412,13 +3826,13 @@ def test_to_dataframe_w_bqstorage_raises_auth_error(self): row_iterator.to_dataframe(bqstorage_client=bqstorage_client) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_dataframe_w_bqstorage_partition(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) row_iterator = mut.RowIterator( _mock_client(), @@ -3432,13 +3846,13 @@ def test_to_dataframe_w_bqstorage_partition(self): row_iterator.to_dataframe(bqstorage_client) @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_to_dataframe_w_bqstorage_snapshot(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) row_iterator = mut.RowIterator( _mock_client(), @@ -3453,7 +3867,7 @@ def test_to_dataframe_w_bqstorage_snapshot(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( - bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): @@ -3472,11 +3886,11 @@ 
def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): arrow_schema = pyarrow.schema(arrow_fields) # create a mock BQ storage client - bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient) - bqstorage_client.transport = mock.create_autospec( + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + bqstorage_client._transport = mock.create_autospec( big_query_read_grpc_transport.BigQueryReadGrpcTransport ) - session = bigquery_storage_v1.types.ReadSession( + session = bigquery_storage.types.ReadSession( streams=[{"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}], arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}, ) @@ -3560,69 +3974,200 @@ def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): ) # Don't close the client if it was passed in. - bqstorage_client.transport.channel.close.assert_not_called() + bqstorage_client._transport.grpc_channel.close.assert_not_called() + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_dataframe_geography_as_object(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY")), + ( + ("foo", "Point(0 0)"), + ("bar", None), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), + ), + ) + df = row_iterator.to_dataframe( + create_bqstorage_client=False, geography_as_object=True, + ) + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 3) # verify the number of rows + self.assertEqual(list(df), ["name", "geog"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.geog.dtype.name, "object") + self.assertIsInstance(df.geog, pandas.Series) + self.assertEqual( + [v.__class__.__name__ for v in df.geog], ["Point", "float", "Polygon"] + ) - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_concat_categorical_dtype_wo_pyarrow(self): - from google.cloud.bigquery.schema import SchemaField + 
@mock.patch("google.cloud.bigquery.table.geopandas", new=None) + def test_to_geodataframe_error_if_geopandas_is_none(self): + with self.assertRaisesRegex( + ValueError, + re.escape( + "The geopandas library is not installed, please install " + "geopandas to use the to_geodataframe() function." + ), + ): + self._make_one_from_data().to_geodataframe() + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY")), + ( + ("foo", "Point(0 0)"), + ("bar", None), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), + ), + ) + df = row_iterator.to_geodataframe(create_bqstorage_client=False) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 3) # verify the number of rows + self.assertEqual(list(df), ["name", "geog"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.geog.dtype.name, "geometry") + self.assertIsInstance(df.geog, geopandas.GeoSeries) + self.assertEqual(list(map(str, df.area)), ["0.0", "nan", "0.5"]) + self.assertEqual(list(map(str, df.geog.area)), ["0.0", "nan", "0.5"]) + self.assertEqual(df.crs.srs, "EPSG:4326") + self.assertEqual(df.crs.name, "WGS 84") + self.assertEqual(df.geog.crs.srs, "EPSG:4326") + self.assertEqual(df.geog.crs.name, "WGS 84") + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_ambiguous_geog(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), () + ) + with self.assertRaisesRegex( + ValueError, + re.escape( + "There is more than one GEOGRAPHY column in the result. 
" + "The geography_column argument must be used to specify which " + "one to use to create a GeoDataFrame" + ), + ): + row_iterator.to_geodataframe(create_bqstorage_client=False) - schema = [ - SchemaField("col_str", "STRING"), - SchemaField("col_category", "STRING"), - ] - row_data = [ - [u"foo", u"low"], - [u"bar", u"medium"], - [u"baz", u"low"], - [u"foo_page2", u"medium"], - [u"bar_page2", u"high"], - [u"baz_page2", u"low"], - ] - path = "/foo" + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_bad_geography_column(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), () + ) + with self.assertRaisesRegex( + ValueError, + re.escape( + "The given geography column, xxx, doesn't name" + " a GEOGRAPHY column in the result." + ), + ): + row_iterator.to_geodataframe( + create_bqstorage_client=False, geography_column="xxx" + ) - rows = [{"f": [{"v": field} for field in row]} for row in row_data[:3]] - rows_page2 = [{"f": [{"v": field} for field in row]} for row in row_data[3:]] - api_request = mock.Mock( - side_effect=[{"rows": rows, "pageToken": "NEXTPAGE"}, {"rows": rows_page2}] + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_no_geog(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "STRING")), () + ) + with self.assertRaisesRegex( + TypeError, + re.escape( + "There must be at least one GEOGRAPHY column" + " to create a GeoDataFrame" + ), + ): + row_iterator.to_geodataframe(create_bqstorage_client=False) + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_w_geography_column(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), + ( + ("foo", "Point(0 0)", "Point(1 1)"), + ("bar", None, "Point(2 2)"), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))", "Point(3 3)"), + ), + ) + df = 
row_iterator.to_geodataframe( + create_bqstorage_client=False, geography_column="geog" + ) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 3) # verify the number of rows + self.assertEqual(list(df), ["name", "geog", "geog2"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.geog.dtype.name, "geometry") + self.assertEqual(df.geog2.dtype.name, "object") + self.assertIsInstance(df.geog, geopandas.GeoSeries) + self.assertEqual(list(map(str, df.area)), ["0.0", "nan", "0.5"]) + self.assertEqual(list(map(str, df.geog.area)), ["0.0", "nan", "0.5"]) + self.assertEqual( + [v.__class__.__name__ for v in df.geog], ["Point", "NoneType", "Polygon"] ) - row_iterator = self._make_one(_mock_client(), api_request, path, schema) + # Geog2 isn't a GeoSeries, but it contains geomentries: + self.assertIsInstance(df.geog2, pandas.Series) + self.assertEqual( + [v.__class__.__name__ for v in df.geog2], ["Point", "Point", "Point"] + ) + # and can easily be converted to a GeoSeries + self.assertEqual( + list(map(str, geopandas.GeoSeries(df.geog2).area)), ["0.0", "0.0", "0.0"] + ) - mock_pyarrow = mock.patch("google.cloud.bigquery.table.pyarrow", None) - catch_warnings = warnings.catch_warnings(record=True) + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + @mock.patch("google.cloud.bigquery.table.RowIterator.to_dataframe") + def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): + """ + RowIterator.to_geodataframe just delegates to RowIterator.to_dataframe. - with mock_pyarrow, catch_warnings as warned: - got = row_iterator.to_dataframe( - dtypes={ - "col_category": pandas.core.dtypes.dtypes.CategoricalDtype( - categories=["low", "medium", "high"], ordered=False, - ), - }, - ) + This test just demonstrates that. We don't need to test all the + variations, which are tested for to_dataframe. 
+ """ + import numpy + from shapely import wkt - self.assertIsInstance(got, pandas.DataFrame) - self.assertEqual(len(got), 6) # verify the number of rows - expected_columns = [field.name for field in schema] - self.assertEqual(list(got), expected_columns) # verify the column names + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("g", "GEOGRAPHY")) + ) + bqstorage_client = object() + dtypes = dict(xxx=numpy.dtype("int64")) + progress_bar_type = "normal" + create_bqstorage_client = False + date_as_object = False + geography_column = "g" + + to_dataframe.return_value = pandas.DataFrame( + dict(name=["foo"], g=[wkt.loads("point(0 0)")],) + ) - # Are column types correct? - expected_dtypes = [ - pandas.core.dtypes.dtypes.np.dtype("O"), # the default for string data - pandas.core.dtypes.dtypes.CategoricalDtype( - categories=["low", "medium", "high"], ordered=False, - ), - ] - self.assertEqual(list(got.dtypes), expected_dtypes) + df = row_iterator.to_geodataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_column=geography_column, + ) - # And the data in the categorical column? 
- self.assertEqual( - list(got["col_category"]), - ["low", "medium", "low", "medium", "high", "low"], + to_dataframe.assert_called_once_with( + bqstorage_client, + dtypes, + progress_bar_type, + create_bqstorage_client, + date_as_object, + geography_as_object=True, ) - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 1) # verify the number of rows + self.assertEqual(list(df), ["name", "g"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.g.dtype.name, "geometry") + self.assertIsInstance(df.g, geopandas.GeoSeries) + self.assertEqual(list(map(str, df.area)), ["0.0"]) + self.assertEqual(list(map(str, df.g.area)), ["0.0"]) + self.assertEqual([v.__class__.__name__ for v in df.g], ["Point"]) class TestPartitionRange(unittest.TestCase): @@ -3682,7 +4227,7 @@ def test__eq___type_mismatch(self): def test_unhashable_object(self): object_under_test1 = self._make_one(start=1, end=10, interval=2) - with six.assertRaisesRegex(self, TypeError, r".*unhashable type.*"): + with self.assertRaisesRegex(TypeError, r".*unhashable type.*"): hash(object_under_test1) def test_repr(self): @@ -3782,7 +4327,7 @@ def test_unhashable_object(self): object_under_test1 = self._make_one( range_=PartitionRange(start=1, end=10, interval=2), field="integer_col" ) - with six.assertRaisesRegex(self, TypeError, r".*unhashable type.*"): + with self.assertRaisesRegex(TypeError, r".*unhashable type.*"): hash(object_under_test1) def test_repr(self): @@ -3984,7 +4529,7 @@ def test___hash__not_equals(self): def test___repr___minimal(self): time_partitioning = self._make_one() - expected = "TimePartitioning(type=DAY)" + expected = "TimePartitioning(type_='DAY')" self.assertEqual(repr(time_partitioning), expected) def test___repr___explicit(self): @@ -3993,7 +4538,7 @@ def test___repr___explicit(self): time_partitioning = 
self._make_one( type_=TimePartitioningType.DAY, field="name", expiration_ms=10000 ) - expected = "TimePartitioning(" "expirationMs=10000," "field=name," "type=DAY)" + expected = "TimePartitioning(expiration_ms=10000,field='name',type_='DAY')" self.assertEqual(repr(time_partitioning), expected) def test_set_expiration_w_none(self): @@ -4003,7 +4548,7 @@ def test_set_expiration_w_none(self): @pytest.mark.skipif( - bigquery_storage_v1 is None, reason="Requires `google-cloud-bigquery-storage`" + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) @pytest.mark.parametrize( "table_path", @@ -4022,43 +4567,3 @@ def test_table_reference_to_bqstorage_v1_stable(table_path): for klass in (mut.TableReference, mut.Table, mut.TableListItem): got = klass.from_string(table_path).to_bqstorage() assert got == expected - - -@pytest.mark.skipif( - bigquery_storage_v1beta1 is None, reason="Requires `google-cloud-bigquery-storage`" -) -def test_table_reference_to_bqstorage_v1beta1(): - from google.cloud.bigquery import table as mut - - # Can't use parametrized pytest because bigquery_storage_v1beta1 may not be - # available. 
- expected = bigquery_storage_v1beta1.types.TableReference( - project_id="my-project", dataset_id="my_dataset", table_id="my_table" - ) - cases = ( - "my-project.my_dataset.my_table", - "my-project.my_dataset.my_table$20181225", - "my-project.my_dataset.my_table@1234567890", - "my-project.my_dataset.my_table$20181225@1234567890", - ) - - classes = (mut.TableReference, mut.Table, mut.TableListItem) - - for case, cls in itertools.product(cases, classes): - got = cls.from_string(case).to_bqstorage(v1beta1=True) - assert got == expected - - -@unittest.skipIf( - bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`" -) -def test_table_reference_to_bqstorage_v1beta1_raises_import_error(): - from google.cloud.bigquery import table as mut - - classes = (mut.TableReference, mut.Table, mut.TableListItem) - for cls in classes: - with mock.patch.object(mut, "bigquery_storage_v1beta1", None), pytest.raises( - ValueError - ) as exc_context: - cls.from_string("my-project.my_dataset.my_table").to_bqstorage(v1beta1=True) - assert mut._NO_BQSTORAGE_ERROR in str(exc_context.value)