Skip to content

Commit 74cde71

Browse files
committed
Implement select object
Selects and filters out data from stored object. Client for select feature
1 parent 7393d66 commit 74cde71

File tree

8 files changed

+644
-3
lines changed

8 files changed

+644
-3
lines changed

docs/API.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ s3Client = Minio('s3.amazonaws.com',
4040
| | [`fput_object`](#fput_object) | | |
4141
| | [`fget_object`](#fget_object) | | |
4242
| | [`get_partial_object`](#get_partial_object) | | |
43+
| | [`select_object_content`](#select_object_content) | | |
4344

4445
## 1. Constructor
4546

@@ -668,6 +669,70 @@ except ResponseError as err:
668669
print(err)
669670
```
670671

672+
<a name="select_object_content"></a>
673+
### select_object_content(self, bucket_name, object_name, obj)
674+
Select object content filters the contents of object based on a simple structured query language (SQL).
675+
676+
__Parameters__
677+
678+
|Param |Type |Description |
679+
|:---|:---|:---|
680+
|``bucket_name`` |_string_ |Name of the bucket. |
681+
|``object_name`` |_string_ |Name of the object. |
682+
|``obj`` | _Object_ | Query Options |
683+
684+
685+
__Return Value__
686+
687+
|Param |Type |Description |
688+
|:---|:---|:---|
689+
|``object`` | _urllib3.response.HTTPResponse_ |Represents http streaming reader. |
690+
691+
__Example__
692+
693+
694+
```py
695+
client = Minio('s3.amazonaws.com',
696+
access_key='YOUR-ACCESSKEY',
697+
secret_key='YOUR-SECRETKEY')
698+
699+
obj = SelectObjectOptions(
700+
expression=" select * from s3object",
701+
iser = InputSerialization(compression_type="NONE",
702+
csv=CSVInput(FileHeaderInfo="USE",
703+
RecordDelimiter="\n",
704+
FieldDelimiter=",",
705+
QuoteCharacter='"',
706+
QuoteEscapeCharacter='"',
707+
Comments="#",
708+
AllowQuotedRecordDelimiter="FALSE"
709+
),
710+
),
711+
oser=OutputSerialization(
712+
csv = CSVOutput(
713+
QuoteFields="ASNEEDED",
714+
RecordDelimiter="\n",
715+
FieldDelimiter=",",
716+
QuoteCharacter='"',
717+
QuoteEscapeCharacter='"',
718+
)
719+
),
720+
rp=RequestProgress(
721+
enabled="TRUE"
722+
)
723+
)
724+
725+
726+
try:
727+
records, stats = client.select_object_content('sinha', 'test.csv', obj)
728+
with open('my-record', 'wb') as record_data:
729+
record_data.write(records)
730+
with open('my-stat', 'wb') as stat_data:
731+
stat_data.write(stats)
732+
except ResponseError as err:
733+
print(err)
734+
```
735+
671736
<a name="fget_object"></a>
672737
### fget_object(bucket_name, object_name, file_path, request_headers=None)
673738
Downloads and saves the object as a file in the local filesystem.

examples/select_object_content.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# -*- coding: utf-8 -*-
2+
# MinIO Python Library for Amazon S3 Compatible Cloud Storage, (C) 2019 MinIO, Inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Note: YOUR-ACCESSKEYID, YOUR-SECRETACCESSKEY, my-bucketname, my-objectname and
17+
# my-testfile are dummy values, please replace them with original values.
18+
19+
from minio import Minio
20+
from minio.error import ResponseError
21+
from minio.select_object import CRCValidationError
22+
from minio.select_object_options import (SelectObjectOptions, CSVInput, JSONInput, RequestProgress, ParquetInput, InputSerialization, OutputSerialization, CSVOutput, JsonOutput)
23+
client = Minio('s3.amazonaws.com',
24+
access_key='YOUR-ACCESSKEY',
25+
secret_key='YOUR-SECRETKEY')
26+
27+
obj = SelectObjectOptions(
28+
expression=" select * from s3object",
29+
iser = InputSerialization(compression_type="NONE",
30+
csv=CSVInput(FileHeaderInfo="USE",
31+
RecordDelimiter="\n",
32+
FieldDelimiter=",",
33+
QuoteCharacter='"',
34+
QuoteEscapeCharacter='"',
35+
Comments="#",
36+
AllowQuotedRecordDelimiter="FALSE"
37+
),
38+
),
39+
# If input is JSON
40+
# json = JSONInput(Type="DOCUMENT"
41+
# )
42+
# ),
43+
44+
oser=OutputSerialization(
45+
csv = CSVOutput(
46+
QuoteFields="ASNEEDED",
47+
RecordDelimiter="\n",
48+
FieldDelimiter=",",
49+
QuoteCharacter='"',
50+
QuoteEscapeCharacter='"',
51+
)
52+
),
53+
54+
# If output is JSON
55+
# oser=OutputSerialization(
56+
# json = JsonOutput(
57+
# RecordDelimiter="\n"
58+
# )
59+
# ),
60+
rp=RequestProgress(
61+
enabled="TRUE"
62+
)
63+
)
64+
65+
try:
66+
records, stats = client.select_object_content('sinha', 'test.csv', obj)
67+
with open('my-record', 'wb') as record_data:
68+
record_data.write(records)
69+
with open('my-stat', 'wb') as stat_data:
70+
stat_data.write(stats)
71+
except CRCValidationError as err:
72+
print(err)
73+
except ResponseError as err:
74+
print(err)

minio/api.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
is_valid_source_sse_object,
7777
is_valid_bucket_notification_config, is_valid_policy_type,
7878
mkdir_p, dump_http, amzprefix_user_metadata,
79-
is_supported_header,is_amz_header)
79+
is_supported_header,is_amz_header,is_valid_input_type)
8080
from .helpers import (MAX_MULTIPART_OBJECT_SIZE,
8181
MAX_PART_SIZE,
8282
MAX_POOL_SIZE,
@@ -90,7 +90,8 @@
9090
from .xml_marshal import (xml_marshal_bucket_constraint,
9191
xml_marshal_complete_multipart_upload,
9292
xml_marshal_bucket_notifications,
93-
xml_marshal_delete_objects)
93+
xml_marshal_delete_objects,
94+
xml_marshal_select)
9495
from .fold_case_dict import FoldCaseDict
9596
from .thread_pool import ThreadPool
9697

@@ -235,6 +236,42 @@ def trace_off(self):
235236
"""
236237
self._trace_output_stream = None
237238

239+
# Select from object
240+
def select_object_content(self, bucket_name,
241+
object_name,
242+
obj):
243+
244+
'''
245+
Select from object
246+
'''
247+
is_valid_bucket_name(bucket_name)
248+
is_non_empty_string(object_name)
249+
is_valid_input_type(object_name)
250+
251+
content = xml_marshal_select(obj)
252+
253+
url_values = dict()
254+
url_values["select"] = ""
255+
url_values["select-type"] = "2"
256+
257+
headers = {
258+
'Content-Length': str(len(content)),
259+
'Content-Md5': get_md5_base64digest(content)
260+
}
261+
content_sha256_hex = get_sha256_hexdigest(content)
262+
response = self._url_open(
263+
'POST',
264+
bucket_name=bucket_name,
265+
object_name=object_name,
266+
query=url_values,
267+
headers=headers,
268+
body=content,
269+
content_sha256=content_sha256_hex,
270+
preload_content=False)
271+
272+
return extract_message(response)
273+
274+
238275
# Bucket level
239276
def make_bucket(self, bucket_name, location='us-east-1'):
240277
"""

minio/helpers.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@
5555
MIN_PART_SIZE = 5 * 1024 * 1024 # 5MiB
5656
DEFAULT_PART_SIZE = MIN_PART_SIZE # Currently its 5MiB
5757

58+
READ_BUFFER_SELECT = 32 * 1024 # Buffer size for select object content
59+
CONS_READ_SIZE = 4 # total byte length, header byte length , CRC are
60+
# 4 bytes in S3 select response
61+
5862
_VALID_BUCKETNAME_REGEX = re.compile('^[a-z0-9][a-z0-9\\.\\-]+[a-z0-9]$')
5963
_ALLOWED_HOSTNAME_REGEX = re.compile(
6064
'^((?!-)(?!_)[A-Z_\\d-]{1,63}(?<!-)(?<!_)\\.)*((?!_)(?!-)[A-Z_\\d-]{1,63}(?<!-)(?<!_))$',
@@ -696,3 +700,18 @@ def is_supported_header(key):
696700
# returns true if header is a storage class header
697701
def is_storageclass_header(key):
698702
return key.lower() == "x-amz-storage-class"
703+
704+
# return the input type if valid i.e. csv, json, parquet, par
705+
# or raise an InvalidArgumentError
706+
def is_valid_input_type(object_name):
707+
if object_name.__contains__('.csv') | object_name.__contains__('.CSV') :
708+
return 'csv'
709+
elif object_name.__contains__('.json') | object_name.__contains__('.JSON') :
710+
return 'json'
711+
elif object_name.__contains__('.parquet') | object_name.__contains__('.par') :
712+
return 'parquet'
713+
else:
714+
raise InvalidArgumentError('Input type not supported '
715+
' Only csv, json and parquet.')
716+
717+

0 commit comments

Comments
 (0)