Skip to content

Commit eb39c48

Browse files
committed
Implement select object
Selects and filters out data from stored object. Client for select feature
1 parent 7393d66 commit eb39c48

File tree

10 files changed

+1009
-138
lines changed

10 files changed

+1009
-138
lines changed

docs/API.md

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ s3Client = Minio('s3.amazonaws.com',
4040
| | [`fput_object`](#fput_object) | | |
4141
| | [`fget_object`](#fget_object) | | |
4242
| | [`get_partial_object`](#get_partial_object) | | |
43+
| | [`select_object_content`](#select_object_content) | | |
4344

4445
## 1. Constructor
4546

@@ -668,6 +669,79 @@ except ResponseError as err:
668669
print(err)
669670
```
670671

672+
<a name="select_object_content"></a>
673+
### select_object_content(self, bucket_name, object_name, options)
674+
Select object content filters the contents of object based on a simple structured query language (SQL).
675+
676+
__Parameters__
677+
678+
|Param |Type |Description |
679+
|:---|:---|:---|
680+
|``bucket_name`` |_string_ |Name of the bucket. |
681+
|``object_name`` |_string_ |Name of the object. |
682+
|``options`` | _Object_ | Query Options |
683+
684+
685+
__Return Value__
686+
687+
|Param |Type |Description |
688+
|:---|:---|:---|
689+
|``obj``|_Object_ |Select_object_reader object. |
690+
691+
692+
693+
__Example__
694+
695+
696+
```py
697+
client = Minio('s3.amazonaws.com',
698+
access_key='YOUR-ACCESSKEY',
699+
secret_key='YOUR-SECRETKEY')
700+
701+
options = SelectObjectOptions(
702+
expression=" select * from s3object",
703+
input_serialization=InputSerialization(
704+
compression_type="NONE",
705+
csv=CSVInput(FileHeaderInfo="USE",
706+
RecordDelimiter="\n",
707+
FieldDelimiter=",",
708+
QuoteCharacter='"',
709+
QuoteEscapeCharacter='"',
710+
Comments="#",
711+
AllowQuotedRecordDelimiter="FALSE",
712+
),
713+
),
714+
715+
output_serialization=OutputSerialization(
716+
csv=CSVOutput(QuoteFields="ASNEEDED",
717+
RecordDelimiter="\n",
718+
FieldDelimiter=",",
719+
QuoteCharacter='"',
720+
QuoteEscapeCharacter='"',)
721+
),
722+
request_progress=RequestProgress(
723+
enabled="FLASE"
724+
)
725+
)
726+
727+
try:
728+
data = client.select_object_content('my-bucket', 'my-object', options)
729+
730+
# Get the records
731+
with open('my-record-file', 'w') as record_data:
732+
for d in data.stream(10*1024):
733+
record_data.write(d)
734+
735+
# Get the stats
736+
print(data.stats)
737+
738+
except CRCValidationError as err:
739+
print(err)
740+
except ResponseError as err:
741+
print(err)
742+
743+
```
744+
671745
<a name="fget_object"></a>
672746
### fget_object(bucket_name, object_name, file_path, request_headers=None)
673747
Downloads and saves the object as a file in the local filesystem.

examples/select_object_content.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# -*- coding: utf-8 -*-
2+
# MinIO Python Library for Amazon S3 Compatible Cloud Storage, (C)
3+
# 2019 MinIO, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
from minio import Minio
19+
from minio.error import ResponseError
20+
from minio.select_object_reader import CRCValidationError
21+
from minio.select_object_options import (SelectObjectOptions, CSVInput,
22+
JSONInput, RequestProgress,
23+
ParquetInput, InputSerialization,
24+
OutputSerialization, CSVOutput,
25+
JsonOutput)
26+
27+
client = Minio('s3.amazonaws.com',
28+
access_key='YOUR-ACCESSKEY',
29+
secret_key='YOUR-SECRETKEY')
30+
31+
options = SelectObjectOptions(
32+
expression="select * from s3object",
33+
input_serialization=InputSerialization(
34+
compression_type="NONE",
35+
csv=CSVInput(FileHeaderInfo="USE",
36+
RecordDelimiter="\n",
37+
FieldDelimiter=",",
38+
QuoteCharacter='"',
39+
QuoteEscapeCharacter='"',
40+
Comments="#",
41+
AllowQuotedRecordDelimiter="FALSE",
42+
),
43+
# If input is JSON
44+
# json=JSONInput(Type="DOCUMENT",)
45+
),
46+
47+
output_serialization=OutputSerialization(
48+
csv=CSVOutput(QuoteFields="ASNEEDED",
49+
RecordDelimiter="\n",
50+
FieldDelimiter=",",
51+
QuoteCharacter='"',
52+
QuoteEscapeCharacter='"',)
53+
54+
# json = JsonOutput(
55+
# RecordDelimiter="\n",
56+
# )
57+
),
58+
request_progress=RequestProgress(
59+
enabled="False"
60+
)
61+
)
62+
63+
try:
64+
data = client.select_object_content('your-bucket', 'your-object', options)
65+
66+
# Get the records
67+
with open('my-record-file', 'w') as record_data:
68+
for d in data.stream(10*1024):
69+
record_data.write(d)
70+
71+
# Get the stats
72+
print(data.stats())
73+
74+
except CRCValidationError as err:
75+
print(err)
76+
except ResponseError as err:
77+
print(err)

minio/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,13 @@
3131
__author__ = 'MinIO, Inc.'
3232
__version__ = '4.0.19'
3333
__license__ = 'Apache 2.0'
34-
__copyright__ = 'Copyright 2015, 2016, 2017 MinIO, Inc.'
34+
__copyright__ = 'Copyright 2015, 2016, 2017, 2018, 2019 MinIO, Inc.'
3535

3636
from .api import Minio
3737
from .error import ResponseError
3838
from .post_policy import PostPolicy
3939
from .copy_conditions import CopyConditions
4040
from .definitions import Bucket, Object
41+
from .select_object_reader import SelectObjectReader
42+
43+

minio/api.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,11 @@
9090
from .xml_marshal import (xml_marshal_bucket_constraint,
9191
xml_marshal_complete_multipart_upload,
9292
xml_marshal_bucket_notifications,
93-
xml_marshal_delete_objects)
93+
xml_marshal_delete_objects,
94+
xml_marshal_select)
9495
from .fold_case_dict import FoldCaseDict
9596
from .thread_pool import ThreadPool
97+
from .select_object_reader import SelectObjectReader
9698

9799
# Comment format.
98100
_COMMENTS = '({0}; {1})'
@@ -235,6 +237,45 @@ def trace_off(self):
235237
"""
236238
self._trace_output_stream = None
237239

240+
# Select Object Content
241+
def select_object_content(self, bucket_name, object_name, opts):
242+
"""
243+
It filters the contents of an object based on a simple
244+
structured query language (SQL).
245+
246+
Examples:
247+
data = client.select_object_content('foo', 'test.csv', options)
248+
249+
:param bucket_name: Bucket to read object from
250+
:param object_name: Name of object to read
251+
:param options: Options for select object
252+
"""
253+
is_valid_bucket_name(bucket_name)
254+
is_non_empty_string(object_name)
255+
256+
content = xml_marshal_select(opts)
257+
url_values = dict()
258+
url_values["select"] = ""
259+
url_values["select-type"] = "2"
260+
261+
headers = {
262+
'Content-Length': str(len(content)),
263+
'Content-Md5': get_md5_base64digest(content)
264+
}
265+
content_sha256_hex = get_sha256_hexdigest(content)
266+
response = self._url_open(
267+
'POST',
268+
bucket_name=bucket_name,
269+
object_name=object_name,
270+
query=url_values,
271+
headers=headers,
272+
body=content,
273+
content_sha256=content_sha256_hex,
274+
preload_content=False)
275+
276+
return SelectObjectReader(response)
277+
278+
238279
# Bucket level
239280
def make_bucket(self, bucket_name, location='us-east-1'):
240281
"""

minio/helpers.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,23 @@
4848
InvalidArgumentError)
4949

5050
# Constants
51-
MAX_MULTIPART_COUNT = 10000 # 10000 parts
51+
MAX_MULTIPART_COUNT = 10000 # 10000 parts
5252
MAX_MULTIPART_OBJECT_SIZE = 5 * 1024 * 1024 * 1024 * 1024 # 5TiB
5353
MAX_PART_SIZE = 5 * 1024 * 1024 * 1024 # 5GiB
5454
MAX_POOL_SIZE = 10
5555
MIN_PART_SIZE = 5 * 1024 * 1024 # 5MiB
56-
DEFAULT_PART_SIZE = MIN_PART_SIZE # Currently its 5MiB
56+
DEFAULT_PART_SIZE = MIN_PART_SIZE # Currently its 5MiB
57+
58+
59+
# Select Object Content
60+
READ_SIZE_SELECT = 32 * 1024 # Buffer size
61+
SQL = 'SQL' # Value for ExpressionType
62+
EVENT_RECORDS = 'Records' # Event Type is Records
63+
EVENT_PROGRESS = 'Progress' # Event Type Progress
64+
EVENT_STATS = 'Stats' # Event Type Stats
65+
EVENT = 'event' # Message Type is event
66+
EVENT_END = 'End' # Event Type is End
67+
ERROR = 'error' # Message Type is error
5768

5869
_VALID_BUCKETNAME_REGEX = re.compile('^[a-z0-9][a-z0-9\\.\\-]+[a-z0-9]$')
5970
_ALLOWED_HOSTNAME_REGEX = re.compile(

minio/select_object_options.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# -*- coding: utf-8 -*-
2+
# MinIO Python Library for Amazon S3 Compatible Cloud Storage, (C)
3+
# 2019 MinIO, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""
18+
19+
This module creates the request for Select
20+
21+
:copyright: (c) 2019 by MinIO, Inc.
22+
:license: Apache 2.0, see LICENSE for more details.
23+
24+
"""
25+
from .helpers import (SQL)
26+
27+
28+
class CSVInput:
29+
"""
30+
CSVInput: Input Format as CSV.
31+
"""
32+
def __init__(self, FileHeaderInfo=None, RecordDelimiter="\n",
33+
FieldDelimiter=",", QuoteCharacter='"',
34+
QuoteEscapeCharacter='"', Comments="#",
35+
AllowQuotedRecordDelimiter=False):
36+
self.FileHeaderInfo = FileHeaderInfo
37+
self.RecordDelimiter = RecordDelimiter
38+
self.FieldDelimiter = FieldDelimiter
39+
self.QuoteCharacter = QuoteCharacter
40+
self.QuoteEscapeCharacter = QuoteEscapeCharacter
41+
self.Comments = Comments
42+
self.AllowQuotedRecordDelimiter = AllowQuotedRecordDelimiter
43+
44+
45+
class JSONInput:
46+
"""
47+
JSONInput: Input format as JSON.
48+
"""
49+
def __init__(self, Type=None):
50+
self.Type = Type
51+
52+
53+
class ParquetInput:
54+
"""
55+
ParquetInput: Input format as Parquet
56+
"""
57+
58+
59+
class InputSerialization:
60+
"""
61+
InputSerialization: nput Format.
62+
"""
63+
def __init__(self, compression_type="NONE", csv=None, json=None, par=None):
64+
self.compression_type = compression_type
65+
self.csv_input = csv
66+
self.json_input = json
67+
self.parquet_input = par
68+
69+
70+
class CSVOutput:
71+
"""
72+
CSVOutput: Output as CSV.
73+
74+
"""
75+
def __init__(self, QuoteFields="ASNEEDED", RecordDelimiter="\n",
76+
FieldDelimiter=",", QuoteCharacter='"',
77+
QuoteEscapeCharacter='"'):
78+
self.QuoteFields = QuoteFields
79+
self.RecordDelimiter = RecordDelimiter
80+
self.FieldDelimiter = FieldDelimiter
81+
self.QuoteCharacter = QuoteCharacter
82+
self.QuoteEscapeCharacter = QuoteEscapeCharacter
83+
84+
85+
class JsonOutput:
86+
"""
87+
JsonOutput- Output as JSON.
88+
"""
89+
def __init__(self, RecordDelimiter="\n"):
90+
self.RecordDelimiter = RecordDelimiter
91+
92+
93+
class OutputSerialization:
94+
"""
95+
OutputSerialization: Output Format.
96+
"""
97+
def __init__(self, csv=None, json=None):
98+
self.csv_output = csv
99+
self.json_output = json
100+
101+
102+
class RequestProgress:
103+
"""
104+
RequestProgress: Sends progress message.
105+
"""
106+
def __init__(self, enabled=False):
107+
self.enabled = enabled
108+
109+
110+
class SelectObjectOptions:
111+
"""
112+
SelectObjectOptions: Options for select object
113+
"""
114+
expression_type = SQL
115+
116+
def __init__(self, expression, input_serialization,
117+
output_serialization, request_progress):
118+
self.expression = expression
119+
self.in_ser = input_serialization
120+
self.out_ser = output_serialization
121+
self.req_progress = request_progress

0 commit comments

Comments
 (0)