/*
This is a PL/SQL implementation to tokenize a string, based on a delimiter
character, into a set of "tokens". Two tokenizers are provided: one for
VARCHAR2, one for CLOB. Both can be used procedurally in a PL/SQL loop
or in SQL as a table function.
Created by Carsten Czarski
*/
-- Create type to describe a table row
-- This type describes a row of the String Tokenizers' result table.
create type token_t as object(
token_text varchar2(4000),
start_pos number,
length number
)
/
-- Create a table type to describe the result set
create type token_list as table of token_t
/
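-- Quick sanity check of the row shape (a minimal sketch with hand-picked,
-- illustrative values): a TOKEN_LIST can be constructed directly and queried
-- like a table. Each token carries its text, its start position within the
-- source string, and its length.
select token_text, start_pos, length
from table(token_list(
  token_t('The', 1, 3),
  token_t('quick', 5, 5)
)) ;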
-- CLOB Tokenizer: Type definition
/*
The string tokenizer is implemented as two object types: one for tokenizing
CLOBs, the other for tokenizing VARCHAR2s. The functionality is contained in
their static and member functions. The types can be used procedurally or as a
SQL table function.
*/
create type clob_tokenizer as object(
value_string clob,
delimiter varchar2(10),
parser_current_pos number,
last_token varchar2(4000),
constructor function clob_tokenizer (p_string in clob, p_delim in varchar2)
return self as result,
member function has_more_tokens return number,
member function next_token(self in out nocopy clob_tokenizer) return varchar2,
static function all_tokens (p_string in clob, p_delim in varchar2)
return token_list pipelined parallel_enable,
static function all_tokens_cur (p_cursor in sys_refcursor, p_delim in varchar2)
return token_list pipelined parallel_enable (partition p_cursor by any)
);
/
-- CLOB tokenizer type implementation
-- The type body contains the actual implementations for the static and member procedures.
create or replace type body clob_tokenizer is
constructor function clob_tokenizer (p_string in clob, p_delim in varchar2) return self as result as
begin
self.value_string := p_string;
self.delimiter := p_delim;
self.parser_current_pos := 1;
self.last_token := null;
return ;
end;
member function has_more_tokens return number as
begin
if self.parser_current_pos <= dbms_lob.getlength(value_string) then
return 1;
else
return 0;
end if;
end;
member function next_token(self in out nocopy clob_tokenizer) return varchar2 is
l_next_delim_pos number;
l_token varchar2(4000);
begin
if self.has_more_tokens() = 1 then
l_next_delim_pos := dbms_lob.instr(self.value_string, self.delimiter, self.parser_current_pos);
if l_next_delim_pos = 0 then
l_token := dbms_lob.substr(
lob_loc => self.value_string,
amount => (dbms_lob.getlength(self.value_string) - self.parser_current_pos) + 1,
offset => self.parser_current_pos
);
self.parser_current_pos := dbms_lob.getlength(self.value_string) + 1;
else
l_token := dbms_lob.substr(
lob_loc => self.value_string,
amount => l_next_delim_pos - self.parser_current_pos,
offset => self.parser_current_pos
);
self.parser_current_pos := l_next_delim_pos + length(self.delimiter);
end if;
else
l_token := null;
end if;
self.last_token := l_token;
return l_token;
end;
static function all_tokens (p_string in clob, p_delim in varchar2)
return token_list pipelined parallel_enable is
l_st clob_tokenizer := clob_tokenizer(p_string, p_delim);
l_startpos number;
l_token varchar2(4000);
begin
while l_st.has_more_tokens = 1 loop
l_startpos := l_st.parser_current_pos;
l_token := l_st.next_token();
pipe row (token_t(l_token, l_startpos, nvl(length(l_token),0)));
end loop;
return;
end;
static function all_tokens_cur (p_cursor in sys_refcursor, p_delim in varchar2)
return token_list pipelined parallel_enable (partition p_cursor by any) is
l_st clob_tokenizer;
l_string clob;
l_startpos number;
l_token varchar2(4000);
begin
loop
fetch p_cursor into l_string;
exit when p_cursor%notfound;
l_st := clob_tokenizer(l_string, p_delim);
while l_st.has_more_tokens = 1 loop
l_startpos := l_st.parser_current_pos;
l_token := l_st.next_token();
pipe row (token_t(l_token, l_startpos, nvl(length(l_token),0)));
end loop;
end loop;
return;
end;
end;
/
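-- Using CLOB_TOKENIZER as table function.
-- A minimal example mirroring the STRING_TOKENIZER queries further down; the
-- literal is converted with TO_CLOB so the CLOB-based tokenizer is exercised.
select rownum, token_text, start_pos, length
from table(clob_tokenizer.all_tokens(to_clob('The quick brown fox jumps over the lazy dog.'), ' ')) ;
-- Procedural use of CLOB_TOKENIZER: the same HAS_MORE_TOKENS / NEXT_TOKEN loop
-- as shown for STRING_TOKENIZER at the end of this script, with an illustrative
-- comma-separated input.
declare
  v_ct clob_tokenizer := clob_tokenizer(to_clob('alpha,beta,gamma'), ',');
begin
  while v_ct.has_more_tokens() = 1 loop
    dbms_output.put_line(v_ct.next_token());
  end loop;
end;
/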
-- VARCHAR2 Tokenizer: Type definition
/*
This is the VARCHAR2 counterpart of CLOB_TOKENIZER above: it offers the same
member and static functions, but operates on VARCHAR2 values instead of CLOBs.
It can likewise be used procedurally or as a SQL table function.
*/
create type string_tokenizer as object(
value_string varchar2(4000),
delimiter varchar2(10),
parser_current_pos number,
last_token varchar2(4000),
constructor function string_tokenizer (p_string in varchar2, p_delim in varchar2)
return self as result,
member function has_more_tokens(self in out nocopy string_tokenizer) return number,
member function next_token(self in out nocopy string_tokenizer) return varchar2,
static function all_tokens (p_string in varchar2, p_delim in varchar2)
return token_list pipelined parallel_enable,
static function all_tokens_cur (p_cursor in sys_refcursor, p_delim in varchar2)
return token_list pipelined parallel_enable (partition p_cursor by any)
);
/
-- VARCHAR2 tokenizer type implementation
-- The type body contains the actual implementations for the static and member procedures.
create or replace type body string_tokenizer is
constructor function string_tokenizer (p_string in varchar2, p_delim in varchar2)
return self as result as
begin
self.value_string := p_string;
self.delimiter := p_delim;
self.parser_current_pos := 1;
self.last_token := null;
return ;
end;
member function has_more_tokens(self in out nocopy string_tokenizer) return number as
begin
if self.parser_current_pos <= length(value_string) then
return 1;
else
return 0;
end if;
end;
member function next_token(self in out nocopy string_tokenizer) return varchar2 as
l_next_delim_pos number;
l_next_enclose_pos number;
l_token varchar2(4000);
begin
if self.has_more_tokens() = 1 then
l_next_delim_pos := instr(self.value_string, self.delimiter, self.parser_current_pos);
if l_next_delim_pos = 0 then
l_token := substr(value_string, self.parser_current_pos);
self.parser_current_pos := length(self.value_string) + 1;
else
l_token := substr(self.value_string, self.parser_current_pos,
l_next_delim_pos - self.parser_current_pos);
self.parser_current_pos := l_next_delim_pos + length(self.delimiter);
end if;
else
l_token := null;
end if;
self.last_token := l_token;
return l_token;
end;
static function all_tokens (p_string in varchar2, p_delim in varchar2)
return token_list pipelined parallel_enable is
l_st string_tokenizer := string_tokenizer(p_string, p_delim);
l_startpos number;
l_token varchar2(4000);
begin
while l_st.has_more_tokens = 1 loop
l_startpos := l_st.parser_current_pos;
l_token := l_st.next_token();
pipe row (token_t(l_token, l_startpos, nvl(length(l_token),0)));
end loop;
return;
end;
static function all_tokens_cur (p_cursor in sys_refcursor, p_delim in varchar2)
return token_list pipelined parallel_enable (partition p_cursor by any) is
l_st string_tokenizer;
l_string varchar2(4000);
l_startpos number;
l_token varchar2(4000);
begin
loop
fetch p_cursor into l_string;
exit when p_cursor%notfound;
l_st := string_tokenizer(l_string, p_delim);
while l_st.has_more_tokens = 1 loop
l_startpos := l_st.parser_current_pos;
l_token := l_st.next_token();
pipe row (token_t(l_token, l_startpos, nvl(length(l_token),0)));
end loop;
end loop;
return;
end;
end;
/
-- Using STRING_TOKENIZER as table function.
-- This is the table function example for STRING_TOKENIZER. It tokenizes a string with " " (a space) as the delimiter.
select rownum, token_text, start_pos, length
from table(string_tokenizer.all_tokens('The quick brown fox jumps over the lazy dog.', ' ') ) ;
-- Using STRING_TOKENIZER as table function
-- This is the table function example for STRING_TOKENIZER with "#" as the delimiter.
-- Note that leading, trailing, and consecutive delimiters produce empty tokens.
select rownum, token_text, start_pos, length
from table(string_tokenizer.all_tokens('##a#b#c#d#e#f##', '#') ) ;
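-- Using STRING_TOKENIZER.ALL_TOKENS_CUR as a (parallel-enabled) table function.
-- A minimal sketch only: DOCUMENTS and its DOC_TEXT column are hypothetical
-- placeholders, so substitute any query returning a single VARCHAR2 column.
-- The CURSOR expression feeds each row's text to the tokenizer, which pipes
-- out one row per token.
select token_text, start_pos, length
from table(string_tokenizer.all_tokens_cur(
  cursor(select doc_text from documents), ' ')) ;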
-- Procedural use of STRING_TOKENIZER
-- This example uses STRING_TOKENIZER within a PL/SQL WHILE loop. The loop runs
-- until HAS_MORE_TOKENS() returns zero; in each iteration, NEXT_TOKEN() returns the next token in the string.
create or replace procedure st_tester (p_string in varchar2, p_delim in varchar2) is
v_st string_tokenizer := string_tokenizer(p_string, p_delim);
v_cnt pls_integer := 0;
begin
while v_st.has_more_tokens() = 1 loop
dbms_output.put_line(v_cnt||': '||v_st.next_token());
v_cnt := v_cnt + 1;
end loop;
end;
/
-- Procedural use of STRING_TOKENIZER
-- Run the ST_TESTER procedure above: tokenize a sentence with a space as the
-- delimiter and print each token via DBMS_OUTPUT.
begin
st_tester('The quick brown fox jumps over the lazy dog.', ' ');
end;
/