00001 // 00002 // Copyright 2017 The Abseil Authors. 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // 00008 // https://www.apache.org/licenses/LICENSE-2.0 00009 // 00010 // Unless required by applicable law or agreed to in writing, software 00011 // distributed under the License is distributed on an "AS IS" BASIS, 00012 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 // See the License for the specific language governing permissions and 00014 // limitations under the License. 00015 // 00016 // ----------------------------------------------------------------------------- 00017 // File: str_split.h 00018 // ----------------------------------------------------------------------------- 00019 // 00020 // This file contains functions for splitting strings. It defines the main 00021 // `StrSplit()` function, several delimiters for determining the boundaries on 00022 // which to split the string, and predicates for filtering delimited results. 00023 // `StrSplit()` adapts the returned collection to the type specified by the 00024 // caller. 00025 // 00026 // Example: 00027 // 00028 // // Splits the given string on commas. Returns the results in a 00029 // // vector of strings. 00030 // std::vector<std::string> v = absl::StrSplit("a,b,c", ','); 00031 // // Can also use "," 00032 // // v[0] == "a", v[1] == "b", v[2] == "c" 00033 // 00034 // See StrSplit() below for more information. 
00035 #ifndef ABSL_STRINGS_STR_SPLIT_H_ 00036 #define ABSL_STRINGS_STR_SPLIT_H_ 00037 00038 #include <algorithm> 00039 #include <cstddef> 00040 #include <map> 00041 #include <set> 00042 #include <string> 00043 #include <utility> 00044 #include <vector> 00045 00046 #include "absl/base/internal/raw_logging.h" 00047 #include "absl/strings/internal/str_split_internal.h" 00048 #include "absl/strings/string_view.h" 00049 #include "absl/strings/strip.h" 00050 00051 namespace absl { 00052 00053 //------------------------------------------------------------------------------ 00054 // Delimiters 00055 //------------------------------------------------------------------------------ 00056 // 00057 // `StrSplit()` uses delimiters to define the boundaries between elements in the 00058 // provided input. Several `Delimiter` types are defined below. If a string 00059 // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of 00060 // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it 00061 // were passed a `ByString` delimiter. 00062 // 00063 // A `Delimiter` is an object with a `Find()` function that knows how to find 00064 // the first occurrence of itself in a given `absl::string_view`. 00065 // 00066 // The following `Delimiter` types are available for use within `StrSplit()`: 00067 // 00068 // - `ByString` (default for string arguments) 00069 // - `ByChar` (default for a char argument) 00070 // - `ByAnyChar` 00071 // - `ByLength` 00072 // - `MaxSplits` 00073 // 00074 // A Delimiter's `Find()` member function will be passed an input `text` that is 00075 // to be split and a position (`pos`) to begin searching for the next delimiter 00076 // in `text`. The returned absl::string_view should refer to the next occurrence 00077 // (after `pos`) of the represented delimiter; this returned absl::string_view 00078 // represents the next location where the input `text` should be broken. 
00079 // 00080 // The returned absl::string_view may be zero-length if the Delimiter does not 00081 // represent a part of the string (e.g., a fixed-length delimiter). If no 00082 // delimiter is found in the input `text`, a zero-length absl::string_view 00083 // referring to `text.end()` should be returned (e.g., 00084 // `text.substr(text.size())`). It is important that the returned 00085 // absl::string_view always be within the bounds of the input `text` given as an 00086 // argument--it must not refer to a string that is physically located outside of 00087 // the given string. 00088 // 00089 // The following example is a simple Delimiter object that is created with a 00090 // single char and will look for that char in the text passed to the `Find()` 00091 // function: 00092 // 00093 // struct SimpleDelimiter { 00094 // const char c_; 00095 // explicit SimpleDelimiter(char c) : c_(c) {} 00096 // absl::string_view Find(absl::string_view text, size_t pos) { 00097 // auto found = text.find(c_, pos); 00098 // if (found == absl::string_view::npos) 00099 // return text.substr(text.size()); 00100 // 00101 // return text.substr(found, 1); 00102 // } 00103 // }; 00104 00105 // ByString 00106 // 00107 // A sub-string delimiter. If `StrSplit()` is passed a string in place of a 00108 // `Delimiter` object, the string will be implicitly converted into a 00109 // `ByString` delimiter. 00110 // 00111 // Example: 00112 // 00113 // // Because a string literal is converted to an `absl::ByString`, 00114 // // the following two splits are equivalent. 
00115 // 00116 // std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", "); 00117 // 00118 // using absl::ByString; 00119 // std::vector<std::string> v2 = absl::StrSplit("a, b, c", 00120 // ByString(", ")); 00121 // // v[0] == "a", v[1] == "b", v[2] == "c" 00122 class ByString { 00123 public: 00124 explicit ByString(absl::string_view sp); 00125 absl::string_view Find(absl::string_view text, size_t pos) const; 00126 00127 private: 00128 const std::string delimiter_; 00129 }; 00130 00131 // ByChar 00132 // 00133 // A single character delimiter. `ByChar` is functionally equivalent to a 00134 // 1-char string within a `ByString` delimiter, but slightly more efficient. 00135 // 00136 // Example: 00137 // 00138 // // Because a char literal is converted to a absl::ByChar, 00139 // // the following two splits are equivalent. 00140 // std::vector<std::string> v1 = absl::StrSplit("a,b,c", ','); 00141 // using absl::ByChar; 00142 // std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(',')); 00143 // // v[0] == "a", v[1] == "b", v[2] == "c" 00144 // 00145 // `ByChar` is also the default delimiter if a single character is given 00146 // as the delimiter to `StrSplit()`. For example, the following calls are 00147 // equivalent: 00148 // 00149 // std::vector<std::string> v = absl::StrSplit("a-b", '-'); 00150 // 00151 // using absl::ByChar; 00152 // std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-')); 00153 // 00154 class ByChar { 00155 public: 00156 explicit ByChar(char c) : c_(c) {} 00157 absl::string_view Find(absl::string_view text, size_t pos) const; 00158 00159 private: 00160 char c_; 00161 }; 00162 00163 // ByAnyChar 00164 // 00165 // A delimiter that will match any of the given byte-sized characters within 00166 // its provided string. 00167 // 00168 // Note: this delimiter works with single-byte string data, but does not work 00169 // with variable-width encodings, such as UTF-8. 
00170 // 00171 // Example: 00172 // 00173 // using absl::ByAnyChar; 00174 // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); 00175 // // v[0] == "a", v[1] == "b", v[2] == "c" 00176 // 00177 // If `ByAnyChar` is given the empty string, it behaves exactly like 00178 // `ByString` and matches each individual character in the input string. 00179 // 00180 class ByAnyChar { 00181 public: 00182 explicit ByAnyChar(absl::string_view sp); 00183 absl::string_view Find(absl::string_view text, size_t pos) const; 00184 00185 private: 00186 const std::string delimiters_; 00187 }; 00188 00189 // ByLength 00190 // 00191 // A delimiter for splitting into equal-length strings. The length argument to 00192 // the constructor must be greater than 0. 00193 // 00194 // Note: this delimiter works with single-byte string data, but does not work 00195 // with variable-width encodings, such as UTF-8. 00196 // 00197 // Example: 00198 // 00199 // using absl::ByLength; 00200 // std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3)); 00201 00202 // // v[0] == "123", v[1] == "456", v[2] == "789" 00203 // 00204 // Note that the string does not have to be a multiple of the fixed split 00205 // length. In such a case, the last substring will be shorter. 00206 // 00207 // using absl::ByLength; 00208 // std::vector<std::string> v = absl::StrSplit("12345", ByLength(2)); 00209 // 00210 // // v[0] == "12", v[1] == "34", v[2] == "5" 00211 class ByLength { 00212 public: 00213 explicit ByLength(ptrdiff_t length); 00214 absl::string_view Find(absl::string_view text, size_t pos) const; 00215 00216 private: 00217 const ptrdiff_t length_; 00218 }; 00219 00220 namespace strings_internal { 00221 00222 // A traits-like metafunction for selecting the default Delimiter object type 00223 // for a particular Delimiter type. The base case simply exposes type Delimiter 00224 // itself as the delimiter's Type. 
However, there are specializations for 00225 // string-like objects that map them to the ByString delimiter object. 00226 // This allows functions like absl::StrSplit() and absl::MaxSplits() to accept 00227 // string-like objects (e.g., ',') as delimiter arguments but they will be 00228 // treated as if a ByString delimiter was given. 00229 template <typename Delimiter> 00230 struct SelectDelimiter { 00231 using type = Delimiter; 00232 }; 00233 00234 template <> 00235 struct SelectDelimiter<char> { 00236 using type = ByChar; 00237 }; 00238 template <> 00239 struct SelectDelimiter<char*> { 00240 using type = ByString; 00241 }; 00242 template <> 00243 struct SelectDelimiter<const char*> { 00244 using type = ByString; 00245 }; 00246 template <> 00247 struct SelectDelimiter<absl::string_view> { 00248 using type = ByString; 00249 }; 00250 template <> 00251 struct SelectDelimiter<std::string> { 00252 using type = ByString; 00253 }; 00254 00255 // Wraps another delimiter and sets a max number of matches for that delimiter. 00256 template <typename Delimiter> 00257 class MaxSplitsImpl { 00258 public: 00259 MaxSplitsImpl(Delimiter delimiter, int limit) 00260 : delimiter_(delimiter), limit_(limit), count_(0) {} 00261 absl::string_view Find(absl::string_view text, size_t pos) { 00262 if (count_++ == limit_) { 00263 return absl::string_view(text.data() + text.size(), 00264 0); // No more matches. 00265 } 00266 return delimiter_.Find(text, pos); 00267 } 00268 00269 private: 00270 Delimiter delimiter_; 00271 const int limit_; 00272 int count_; 00273 }; 00274 00275 } // namespace strings_internal 00276 00277 // MaxSplits() 00278 // 00279 // A delimiter that limits the number of matches which can occur to the passed 00280 // `limit`. The last element in the returned collection will contain all 00281 // remaining unsplit pieces, which may contain instances of the delimiter. 00282 // The collection will contain at most `limit` + 1 elements. 
00283 // Example: 00284 // 00285 // using absl::MaxSplits; 00286 // std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1)); 00287 // 00288 // // v[0] == "a", v[1] == "b,c" 00289 template <typename Delimiter> 00290 inline strings_internal::MaxSplitsImpl< 00291 typename strings_internal::SelectDelimiter<Delimiter>::type> 00292 MaxSplits(Delimiter delimiter, int limit) { 00293 typedef 00294 typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType; 00295 return strings_internal::MaxSplitsImpl<DelimiterType>( 00296 DelimiterType(delimiter), limit); 00297 } 00298 00299 //------------------------------------------------------------------------------ 00300 // Predicates 00301 //------------------------------------------------------------------------------ 00302 // 00303 // Predicates filter the results of a `StrSplit()` by determining whether or not 00304 // a resultant element is included in the result set. A predicate may be passed 00305 // as an optional third argument to the `StrSplit()` function. 00306 // 00307 // Predicates are unary functions (or functors) that take a single 00308 // `absl::string_view` argument and return a bool indicating whether the 00309 // argument should be included (`true`) or excluded (`false`). 00310 // 00311 // Predicates are useful when filtering out empty substrings. By default, empty 00312 // substrings may be returned by `StrSplit()`, which is similar to the way split 00313 // functions work in other programming languages. 00314 00315 // AllowEmpty() 00316 // 00317 // Always returns `true`, indicating that all strings--including empty 00318 // strings--should be included in the split output. This predicate is not 00319 // strictly needed because this is the default behavior of `StrSplit()`; 00320 // however, it might be useful at some call sites to make the intent explicit. 
00321 // 00322 // Example: 00323 // 00324 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty()); 00325 // 00326 // // v[0] == " a ", v[1] == " ", v[2] == "", v[3] = "b", v[4] == "" 00327 struct AllowEmpty { 00328 bool operator()(absl::string_view) const { return true; } 00329 }; 00330 00331 // SkipEmpty() 00332 // 00333 // Returns `false` if the given `absl::string_view` is empty, indicating that 00334 // `StrSplit()` should omit the empty string. 00335 // 00336 // Example: 00337 // 00338 // std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty()); 00339 // 00340 // // v[0] == "a", v[1] == "b" 00341 // 00342 // Note: `SkipEmpty()` does not consider a string containing only whitespace 00343 // to be empty. To skip such whitespace as well, use the `SkipWhitespace()` 00344 // predicate. 00345 struct SkipEmpty { 00346 bool operator()(absl::string_view sp) const { return !sp.empty(); } 00347 }; 00348 00349 // SkipWhitespace() 00350 // 00351 // Returns `false` if the given `absl::string_view` is empty *or* contains only 00352 // whitespace, indicating that `StrSplit()` should omit the string. 
00353 // 00354 // Example: 00355 // 00356 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", 00357 // ',', SkipWhitespace()); 00358 // // v[0] == " a ", v[1] == "b" 00359 // 00360 // // SkipEmpty() would return whitespace elements 00361 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty()); 00362 // // v[0] == " a ", v[1] == " ", v[2] == "b" 00363 struct SkipWhitespace { 00364 bool operator()(absl::string_view sp) const { 00365 sp = absl::StripAsciiWhitespace(sp); 00366 return !sp.empty(); 00367 } 00368 }; 00369 00370 //------------------------------------------------------------------------------ 00371 // StrSplit() 00372 //------------------------------------------------------------------------------ 00373 00374 // StrSplit() 00375 // 00376 // Splits a given string based on the provided `Delimiter` object, returning the 00377 // elements within the type specified by the caller. Optionally, you may pass a 00378 // `Predicate` to `StrSplit()` indicating whether to include or exclude the 00379 // resulting element within the final result set. (See the overviews for 00380 // Delimiters and Predicates above.) 00381 // 00382 // Example: 00383 // 00384 // std::vector<std::string> v = absl::StrSplit("a,b,c,d", ','); 00385 // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d" 00386 // 00387 // You can also provide an explicit `Delimiter` object: 00388 // 00389 // Example: 00390 // 00391 // using absl::ByAnyChar; 00392 // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); 00393 // // v[0] == "a", v[1] == "b", v[2] == "c" 00394 // 00395 // See above for more information on delimiters. 00396 // 00397 // By default, empty strings are included in the result set. 
// You can optionally include a third `Predicate` argument to apply a test for
// whether the resultant element should be included in the result set:
//
// Example:
//
//   std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
//                                               ',', SkipWhitespace());
//   // v[0] == " a ", v[1] == "b"
//
// See above for more information on predicates.
//
//------------------------------------------------------------------------------
// StrSplit() Return Types
//------------------------------------------------------------------------------
//
// The `StrSplit()` function adapts the returned collection to the collection
// specified by the caller (e.g. `std::vector` above). The returned collections
// may contain `std::string`, `absl::string_view` (in which case the caller
// must ensure that the original string being split outlives the collection),
// or any object that can be explicitly created from an `absl::string_view`.
// This behavior works for:
//
// 1) All standard STL containers including `std::vector`, `std::list`,
//    `std::deque`, `std::set`, `std::multiset`, `std::map`, and `std::multimap`
// 2) `std::pair` (which is not actually a container). See below.
//
// Example:
//
//   // The results are returned as `absl::string_view` objects. Note that we
//   // have to ensure that the input string outlives any results.
//   std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
//
//   // Stores results in a std::set<std::string>, which also performs
//   // de-duplication and orders the elements in ascending order.
//   std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
//   // a == {"a", "b", "c"}
//
//   // `StrSplit()` can be used within a range-based for loop, in which case
//   // each element will be of type `absl::string_view`.
//   std::vector<std::string> v;
//   for (const auto sv : absl::StrSplit("a,b,c", ',')) {
//     if (sv != "b") v.emplace_back(sv);
//   }
//   // v[0] == "a", v[1] == "c"
//
//   // Stores results in a map. The map implementation assumes that the input
//   // is provided as a series of key/value pairs. For example, the 0th element
//   // resulting from the split will be stored as a key to the 1st element. If
//   // an odd number of elements are resolved, the last element is paired with
//   // a default-constructed value (e.g., empty string).
//   std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
//   // m["a"] == "b", m["c"] == ""     // last component value equals ""
//
// Splitting to `std::pair` is an interesting case because it can hold only two
// elements and is not a collection type. When splitting to a `std::pair` the
// first two split strings become the `std::pair` `.first` and `.second`
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
//
// Example:
//
//   // Stores first two split strings as the members in a std::pair.
//   std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
//   // p.first == "a", p.second == "b"       // "c" is omitted.
//
// The `StrSplit()` function can be used multiple times to perform more
// complicated splitting logic, such as intelligently parsing key-value pairs.
00467 // 00468 // Example: 00469 // 00470 // // The input string "a=b=c,d=e,f=,g" becomes 00471 // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } 00472 // std::map<std::string, std::string> m; 00473 // for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) { 00474 // m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1))); 00475 // } 00476 // EXPECT_EQ("b=c", m.find("a")->second); 00477 // EXPECT_EQ("e", m.find("d")->second); 00478 // EXPECT_EQ("", m.find("f")->second); 00479 // EXPECT_EQ("", m.find("g")->second); 00480 // 00481 // WARNING: Due to a legacy bug that is maintained for backward compatibility, 00482 // splitting the following empty string_views produces different results: 00483 // 00484 // absl::StrSplit(absl::string_view(""), '-'); // {""} 00485 // absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""} 00486 // 00487 // Try not to depend on this distinction because the bug may one day be fixed. 00488 template <typename Delimiter> 00489 strings_internal::Splitter< 00490 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty> 00491 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) { 00492 using DelimiterType = 00493 typename strings_internal::SelectDelimiter<Delimiter>::type; 00494 return strings_internal::Splitter<DelimiterType, AllowEmpty>( 00495 std::move(text), DelimiterType(d), AllowEmpty()); 00496 } 00497 00498 template <typename Delimiter, typename Predicate> 00499 strings_internal::Splitter< 00500 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate> 00501 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d, 00502 Predicate p) { 00503 using DelimiterType = 00504 typename strings_internal::SelectDelimiter<Delimiter>::type; 00505 return strings_internal::Splitter<DelimiterType, Predicate>( 00506 std::move(text), DelimiterType(d), std::move(p)); 00507 } 00508 00509 } // namespace absl 00510 00511 #endif // ABSL_STRINGS_STR_SPLIT_H_