2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
20 //-----------------------------------------------------------------------
22 // File: StringUtils.cpp
24 // Purpose: ATL split string utility
25 // Author: Paul J. Weiss
27 // Modified to use J O'Leary's CStdString class by kraqh3d
29 //------------------------------------------------------------------------
32 #include "StringUtils.h"
33 #include "utils/RegExp.h"
34 #include "utils/fstrcmp.h"
41 #define FORMAT_BLOCK_SIZE 2048 // # of bytes to increment per try
45 const char* ADDON_GUID_RE = "^(\\{){0,1}[0-9a-fA-F]{8}\\-[0-9a-fA-F]{4}\\-[0-9a-fA-F]{4}\\-[0-9a-fA-F]{4}\\-[0-9a-fA-F]{12}(\\}){0,1}$";
47 /* empty string for use in returns by ref */
48 const CStdString StringUtils::EmptyString = "";
49 const std::string StringUtils::Empty = "";
50 CStdString StringUtils::m_lastUUID = "";
52 string StringUtils::Format(const char *fmt, ...)
56 string str = FormatV(fmt, args);
62 string StringUtils::FormatV(const char *fmt, va_list args)
67 int size = FORMAT_BLOCK_SIZE;
70 char *cstr = reinterpret_cast<char*>(malloc(sizeof(char) * size));
76 va_copy(argCopy, args);
78 int nActual = vsnprintf(cstr, size, fmt, argCopy);
81 if (nActual > -1 && nActual < size) // We got a valid result
83 string str(cstr, nActual);
87 if (nActual > -1) // Exactly what we will need (glibc 2.1)
89 else // Let's try to double the size (glibc 2.0)
92 char *new_cstr = reinterpret_cast<char*>(realloc(cstr, sizeof(char) * size));
106 wstring StringUtils::Format(const wchar_t *fmt, ...)
110 wstring str = FormatV(fmt, args);
116 wstring StringUtils::FormatV(const wchar_t *fmt, va_list args)
121 int size = FORMAT_BLOCK_SIZE;
124 wchar_t *cstr = reinterpret_cast<wchar_t*>(malloc(sizeof(wchar_t) * size));
130 va_copy(argCopy, args);
132 int nActual = vswprintf(cstr, size, fmt, argCopy);
135 if (nActual > -1 && nActual < size) // We got a valid result
137 wstring str(cstr, nActual);
141 if (nActual > -1) // Exactly what we will need (glibc 2.1)
143 else // Let's try to double the size (glibc 2.0)
146 wchar_t *new_cstr = reinterpret_cast<wchar_t*>(realloc(cstr, sizeof(wchar_t) * size));
147 if (new_cstr == NULL)
159 void StringUtils::ToUpper(string &str)
161 transform(str.begin(), str.end(), str.begin(), ::toupper);
164 void StringUtils::ToUpper(wstring &str)
166 transform(str.begin(), str.end(), str.begin(), ::towupper);
169 void StringUtils::ToLower(string &str)
171 transform(str.begin(), str.end(), str.begin(), ::tolower);
174 void StringUtils::ToLower(wstring &str)
176 transform(str.begin(), str.end(), str.begin(), ::towlower);
179 bool StringUtils::EqualsNoCase(const std::string &str1, const std::string &str2)
181 return EqualsNoCase(str1.c_str(), str2.c_str());
184 bool StringUtils::EqualsNoCase(const std::string &str1, const char *s2)
186 return EqualsNoCase(str1.c_str(), s2);
189 bool StringUtils::EqualsNoCase(const char *s1, const char *s2)
191 char c2; // we need only one char outside the loop
194 const char c1 = *s1++; // const local variable should help compiler to optimize
196 if (c1 != c2 && ::tolower(c1) != ::tolower(c2)) // This includes the possibility that one of the characters is the null-terminator, which implies a string mismatch.
198 } while (c2 != '\0'); // At this point, we know c1 == c2, so there's no need to test them both.
202 int StringUtils::CompareNoCase(const std::string &str1, const std::string &str2)
204 return CompareNoCase(str1.c_str(), str2.c_str());
207 int StringUtils::CompareNoCase(const char *s1, const char *s2)
209 char c2; // we need only one char outside the loop
212 const char c1 = *s1++; // const local variable should help compiler to optimize
214 if (c1 != c2 && ::tolower(c1) != ::tolower(c2)) // This includes the possibility that one of the characters is the null-terminator, which implies a string mismatch.
215 return ::tolower(c1) - ::tolower(c2);
216 } while (c2 != '\0'); // At this point, we know c1 == c2, so there's no need to test them both.
220 string StringUtils::Left(const string &str, size_t count)
222 count = max((size_t)0, min(count, str.size()));
223 return str.substr(0, count);
226 string StringUtils::Mid(const string &str, size_t first, size_t count /* = string::npos */)
228 if (first + count > str.size())
229 count = str.size() - first;
231 if (first > str.size())
234 ASSERT(first + count <= str.size());
236 return str.substr(first, count);
239 string StringUtils::Right(const string &str, size_t count)
241 count = max((size_t)0, min(count, str.size()));
242 return str.substr(str.size() - count);
245 std::string& StringUtils::Trim(std::string &str)
248 return TrimRight(str);
// hack to ensure that a char taken from a std::string is passed to isspace as
// an _unsigned_ char; without it the "TrimX" functions failed on Win32 with
// UTF-8 strings
static int isspace_c(char c)
{
  return ::isspace((unsigned char)c);
}
258 std::string& StringUtils::TrimLeft(std::string &str)
260 str.erase(str.begin(), ::find_if(str.begin(), str.end(), ::not1(::ptr_fun(isspace_c))));
264 std::string& StringUtils::TrimLeft(std::string &str, const std::string& chars)
266 size_t nidx = str.find_first_not_of(chars);
271 std::string& StringUtils::TrimRight(std::string &str)
273 str.erase(::find_if(str.rbegin(), str.rend(), ::not1(::ptr_fun(isspace_c))).base(), str.end());
277 std::string& StringUtils::TrimRight(std::string &str, const std::string& chars)
279 size_t nidx = str.find_last_not_of(chars);
280 str.erase(str.npos == nidx ? 0 : ++nidx);
// Walks str with an iterator while tracking, in onSpace, a boolean state
// across iterations.
// NOTE(review): the loop body is not part of this listing; going by the
// function name it presumably collapses runs of spaces/tabs into a single
// space and returns str - confirm against the full source.
284 std::string& StringUtils::RemoveDuplicatedSpacesAndTabs(std::string& str)
286 std::string::iterator it = str.begin();
287 bool onSpace = false;
288 while(it != str.end())
311 int StringUtils::Replace(string &str, char oldChar, char newChar)
313 int replacedChars = 0;
314 for (string::iterator it = str.begin(); it != str.end(); it++)
323 return replacedChars;
326 int StringUtils::Replace(std::string &str, const std::string &oldStr, const std::string &newStr)
331 int replacedChars = 0;
334 while (index < str.size() && (index = str.find(oldStr, index)) != string::npos)
336 str.replace(index, oldStr.size(), newStr);
337 index += newStr.size();
341 return replacedChars;
344 int StringUtils::Replace(std::wstring &str, const std::wstring &oldStr, const std::wstring &newStr)
349 int replacedChars = 0;
352 while (index < str.size() && (index = str.find(oldStr, index)) != string::npos)
354 str.replace(index, oldStr.size(), newStr);
355 index += newStr.size();
359 return replacedChars;
362 bool StringUtils::StartsWith(const std::string &str1, const std::string &str2)
364 return str1.compare(0, str2.size(), str2) == 0;
367 bool StringUtils::StartsWith(const std::string &str1, const char *s2)
369 return StartsWith(str1.c_str(), s2);
// Raw C-string overload; StartsWith(const std::string&, const char*)
// delegates to it.
// NOTE(review): the body is not part of this listing - presumably it returns
// true when s1 begins with s2 (case-sensitive); confirm against the full source.
372 bool StringUtils::StartsWith(const char *s1, const char *s2)
384 bool StringUtils::StartsWithNoCase(const std::string &str1, const std::string &str2)
386 return StartsWithNoCase(str1.c_str(), str2.c_str());
389 bool StringUtils::StartsWithNoCase(const std::string &str1, const char *s2)
391 return StartsWithNoCase(str1.c_str(), s2);
394 bool StringUtils::StartsWithNoCase(const char *s1, const char *s2)
398 if (::tolower(*s1) != ::tolower(*s2))
406 bool StringUtils::EndsWith(const std::string &str1, const std::string &str2)
408 if (str1.size() < str2.size())
410 return str1.compare(str1.size() - str2.size(), str2.size(), str2) == 0;
413 bool StringUtils::EndsWith(const std::string &str1, const char *s2)
415 size_t len2 = strlen(s2);
416 if (str1.size() < len2)
418 return str1.compare(str1.size() - len2, len2, s2) == 0;
421 bool StringUtils::EndsWithNoCase(const std::string &str1, const std::string &str2)
423 if (str1.size() < str2.size())
425 const char *s1 = str1.c_str() + str1.size() - str2.size();
426 const char *s2 = str2.c_str();
429 if (::tolower(*s1) != ::tolower(*s2))
437 bool StringUtils::EndsWithNoCase(const std::string &str1, const char *s2)
439 size_t len2 = strlen(s2);
440 if (str1.size() < len2)
442 const char *s1 = str1.c_str() + str1.size() - len2;
445 if (::tolower(*s1) != ::tolower(*s2))
453 void StringUtils::JoinString(const CStdStringArray &strings, const CStdString& delimiter, CStdString& result)
456 for(CStdStringArray::const_iterator it = strings.begin(); it != strings.end(); it++ )
457 result += (*it) + delimiter;
460 result.erase(result.size()-delimiter.size(), delimiter.size());
463 CStdString StringUtils::JoinString(const CStdStringArray &strings, const CStdString& delimiter)
466 JoinString(strings, delimiter, result);
470 CStdString StringUtils::Join(const vector<string> &strings, const CStdString& delimiter)
472 CStdStringArray strArray;
473 for (unsigned int index = 0; index < strings.size(); index++)
474 strArray.push_back(strings.at(index));
476 return JoinString(strArray, delimiter);
479 // Splits the string input into pieces delimited by delimiter.
480 // if 2 delimiters are in a row, it will include the empty string between them.
481 // added MaxStrings parameter to restrict the number of returned substrings (like perl and python)
482 int StringUtils::SplitString(const CStdString& input, const CStdString& delimiter, CStdStringArray &results, unsigned int iMaxStrings /* = 0 */)
484 size_t iPos = std::string::npos;
485 size_t newPos = std::string::npos;
486 size_t sizeS2 = delimiter.size();
487 size_t isize = input.size();
491 vector<unsigned int> positions;
493 newPos = input.find(delimiter, 0);
495 if (newPos == std::string::npos)
497 results.push_back(input);
501 while (newPos != std::string::npos)
503 positions.push_back(newPos);
505 newPos = input.find(delimiter, iPos + sizeS2);
508 // numFound is the number of delimiters which is one less
509 // than the number of substrings
510 unsigned int numFound = positions.size();
511 if (iMaxStrings > 0 && numFound >= iMaxStrings)
512 numFound = iMaxStrings - 1;
514 for ( unsigned int i = 0; i <= numFound; i++ )
522 s = input.substr(i, positions[i]);
526 size_t offset = positions[i - 1] + sizeS2;
527 if ( offset < isize )
530 s = input.substr(offset);
532 s = input.substr( positions[i - 1] + sizeS2,
533 positions[i] - positions[i - 1] - sizeS2 );
536 results.push_back(s);
538 // return the number of substrings
539 return results.size();
542 CStdStringArray StringUtils::SplitString(const CStdString& input, const CStdString& delimiter, unsigned int iMaxStrings /* = 0 */)
544 CStdStringArray result;
545 SplitString(input, delimiter, result, iMaxStrings);
549 vector<string> StringUtils::Split(const std::string& input, const std::string& delimiter, unsigned int iMaxStrings /* = 0 */)
551 CStdStringArray result;
552 SplitString(input, delimiter, result, iMaxStrings);
554 vector<string> strArray;
555 for (unsigned int index = 0; index < result.size(); index++)
556 strArray.push_back(result.at(index));
561 // returns the number of occurrences of strFind in strInput.
562 int StringUtils::FindNumber(const CStdString& strInput, const CStdString &strFind)
564 size_t pos = strInput.find(strFind, 0);
566 while (pos != std::string::npos)
569 pos = strInput.find(strFind, pos + 1);
574 // Compares separately the numeric and alphabetic parts of a string.
575 // returns negative if left < right, positive if left > right
576 // and 0 if they are identical (essentially calculates left - right)
577 int64_t StringUtils::AlphaNumericCompare(const wchar_t *left, const wchar_t *right)
579 wchar_t *l = (wchar_t *)left;
580 wchar_t *r = (wchar_t *)right;
584 const collate<wchar_t>& coll = use_facet< collate<wchar_t> >( locale() );
586 while (*l != 0 && *r != 0)
588 // check if we have a numerical value
589 if (*l >= L'0' && *l <= L'9' && *r >= L'0' && *r <= L'9')
593 while (*ld >= L'0' && *ld <= L'9' && ld < l + 15)
594 { // compare only up to 15 digits
600 while (*rd >= L'0' && *rd <= L'9' && rd < r + 15)
601 { // compare only up to 15 digits
603 rnum += *rd++ - L'0';
605 // do we have numbers?
607 { // yes - and they're different!
614 // do case less comparison
616 if (lc >= L'A' && lc <= L'Z')
619 if (rc >= L'A' && rc <= L'Z')
622 // ok, do a normal comparison, taking current locale into account. Add special case stuff (eg '(' characters)) in here later
623 if ((cmp_res = coll.compare(&lc, &lc + 1, &rc, &rc + 1)) != 0)
637 return 0; // files are the same
640 int StringUtils::DateStringToYYYYMMDD(const CStdString &dateString)
642 CStdStringArray days;
643 int splitCount = StringUtils::SplitString(dateString, "-", days);
645 return atoi(days[0].c_str());
646 else if (splitCount == 2)
647 return atoi(days[0].c_str())*100+atoi(days[1].c_str());
648 else if (splitCount == 3)
649 return atoi(days[0].c_str())*10000+atoi(days[1].c_str())*100+atoi(days[2].c_str());
654 long StringUtils::TimeStringToSeconds(const CStdString &timeString)
656 CStdString strCopy(timeString);
657 StringUtils::Trim(strCopy);
658 if(StringUtils::EndsWithNoCase(strCopy, " min"))
660 // this is imdb format of "XXX min"
661 return 60 * atoi(strCopy.c_str());
665 CStdStringArray secs;
666 StringUtils::SplitString(strCopy, ":", secs);
668 for (unsigned int i = 0; i < 3 && i < secs.size(); i++)
671 timeInSecs += atoi(secs[i]);
677 CStdString StringUtils::SecondsToTimeString(long lSeconds, TIME_FORMAT format)
679 int hh = lSeconds / 3600;
680 lSeconds = lSeconds % 3600;
681 int mm = lSeconds / 60;
682 int ss = lSeconds % 60;
684 if (format == TIME_FORMAT_GUESS)
685 format = (hh >= 1) ? TIME_FORMAT_HH_MM_SS : TIME_FORMAT_MM_SS;
687 if (format & TIME_FORMAT_HH)
688 strHMS += StringUtils::Format("%02.2i", hh);
689 else if (format & TIME_FORMAT_H)
690 strHMS += StringUtils::Format("%i", hh);
691 if (format & TIME_FORMAT_MM)
692 strHMS += StringUtils::Format(strHMS.empty() ? "%02.2i" : ":%02.2i", mm);
693 if (format & TIME_FORMAT_SS)
694 strHMS += StringUtils::Format(strHMS.empty() ? "%02.2i" : ":%02.2i", ss);
698 bool StringUtils::IsNaturalNumber(const CStdString& str)
701 // allow whitespace,digits,whitespace
702 while (i < str.size() && isspace((unsigned char) str[i]))
704 while (i < str.size() && isdigit((unsigned char) str[i]))
708 while (i < str.size() && isspace((unsigned char) str[i]))
710 return i == str.size() && n > 0;
713 bool StringUtils::IsInteger(const CStdString& str)
716 // allow whitespace,-,digits,whitespace
717 while (i < str.size() && isspace((unsigned char) str[i]))
719 if (i < str.size() && str[i] == '-')
721 while (i < str.size() && isdigit((unsigned char) str[i]))
725 while (i < str.size() && isspace((unsigned char) str[i]))
727 return i == str.size() && n > 0;
730 void StringUtils::RemoveCRLF(CStdString& strLine)
732 StringUtils::TrimRight(strLine, "\n\r");
735 CStdString StringUtils::SizeToString(int64_t size)
738 const char prefixes[] = {' ','k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'};
740 double s = (double)size;
741 while (i < sizeof(prefixes)/sizeof(prefixes[0]) && s >= 1000.0)
748 strLabel = StringUtils::Format("%.0lf %cB ", s, prefixes[i]);
750 strLabel = StringUtils::Format("%.1lf %cB", s, prefixes[i]);
752 strLabel = StringUtils::Format("%.2lf %cB", s, prefixes[i]);
// Checks whether the NUL-terminated UTF-8 sequence at str starts with a
// Latin letter.
// return -1 if not, else return the utf8 char length (1 for ASCII letters,
// 2 for the recognised two-byte sequences).
int IsUTF8Letter(const unsigned char *str)
{
  // reference:
  // unicode -> utf8 table: http://www.utf8-chartable.de/
  // latin characters in unicode: http://en.wikipedia.org/wiki/Latin_characters_in_Unicode
  unsigned char ch = str[0];
  if (!ch)
    return -1;
  if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
    return 1;
  // any other single-byte (7-bit) character is not a letter
  if (!(ch & 0x80))
    return -1;
  unsigned char ch2 = str[1];
  if (!ch2)
    return -1;
  // check latin 1 letter table: http://en.wikipedia.org/wiki/C1_Controls_and_Latin-1_Supplement
  // (0xC3 0x97 and 0xC3 0xB7 are the multiplication and division signs)
  if (ch == 0xC3 && ch2 >= 0x80 && ch2 <= 0xBF && ch2 != 0x97 && ch2 != 0xB7)
    return 2;
  // check latin extended A table: http://en.wikipedia.org/wiki/Latin_Extended-A
  if (ch >= 0xC4 && ch <= 0xC7 && ch2 >= 0x80 && ch2 <= 0xBF)
    return 2;
  // check latin extended B table: http://en.wikipedia.org/wiki/Latin_Extended-B
  // and International Phonetic Alphabet: http://en.wikipedia.org/wiki/IPA_Extensions_(Unicode_block)
  if (((ch == 0xC8 || ch == 0xC9) && ch2 >= 0x80 && ch2 <= 0xBF)
      || (ch == 0xCA && ch2 >= 0x80 && ch2 <= 0xAF))
    return 2;
  return -1;
}
787 size_t StringUtils::FindWords(const char *str, const char *wordLowerCase)
789 // NOTE: This assumes word is lowercase!
790 unsigned char *s = (unsigned char *)str;
793 // start with a compare
794 unsigned char *c = s;
795 unsigned char *w = (unsigned char *)wordLowerCase;
797 while (same && *c && *w)
799 unsigned char lc = *c++;
800 if (lc >= 'A' && lc <= 'Z')
803 if (lc != *w++) // different
806 if (same && *w == 0) // only the same if word has been exhausted
807 return (const char *)s - str;
809 // otherwise, skip current word (composed by latin letters) or number
811 if (*s >= '0' && *s <= '9')
814 while (*s >= '0' && *s <= '9') ++s;
816 else if ((l = IsUTF8Letter(s)) > 0)
819 while ((l = IsUTF8Letter(s)) > 0) s += l;
823 while (*s && *s == ' ') s++;
825 // and repeat until we're done
828 return CStdString::npos;
831 // assumes it is called from after the first open bracket is found
832 int StringUtils::FindEndBracket(const CStdString &str, char opener, char closer, int startPos)
835 for (unsigned int i = startPos; i < str.size(); i++)
837 if (str[i] == opener)
839 else if (str[i] == closer)
847 return (int)CStdString::npos;
850 void StringUtils::WordToDigits(CStdString &word)
852 static const char word_to_letter[] = "22233344455566677778889999";
853 StringUtils::ToLower(word);
854 for (unsigned int i = 0; i < word.size(); ++i)
855 { // NB: This assumes ascii, which probably needs extending at some point.
856 char letter = word[i];
857 if ((letter >= 'a' && letter <= 'z')) // assume contiguous letter range
859 word[i] = word_to_letter[letter-'a'];
861 else if (letter < '0' || letter > '9') // We want to keep 0-9!
863 word[i] = ' '; // replace everything else with a space
868 CStdString StringUtils::CreateUUID()
870 /* This function generate a DCE 1.1, ISO/IEC 11578:1996 and IETF RFC-4122
871 * Version 4 conform local unique UUID based upon random number generation.
874 char *pUuidStr = UuidStrTmp;
877 static bool m_uuidInitialized = false;
878 if (!m_uuidInitialized)
880 /* use current time as the seed for rand()*/
882 m_uuidInitialized = true;
885 /*Data1 - 8 characters.*/
886 for(i = 0; i < 8; i++, pUuidStr++)
887 ((*pUuidStr = (rand() % 16)) < 10) ? *pUuidStr += 48 : *pUuidStr += 55;
889 /*Data2 - 4 characters.*/
891 for(i = 0; i < 4; i++, pUuidStr++)
892 ((*pUuidStr = (rand() % 16)) < 10) ? *pUuidStr += 48 : *pUuidStr += 55;
894 /*Data3 - 4 characters.*/
896 for(i = 0; i < 4; i++, pUuidStr++)
897 ((*pUuidStr = (rand() % 16)) < 10) ? *pUuidStr += 48 : *pUuidStr += 55;
899 /*Data4 - 4 characters.*/
901 for(i = 0; i < 4; i++, pUuidStr++)
902 ((*pUuidStr = (rand() % 16)) < 10) ? *pUuidStr += 48 : *pUuidStr += 55;
904 /*Data5 - 12 characters.*/
906 for(i = 0; i < 12; i++, pUuidStr++)
907 ((*pUuidStr = (rand() % 16)) < 10) ? *pUuidStr += 48 : *pUuidStr += 55;
911 m_lastUUID = UuidStrTmp;
915 bool StringUtils::ValidateUUID(const CStdString &uuid)
918 guidRE.RegComp(ADDON_GUID_RE);
919 return (guidRE.RegFind(uuid.c_str()) == 0);
922 double StringUtils::CompareFuzzy(const CStdString &left, const CStdString &right)
924 return (0.5 + fstrcmp(left.c_str(), right.c_str(), 0.0) * (left.length() + right.length())) / 2.0;
927 int StringUtils::FindBestMatch(const CStdString &str, const CStdStringArray &strings, double &matchscore)
933 for (CStdStringArray::const_iterator it = strings.begin(); it != strings.end(); it++, i++)
935 int maxlength = max(str.length(), it->length());
936 double score = StringUtils::CompareFuzzy(str, *it) / maxlength;
937 if (score > matchscore)
946 bool StringUtils::ContainsKeyword(const CStdString &str, const CStdStringArray &keywords)
948 for (CStdStringArray::const_iterator it = keywords.begin(); it != keywords.end(); it++)
950 if (str.find(*it) != str.npos)
956 size_t StringUtils::utf8_strlen(const char *s)
961 if ((*s++ & 0xC0) != 0x80)
967 std::string StringUtils::Paramify(const std::string ¶m)
969 std::string result = param;
971 StringUtils::Replace(result, "\\", "\\\\");
972 // escape double quotes
973 StringUtils::Replace(result, "\"", "\\\"");
975 // add double quotes around the whole string
976 return "\"" + result + "\"";
979 void StringUtils::Tokenize(const std::string& input, std::vector<std::string>& tokens, const std::string& delimiters)
981 // Tokenize ripped from http://www.linuxselfhelp.com/HOWTO/C++Programming-HOWTO-7.html
982 // Skip delimiters at beginning.
983 string::size_type lastPos = input.find_first_not_of(delimiters, 0);
984 // Find first "non-delimiter".
985 string::size_type pos = input.find_first_of(delimiters, lastPos);
987 while (string::npos != pos || string::npos != lastPos)
989 // Found a token, add it to the vector.
990 tokens.push_back(input.substr(lastPos, pos - lastPos));
991 // Skip delimiters. Note the "not_of"
992 lastPos = input.find_first_not_of(delimiters, pos);
993 // Find next "non-delimiter"
994 pos = input.find_first_of(delimiters, lastPos);