00001 /*========================================================================= 00002 00003 Program: Visualization Toolkit 00004 Module: vtkTokenizer.h 00005 00006 Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen 00007 All rights reserved. 00008 See Copyright.txt or http://www.kitware.com/Copyright.htm for details. 00009 00010 This software is distributed WITHOUT ANY WARRANTY; without even 00011 the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 00012 PURPOSE. See the above copyright notice for more information. 00013 00014 =========================================================================*/ 00015 /*------------------------------------------------------------------------- 00016 Copyright 2008 Sandia Corporation. 00017 Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 00018 the U.S. Government retains certain rights in this software. 00019 -------------------------------------------------------------------------*/ 00020 00081 #ifndef __vtkTokenizer_h 00082 #define __vtkTokenizer_h 00083 00084 #include <vtkTableAlgorithm.h> 00085 #include <vtkUnicodeString.h> //Needed for delimiter specification 00086 00087 class VTK_TEXT_ANALYSIS_EXPORT vtkTokenizer : 00088 public vtkTableAlgorithm 00089 { 00090 public: 00091 static vtkTokenizer* New(); 00092 vtkTypeMacro(vtkTokenizer, vtkTableAlgorithm); 00093 void PrintSelf(ostream& os, vtkIndent indent); 00094 00095 //BTX 00097 00099 typedef vtkstd::pair<vtkUnicodeString::value_type, vtkUnicodeString::value_type> DelimiterRange; 00100 // Description: 00101 // Defines storage for a collection of half-open ranges of Unicode characters. 00102 typedef vtkstd::vector<DelimiterRange> DelimiterRanges; 00104 00106 00108 static const DelimiterRanges Punctuation(); 00109 // Description: 00110 // Returns a set of delimiter ranges that match Unicode whitespace codepoints. 00111 static const DelimiterRanges Whitespace(); 00112 // Description: 00113 // Returns a set of delimiter ranges that match logosyllabic languages where characters represent 00114 // words instead of sounds, such as Chinese, Japanese, and Korean. 00115 static const DelimiterRanges Logosyllabic(); 00117 00119 00121 void AddDroppedDelimiters(vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end); 00122 // Description: 00123 // Adds a collection of delimiter ranges to the set of "dropped" delimiters. 00124 void AddDroppedDelimiters(const DelimiterRanges& ranges); 00126 00128 00130 void AddKeptDelimiters(vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end); 00131 // Description: 00132 // Adds a collection of delimiter ranges to the set of "kept" delimiters. 00133 void AddKeptDelimiters(const DelimiterRanges& ranges); 00134 //ETX 00136 00138 00142 void DropPunctuation(); 00143 void DropWhitespace(); 00144 void KeepPunctuation(); 00145 void KeepWhitespace(); 00146 void KeepLogosyllabic(); 00148 00150 00151 void ClearDroppedDelimiters(); 00152 // Description: 00153 // Clears the set of "kept" delimiters. 00154 void ClearKeptDelimiters(); 00156 00157 //BTX 00158 protected: 00159 vtkTokenizer(); 00160 ~vtkTokenizer(); 00161 00162 int FillInputPortInformation(int port, vtkInformation* info); 00163 00164 virtual int RequestData( 00165 vtkInformation* request, 00166 vtkInformationVector** inputVector, 00167 vtkInformationVector* outputVector); 00168 00169 private: 00170 vtkTokenizer(const vtkTokenizer &); // Not implemented. 00171 void operator=(const vtkTokenizer &); // Not implemented. 00172 00173 class Internals; 00174 Internals* const Implementation; 00175 //ETX 00176 }; 00177 00178 #endif // __vtkTokenizer_h 00179