View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.tika.utils;
19  
20  // JDK imports
21  import java.util.HashMap;
22  import java.nio.charset.Charset;
23  
/**
 * A collection of String processing utility methods.
 */
public class StringUtil {

    /** Lower-case hexadecimal digits, indexed by nibble value (0-15). */
    private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

    /**
     * The following map is not an alias mapping table, but maps character
     * encodings which are often used in mislabelled documents to their correct
     * encodings. For instance, there are a lot of documents labelled
     * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
     * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
     * (sharing code points for the common part), it's better to treat
     * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
     * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
     * <p>
     * Keys are canonical charset names as reported by {@link Charset#name()}.
     * The map is populated once in the static initializer and only read
     * afterwards.
     */
    private static final HashMap<String, String> ENCODING_ALIASES =
            new HashMap<String, String>();

    static {
        ENCODING_ALIASES.put("ISO-8859-1", "windows-1252");
        ENCODING_ALIASES.put("EUC-KR", "x-windows-949");
        ENCODING_ALIASES.put("x-EUC-CN", "GB18030");
        ENCODING_ALIASES.put("GBK", "GB18030");
        // Mappings that are intentionally not enabled:
        //   Big5        -> Big5HKSCS
        //   TIS620      -> Cp874
        //   ISO-8859-11 -> Cp874
    }

    /**
     * Returns a copy of <code>s</code> padded with trailing spaces so that
     * its length is <code>length</code>. Strings already
     * <code>length</code> characters long or longer are not altered.
     *
     * @param s
     *            the string to pad; must not be <code>null</code>
     * @param length
     *            the minimum length of the result
     * @return the padded string
     * @throws NullPointerException
     *             if <code>s</code> is <code>null</code>
     */
    public static String rightPad(String s, int length) {
        StringBuilder sb = new StringBuilder(s);
        // Compare directly instead of computing length - s.length(), which
        // would overflow for very negative lengths.
        for (int i = s.length(); i < length; i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Returns a copy of <code>s</code> padded with leading spaces so that
     * its length is <code>length</code>. Strings already
     * <code>length</code> characters long or longer are not altered.
     *
     * @param s
     *            the string to pad; must not be <code>null</code>
     * @param length
     *            the minimum length of the result
     * @return the padded string
     * @throws NullPointerException
     *             if <code>s</code> is <code>null</code>
     */
    public static String leftPad(String s, int length) {
        StringBuilder sb = new StringBuilder();
        // Compare directly instead of computing length - s.length(), which
        // would overflow for very negative lengths.
        for (int i = s.length(); i < length; i++) {
            sb.append(' ');
        }
        sb.append(s);
        return sb.toString();
    }

    /**
     * Convenience call for {@link #toHexString(byte[], String, int)}, where
     * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
     *
     * @param buf
     *            input data, may be <code>null</code>
     * @return the hexadecimal representation of <code>buf</code> without any
     *         separators or line breaks, or <code>null</code> if
     *         <code>buf</code> is <code>null</code>
     */
    public static String toHexString(byte[] buf) {
        return toHexString(buf, null, Integer.MAX_VALUE);
    }

    /**
     * Get a text representation of a byte[] as hexadecimal String, where each
     * pair of hexadecimal digits corresponds to consecutive bytes in the array.
     * Digits <code>a</code>-<code>f</code> are written in lower case. No
     * separator or line break is written before the first byte or after the
     * last one.
     *
     * @param buf
     *            input data
     * @param sep
     *            separate every pair of hexadecimal digits on the same line
     *            with this separator, or null if no separation is needed.
     * @param lineLen
     *            break the output String into lines containing output for
     *            lineLen bytes; values less than or equal to zero disable
     *            line breaking.
     * @return the hexadecimal text, or <code>null</code> if <code>buf</code>
     *         is <code>null</code>
     */
    public static String toHexString(byte[] buf, String sep, int lineLen) {
        if (buf == null) {
            return null;
        }
        if (lineLen <= 0) {
            lineLen = Integer.MAX_VALUE;
        }
        StringBuilder res = new StringBuilder(buf.length * 2);
        for (int i = 0; i < buf.length; i++) {
            // Write the delimiter *before* every byte except the first, so
            // that each line holds exactly lineLen bytes and nothing trails
            // the last byte.
            if (i > 0) {
                if (i % lineLen == 0) {
                    res.append('\n');
                } else if (sep != null) {
                    res.append(sep);
                }
            }
            int b = buf[i];
            res.append(HEX_DIGITS[(b >> 4) & 0xf]);
            res.append(HEX_DIGITS[b & 0xf]);
        }
        return res.toString();
    }

    /**
     * Convert a String containing consecutive (no inside whitespace)
     * hexadecimal digits into a corresponding byte array. If the number of
     * digits is not even, a '0' will be prepended to the String prior to
     * conversion. Leading and trailing whitespace is ignored. Both upper and
     * lower case digits are accepted.
     *
     * @param text
     *            input text
     * @return converted byte array, or null if unable to convert (the input
     *         is <code>null</code> or contains a non-hexadecimal character)
     */
    public static byte[] fromHexString(String text) {
        if (text == null) {
            return null;
        }
        text = text.trim();
        if (text.length() % 2 != 0) {
            text = "0" + text;
        }
        byte[] res = new byte[text.length() / 2];
        for (int i = 0; i < res.length; i++) {
            int hiNibble = charToNibble(text.charAt(2 * i));
            int loNibble = charToNibble(text.charAt(2 * i + 1));
            if (hiNibble == -1 || loNibble == -1) {
                return null;
            }
            res[i] = (byte) (hiNibble << 4 | loNibble);
        }
        return res;
    }

    /**
     * Returns the value (0-15) of the hexadecimal digit <code>c</code>, or
     * -1 if <code>c</code> is not one of 0-9, a-f or A-F.
     */
    private static final int charToNibble(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        } else if (c >= 'a' && c <= 'f') {
            return 0xa + (c - 'a');
        } else if (c >= 'A' && c <= 'F') {
            return 0xA + (c - 'A');
        } else {
            return -1;
        }
    }

    /**
     * Parse the character encoding from the specified content type header. If
     * the content type is null, or there is no explicit character encoding,
     * <code>null</code> is returned. <br />
     * This method was copied from org.apache.catalina.util.RequestUtil, which
     * is licensed under the Apache License, Version 2.0 (the "License").
     *
     * @param contentType
     *            a content type header
     * @return the text following the first <code>charset=</code> up to the
     *         next <code>;</code> (or the end of the header), trimmed and
     *         with one pair of surrounding double quotes removed; or
     *         <code>null</code> if there is no <code>charset=</code>
     */
    public static String parseCharacterEncoding(String contentType) {
        if (contentType == null) {
            return null;
        }
        int start = contentType.indexOf("charset=");
        if (start < 0) {
            return null;
        }
        // 8 == "charset=".length()
        String encoding = contentType.substring(start + 8);
        int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        encoding = encoding.trim();
        if ((encoding.length() > 2) && (encoding.startsWith("\""))
                && (encoding.endsWith("\""))) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    /**
     * Checks if a string is empty (ie is null or empty).
     *
     * @param str
     *            the string to test, may be <code>null</code>
     * @return <code>true</code> if <code>str</code> is <code>null</code> or
     *         has zero length
     */
    public static boolean isEmpty(String str) {
        return (str == null) || (str.equals(""));
    }

    /**
     * Resolves a charset name to the name of the charset that should actually
     * be used to decode documents labelled with it. The name is first
     * converted to its canonical form; if that canonical name is one of the
     * commonly mislabelled encodings known to this class, the name of the
     * replacement encoding is returned instead.
     *
     * @param encoding
     *            a charset name or alias, may be <code>null</code>
     * @return the resolved charset name, or <code>null</code> if
     *         <code>encoding</code> is <code>null</code>, is not a legal
     *         charset name, or is not supported by this Java virtual machine
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }
        String canonicalName;
        try {
            if (!Charset.isSupported(encoding)) {
                return null;
            }
            canonicalName = Charset.forName(encoding).name();
        } catch (IllegalArgumentException e) {
            // Charset throws IllegalCharsetNameException (a subclass of
            // IllegalArgumentException) for syntactically illegal names;
            // treat those like any other unsupported encoding.
            return null;
        }
        String replacement = ENCODING_ALIASES.get(canonicalName);
        return replacement != null ? replacement : canonicalName;
    }

    /**
     * Command line entry point: prints the resolved name of the encoding
     * given as the single argument, or a usage message otherwise.
     *
     * @param args
     *            exactly one element, the encoding name to resolve
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("Usage: StringUtil <encoding name>");
        } else {
            System.out.println(args[0] + " is resolved to "
                    + resolveEncodingAlias(args[0]));
        }
    }
}