1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.tika.utils;
19
20 // JDK imports
21 import java.util.HashMap;
22 import java.nio.charset.Charset;
23
24 /**
25 * A collection of String processing utility methods.
26 */
public class StringUtil {

    /** Lower-case hexadecimal digits, indexed by nibble value (0-15). */
    private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

    /**
     * Maps the canonical name of a character encoding that is often used in
     * mislabelled documents to the encoding that should be used instead.
     * This is not an alias table. For instance, there are a lot of documents
     * labelled 'ISO-8859-1' which contain characters not covered by
     * ISO-8859-1 but covered by windows-1252. Because windows-1252 is a
     * superset of ISO-8859-1 (sharing code points for the common part), it's
     * better to treat ISO-8859-1 as synonymous with windows-1252 than to
     * reject, as invalid, documents labelled as ISO-8859-1 that have
     * characters outside ISO-8859-1.
     */
    private static final HashMap encodingAliases = new HashMap();

    static {
        encodingAliases.put("ISO-8859-1", "windows-1252");
        encodingAliases.put("EUC-KR", "x-windows-949");
        encodingAliases.put("x-EUC-CN", "GB18030");
        encodingAliases.put("GBK", "GB18030");
        // Mappings that are currently disabled:
        // encodingAliases.put("Big5", "Big5HKSCS");
        // encodingAliases.put("TIS620", "Cp874");
        // encodingAliases.put("ISO-8859-11", "Cp874");
    }

    /**
     * Returns a copy of <code>s</code> padded with trailing spaces so that
     * its length is <code>length</code>. Strings already
     * <code>length</code> characters long or longer are not altered.
     *
     * @param s
     *            the string to pad; must not be null
     * @param length
     *            the minimum length of the result
     * @return <code>s</code> followed by as many spaces as needed
     */
    public static String rightPad(String s, int length) {
        StringBuffer sb = new StringBuffer(s);
        for (int i = length - s.length(); i > 0; i--) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Returns a copy of <code>s</code> padded with leading spaces so that
     * its length is <code>length</code>. Strings already
     * <code>length</code> characters long or longer are not altered.
     *
     * @param s
     *            the string to pad; must not be null
     * @param length
     *            the minimum length of the result
     * @return as many spaces as needed followed by <code>s</code>
     */
    public static String leftPad(String s, int length) {
        StringBuffer sb = new StringBuffer();
        for (int i = length - s.length(); i > 0; i--) {
            sb.append(' ');
        }
        sb.append(s);
        return sb.toString();
    }

    /**
     * Convenience call for {@link #toHexString(byte[], String, int)}, where
     * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
     *
     * @param buf
     *            input data, may be null
     * @return the lower-case hexadecimal form of <code>buf</code>, or null
     *         if <code>buf</code> is null
     */
    public static String toHexString(byte[] buf) {
        return toHexString(buf, null, Integer.MAX_VALUE);
    }

    /**
     * Get a text representation of a byte[] as hexadecimal String, where each
     * pair of hexadecimal digits corresponds to consecutive bytes in the array.
     * Separators and line breaks are only written between bytes, never after
     * the last one.
     *
     * @param buf
     *            input data
     * @param sep
     *            separate every pair of hexadecimal digits on a line with
     *            this separator, or null if no separation is needed.
     * @param lineLen
     *            break the output String into lines containing output for
     *            lineLen bytes. A value of zero or less disables line
     *            breaking.
     * @return the lower-case hexadecimal form of <code>buf</code>, or null
     *         if <code>buf</code> is null
     */
    public static String toHexString(byte[] buf, String sep, int lineLen) {
        if (buf == null) {
            return null;
        }
        if (lineLen <= 0) {
            lineLen = Integer.MAX_VALUE;
        }
        StringBuffer res = new StringBuffer(buf.length * 2);
        int last = buf.length - 1;
        for (int i = 0; i < buf.length; i++) {
            int b = buf[i];
            res.append(HEX_DIGITS[(b >> 4) & 0xf]);
            res.append(HEX_DIGITS[b & 0xf]);
            if (i == last) {
                // Nothing follows the final byte.
                break;
            }
            // i + 1 bytes have been written so far; a line is complete
            // whenever that count is a multiple of lineLen.
            if ((i + 1) % lineLen == 0) {
                res.append('\n');
            } else if (sep != null) {
                res.append(sep);
            }
        }
        return res.toString();
    }

    /**
     * Convert a String containing consecutive (no inside whitespace)
     * hexadecimal digits into a corresponding byte array. If the number of
     * digits is not even, a '0' will be prepended to the String prior to
     * conversion. Leading and trailing whitespace is ignored.
     *
     * @param text
     *            input text
     * @return converted byte array, or null if unable to convert (the input
     *         is null or contains a character that is not a hexadecimal
     *         digit)
     */
    public static byte[] fromHexString(String text) {
        if (text == null) {
            return null;
        }
        text = text.trim();
        if (text.length() % 2 != 0) {
            text = "0" + text;
        }
        int resLen = text.length() / 2;
        byte[] res = new byte[resLen];
        for (int i = 0; i < resLen; i++) {
            int j = i << 1;
            int hiNibble = charToNibble(text.charAt(j));
            int loNibble = charToNibble(text.charAt(j + 1));
            if (loNibble == -1 || hiNibble == -1) {
                return null;
            }
            res[i] = (byte) (hiNibble << 4 | loNibble);
        }
        return res;
    }

    /**
     * Returns the value (0-15) of a hexadecimal digit in either case, or -1
     * if <code>c</code> is not a hexadecimal digit.
     */
    private static final int charToNibble(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        } else if (c >= 'a' && c <= 'f') {
            return 0xa + (c - 'a');
        } else if (c >= 'A' && c <= 'F') {
            return 0xA + (c - 'A');
        } else {
            return -1;
        }
    }

    /**
     * Parse the character encoding from the specified content type header. If
     * the content type is null, or there is no explicit character encoding,
     * <code>null</code> is returned. The value is taken from the first
     * occurrence of <code>charset=</code> (matched case-sensitively) up to
     * the next ';', with surrounding whitespace and one pair of enclosing
     * double quotes removed.
     * <p>
     * This method was copied from org.apache.catalina.util.RequestUtil, which
     * is licensed under the Apache License, Version 2.0 (the "License").
     *
     * @param contentType
     *            a content type header
     * @return the character encoding named in the header, or null
     */
    public static String parseCharacterEncoding(String contentType) {
        if (contentType == null) {
            return null;
        }
        int start = contentType.indexOf("charset=");
        if (start < 0) {
            return null;
        }
        // 8 == "charset=".length()
        String encoding = contentType.substring(start + 8);
        int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        encoding = encoding.trim();
        if ((encoding.length() > 2) && (encoding.startsWith("\""))
                && (encoding.endsWith("\""))) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    /**
     * Checks if a string is empty (ie is null or empty).
     *
     * @param str
     *            the string to check, may be null
     * @return true if <code>str</code> is null or has length zero
     */
    public static boolean isEmpty(String str) {
        return (str == null) || (str.equals(""));
    }

    /**
     * Resolves an encoding label to the name of the encoding that should
     * actually be used. The label is first mapped to the canonical name of
     * the corresponding {@link Charset}; if that name is one of the
     * encodings known to be commonly mislabelled, its replacement is
     * returned, otherwise the canonical name itself.
     *
     * @param encoding
     *            an encoding name or alias, may be null
     * @return the resolved encoding name, or null if <code>encoding</code>
     *         is null, is not a legal charset name, or is not supported by
     *         this Java virtual machine
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }
        String canonicalName;
        try {
            if (!Charset.isSupported(encoding)) {
                return null;
            }
            canonicalName = Charset.forName(encoding).name();
        } catch (IllegalArgumentException e) {
            // Charset reports empty or syntactically illegal names by
            // throwing a subclass of IllegalArgumentException; such a label
            // cannot be resolved, so it is treated like an unsupported one.
            return null;
        }
        String replacement = (String) encodingAliases.get(canonicalName);
        return replacement != null ? replacement : canonicalName;
    }

    /**
     * Command line entry point: prints what the single encoding name given
     * as argument resolves to, or a usage message if the argument count is
     * not exactly one.
     *
     * @param args
     *            command line arguments
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("Usage: StringUtil <encoding name>");
        } else {
            System.out.println(args[0] + " is resolved to "
                    + resolveEncodingAlias(args[0]));
        }
    }
}