View Javadoc

1   /*
2    * LICENSE
3    *
4    * "THE BEER-WARE LICENSE" (Revision 42):
5    * "Ralph Lange" <Ralph.Lange@gmx.de> wrote this file.
6    * As long as you retain this notice you can do whatever you want with
7    * this stuff. If we meet some day, and you think this stuff is worth it,
8    * you can buy me a beer in return.
9    */
10  package org.jenkinsci.plugins.darcs;
11  
12  import java.io.File;
13  import java.io.FileInputStream;
14  import java.io.IOException;
15  import java.io.InputStream;
16  import java.nio.ByteBuffer;
17  import java.nio.CharBuffer;
18  import java.nio.charset.Charset;
19  import java.nio.charset.CharsetDecoder;
20  import java.nio.charset.CoderResult;
21  import java.nio.charset.CodingErrorAction;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.List;
25  
/**
 * Darcs XML Sanitizer.
 *
 * The output of "darcs changes --xml-output" might be invalid XML. Darcs treats the patch comments as binary blobs, and
 * the changes command returns them as-is inside the XML structure, without ensuring that the encoding is consistent. If
 * some of the patches in your repository were recorded on UTF-8 machines and others on e.g. ISO-8859 machines, the XML
 * output will contain characters in both encodings.
 *
 * Some parsers (e.g. Xerces) choke on invalid characters in the XML input, so this sanitizer is designed to ensure that
 * the encoding is consistent.
 *
 * The input is split at the {@code <name>}/{@code <comment>} element boundaries and every chunk is decoded on its own,
 * trying the configured decoders in order until one of them accepts the chunk.
 *
 * Instances keep stateful {@link CharsetDecoder}s that are reused by every call, so a single instance must not be used
 * by several threads at the same time.
 *
 * @author Ralph Lange <Ralph.Lange@gmx.de>
 */
class DarcsXmlSanitizer {

    /**
     * Charsets tried (in this order) after strict UTF-8 failed.
     *
     * NOTE(review): ISO-8859-1 maps every possible byte to a character, so a chunk rejected by strict UTF-8 looks like
     * it is always accepted by ISO-8859-1 and the decoders after it are never reached. Confirm whether this order is
     * intended.
     */
    private static final List<String> ADDL_CHARSETS = Arrays.asList("ISO-8859-1", "UTF-16");
    /**
     * Charset used to turn the tag literals into bytes; the tags are pure ASCII, so this is independent of the
     * platform default charset.
     */
    private static final Charset TAG_CHARSET = Charset.forName("US-ASCII");
    private static final byte[] NAME_OPEN = "<name>".getBytes(TAG_CHARSET);
    private static final byte[] NAME_CLOSE = "</name>".getBytes(TAG_CHARSET);
    private static final byte[] COMMENT_OPEN = "<comment>".getBytes(TAG_CHARSET);
    private static final byte[] COMMENT_CLOSE = "</comment>".getBytes(TAG_CHARSET);
    /**
     * Decoders in the order in which they are tried for every chunk.
     */
    private final List<CharsetDecoder> decoders = new ArrayList<CharsetDecoder>();

    /**
     * States which indicate where in the input we currently are.
     */
    private enum State {

        /**
         * Outside a name or comment tag.
         */
        OUTSIDE,
        /**
         * Inside a name tag.
         */
        IN_NAME,
        /**
         * Inside a comment tag.
         */
        IN_COMMENT
    }

    /**
     * Dedicated constructor.
     *
     * Registers the decoders: strict UTF-8 first, then the additional charsets, and finally UTF-8 that replaces
     * malformed or unmappable input instead of reporting an error.
     */
    public DarcsXmlSanitizer() {
        decoders.add(Charset.forName("UTF-8").newDecoder());

        for (final String cs : ADDL_CHARSETS) {
            decoders.add(Charset.forName(cs).newDecoder());
        }

        // last resort: UTF-8 with replacement
        decoders.add(Charset.forName("UTF-8").newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE));
    }

    /**
     * Finds the next occurrence of a byte pattern using the Knuth-Morris-Pratt pattern matching algorithm.
     *
     * @param data bytes to search in
     * @param start index in {@code data} at which the search begins
     * @param pattern non-empty byte sequence to look for
     * @return index of the first byte of the next occurrence at or after {@code start}, or -1 if there is none
     */
    private static int positionBeforeNext(final byte[] data, final int start, final byte[] pattern) {
        // Nothing left to search; also covers empty data.
        if (start >= data.length) {
            return -1;
        }

        final int[] failure = computeFailure(pattern);
        int matched = 0;

        for (int i = start; i < data.length; i++) {
            // On a mismatch fall back to the longest prefix that is still a suffix of what was matched.
            while (matched > 0 && pattern[matched] != data[i]) {
                matched = failure[matched - 1];
            }
            if (pattern[matched] == data[i]) {
                matched++;
            }
            if (matched == pattern.length) {
                return i - pattern.length + 1;
            }
        }

        return -1;
    }

    /**
     * Finds the position directly behind the next occurrence of a byte pattern.
     *
     * @param data bytes to search in
     * @param start index in {@code data} at which the search begins
     * @param pattern non-empty byte sequence to look for
     * @return index of the first byte after the next occurrence, or -1 if there is none
     */
    private static int positionAfterNext(final byte[] data, final int start, final byte[] pattern) {
        final int pos = positionBeforeNext(data, start, pattern);

        return -1 == pos ? -1 : pos + pattern.length;
    }

    /**
     * Computes the failure function using a bootstrapping process, where the pattern is matched against itself.
     *
     * @param pattern byte sequence the table is built for
     * @return array where entry {@code i} is the length of the longest proper prefix of {@code pattern[0..i]} that is
     * also a suffix of it
     */
    private static int[] computeFailure(final byte[] pattern) {
        final int[] failure = new int[pattern.length];
        int j = 0;

        for (int i = 1; i < pattern.length; i++) {
            while (j > 0 && pattern[j] != pattern[i]) {
                j = failure[j - 1];
            }

            if (pattern[j] == pattern[i]) {
                j++;
            }

            failure[i] = j;
        }

        return failure;
    }

    /**
     * Cleanse the mixed encoding in the input byte array.
     *
     * The input is cut into chunks: the content of every {@code <name>} and {@code <comment>} element is one chunk and
     * the markup between two such contents is one chunk. Each chunk is decoded independently. An element whose closing
     * tag is missing extends to the end of the input.
     *
     * @param input raw bytes of the darcs XML output
     * @return the decoded text
     */
    public String cleanse(final byte[] input) {
        // Every decoder used here yields at most one char per input byte, so this capacity is sufficient.
        final CharBuffer cb = CharBuffer.allocate(input.length);
        State state = State.OUTSIDE;
        int currentPosition = 0;

        while (currentPosition < input.length) {
            final int nextPosition;

            switch (state) {
                case OUTSIDE: {
                    final int nextName = positionAfterNext(input, currentPosition, NAME_OPEN);
                    final int nextComment = positionAfterNext(input, currentPosition, COMMENT_OPEN);

                    // A missing <comment> (-1) must not hide a <name> that is still ahead.
                    if (-1 != nextName && (-1 == nextComment || nextName < nextComment)) {
                        nextPosition = nextName;
                        state = State.IN_NAME;
                    } else if (-1 != nextComment) {
                        nextPosition = nextComment;
                        state = State.IN_COMMENT;
                    } else {
                        // No further element: the rest of the input is one chunk.
                        nextPosition = input.length;
                    }
                    break;
                }
                case IN_NAME:
                    nextPosition = endOfContent(input, currentPosition, NAME_CLOSE);
                    state = State.OUTSIDE;
                    break;
                case IN_COMMENT:
                    nextPosition = endOfContent(input, currentPosition, COMMENT_CLOSE);
                    state = State.OUTSIDE;
                    break;
                default:
                    throw new IllegalStateException(String.format("Illegal state %s!", state));
            }

            decodeChunk(ByteBuffer.wrap(input, currentPosition, nextPosition - currentPosition), cb);
            currentPosition = nextPosition;
        }

        cb.flip();
        return cb.toString();
    }

    /**
     * Determines where the content of the element that starts at {@code start} ends.
     *
     * @param data bytes to search in
     * @param start index of the first content byte
     * @param closingTag bytes of the closing tag
     * @return index of the closing tag, or {@code data.length} if the closing tag is missing
     */
    private static int endOfContent(final byte[] data, final int start, final byte[] closingTag) {
        final int pos = positionBeforeNext(data, start, closingTag);

        return -1 == pos ? data.length : pos;
    }

    /**
     * Decodes one chunk with the first decoder that accepts it and appends the result to {@code out}.
     *
     * @param in chunk to decode
     * @param out buffer the decoded characters are appended to
     */
    private void decodeChunk(final ByteBuffer in, final CharBuffer out) {
        in.mark();
        out.mark();

        for (final CharsetDecoder dec : decoders) {
            dec.reset();
            final CoderResult result = dec.decode(in, out, true);

            if (!result.isError()) {
                dec.flush(out);
                return;
            }

            // Discard the partial output of the failed attempt and retry with the next decoder.
            in.reset();
            out.reset();
        }
    }

    /**
     * Cleanse the mixed encoding in the content of a file.
     *
     * @see #cleanse(byte[])
     * @param file file to read the raw bytes from
     * @return the decoded text
     * @throws IOException if the file is too large or can not be read completely
     */
    public String cleanse(final File file) throws IOException {
        return cleanse(readFile(file));
    }

    /**
     * Reads the whole file into a byte array.
     *
     * @param file file to read
     * @return content of the file
     * @throws IOException if the file is larger than {@link Integer#MAX_VALUE} bytes or fewer bytes than
     * {@link File#length()} reported could be read
     */
    private byte[] readFile(final File file) throws IOException {
        // Taken from www.exampledepot.com
        final long length = file.length();

        // An array can not be larger than Integer.MAX_VALUE, so check before narrowing to int.
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File is too large " + file.getName());
        }

        final byte[] bytes = new byte[(int) length];
        int offset = 0;
        InputStream is = null;

        try {
            is = new FileInputStream(file);

            // read() may deliver fewer bytes than requested, so loop until full or end of stream.
            int numRead = 0;
            while (offset < bytes.length
                    && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) {
                offset += numRead;
            }
        } finally {
            if (null != is) {
                is.close();
            }
        }

        // Ensure all the bytes have been read in
        if (offset < bytes.length) {
            throw new IOException("Could not completely read file " + file.getName());
        }

        return bytes;
    }
}
281 }