You can load the data into a HashSet, as shown below. But if the file is large, larger than the JVM heap, this throws java.lang.OutOfMemoryError: Java heap space.
    HashSet<String> set = new HashSet<String>();
    File file = new File("E:\\aa.txt");
    BufferedReader reader = new BufferedReader(new FileReader(file));
    String tempString = null;
    while ((tempString = reader.readLine()) != null) {
        tempString = tempString.trim();
        // compare content, not references: tempString != "" is always true after trim()
        if (!tempString.isEmpty()) {
            System.out.println(tempString);
            set.add(tempString);
        }
    }
    reader.close();
Instead, you can process the file in batches: use a hash-modulo scheme to split the large file into several small files, deduplicate each small file through a HashSet in turn, and finally merge the results. Because equal strings always have equal hash codes, every copy of a duplicate line is guaranteed to land in the same small file, so the small files can be deduplicated independently.
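To see why this is safe, here is a minimal sketch of the bucketing rule (the bucket count of 10 and the sample string are illustrative assumptions, not part of the original code):

    // Two strings with equal content have equal hashCode() values,
    // so they always map to the same bucket index, i.e. the same small file.
    int splitSize = 10; // assumed bucket count
    String a = "hello";
    String b = new String("hello"); // distinct object, same content
    System.out.println(Math.abs(a.hashCode() % splitSize)); // same index...
    System.out.println(Math.abs(b.hashCode() % splitSize)); // ...as this one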
First, generate the test data file aa.txt:
    // insert test data from multiple threads
    public void set() throws FileNotFoundException {
        File file = new File("E:\\aa.txt");
        PrintWriter pws = new PrintWriter(file);
        CountDownLatch latch = new CountDownLatch(9);
        ExecutorService executorService = Executors.newFixedThreadPool(9);
        for (int i = 0; i < 9; i++) {
            executorService.execute(new SetClass("name+" + UUID.randomUUID().toString(), latch, file, pws));
        }
        try {
            latch.await(); // block until the latch count reaches 0
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        executorService.shutdown(); // shut down the pool
        pws.close(); // flush and close the writer, or trailing data may be lost
    }

    public class SetClass extends Thread {
        private final CountDownLatch countDownLatch;
        private File file;
        private PrintWriter pws;

        public SetClass(String name, CountDownLatch countDownLatch1, File file, PrintWriter pws) {
            super(name);
            this.countDownLatch = countDownLatch1;
            this.file = file;
            this.pws = pws;
        }

        @Override
        public void run() {
            for (int i = 0; i < 100000; i++) {
                // PrintWriter.println is internally synchronized, so concurrent lines do not interleave
                pws.println(UUID.randomUUID().toString());
                System.out.println(Thread.currentThread().getName() + ":" + i);
            }
            countDownLatch.countDown();
        }
    }
Split the large file, using hash-modulo so that duplicate lines go into the same small file:
    /**
     * Hash each line and use the modulus to distribute lines into small files.
     * @param targetFile path of the file to deduplicate
     * @param splitSize  number of small hash-modulo files to split the target file into
     * @return the array of small files
     */
    public static File[] splitFile(String targetFile, int splitSize) {
        File file = new File(targetFile);
        BufferedReader reader = null;
        PrintWriter[] pws = new PrintWriter[splitSize];
        File[] littleFiles = new File[splitSize];
        String parentPath = file.getParent();
        File tempFolder = new File(parentPath + File.separator + "test");
        if (!tempFolder.exists()) {
            tempFolder.mkdir();
        }
        for (int i = 0; i < splitSize; i++) {
            littleFiles[i] = new File(tempFolder.getAbsolutePath() + File.separator + i + ".txt");
            if (littleFiles[i].exists()) {
                littleFiles[i].delete();
            }
            try {
                pws[i] = new PrintWriter(littleFiles[i]);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
        try {
            reader = new BufferedReader(new FileReader(file));
            String tempString = null;
            // readLine() reads line by line, avoiding loading the whole file at once
            while ((tempString = reader.readLine()) != null) {
                tempString = tempString.trim();
                if (!tempString.isEmpty()) {
                    // The key step: hash each line and take the modulus, so that
                    // strings with the same hash value always land in the same file.
                    int index = Math.abs(tempString.hashCode() % splitSize);
                    pws[index].println(tempString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
            for (int i = 0; i < splitSize; i++) {
                if (pws[i] != null) {
                    pws[i].close();
                }
            }
        }
        return littleFiles;
    }
Deduplicate each small file and merge the results:
    /**
     * Deduplicate the small files and merge the results.
     * @param littleFiles      the array of small files produced by the split
     * @param distinctFilePath path of the deduplicated output file
     * @param splitSize        number of small files
     */
    public static void distinct(File[] littleFiles, String distinctFilePath, int splitSize) {
        File distinctedFile = new File(distinctFilePath);
        FileReader[] frs = new FileReader[splitSize];
        BufferedReader[] brs = new BufferedReader[splitSize];
        PrintWriter pw = null;
        try {
            if (distinctedFile.exists()) {
                distinctedFile.delete();
            }
            distinctedFile.createNewFile();
            pw = new PrintWriter(distinctedFile);
            Set<String> unicSet = new HashSet<String>();
            for (int i = 0; i < splitSize; i++) {
                if (littleFiles[i].exists()) {
                    System.out.println("Deduplicating small file: " + littleFiles[i].getName());
                    frs[i] = new FileReader(littleFiles[i]);
                    brs[i] = new BufferedReader(frs[i]);
                    String line = null;
                    while ((line = brs[i].readLine()) != null) {
                        if (!line.isEmpty()) {
                            unicSet.add(line);
                        }
                    }
                    for (String s : unicSet) {
                        pw.println(s);
                    }
                    // clear the set so only one small file's lines are in memory at a time
                    unicSet.clear();
                    System.gc();
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e1) {
            e1.printStackTrace();
        } finally {
            for (int i = 0; i < splitSize; i++) {
                try {
                    if (null != brs[i]) {
                        brs[i].close();
                    }
                    if (null != frs[i]) {
                        frs[i].close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                // delete the temporary small files after merging
                if (littleFiles[i].exists()) {
                    littleFiles[i].delete();
                }
            }
            if (null != pw) {
                pw.close();
            }
        }
    }
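Putting it together, a minimal driver might look like the following sketch. The output path E:\\aa_distinct.txt and the split count of 10 are assumptions for illustration; pick a split count large enough that each small file's unique lines fit in the heap.

    // Hypothetical driver, assuming splitFile and distinct are the static
    // methods above; paths and split count are example values only.
    public static void main(String[] args) {
        int splitSize = 10; // assumed bucket count; tune so each small file fits in memory
        File[] littleFiles = splitFile("E:\\aa.txt", splitSize);
        distinct(littleFiles, "E:\\aa_distinct.txt", splitSize);
        System.out.println("Deduplication finished.");
    }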